buffer_cache: Fix various thread races on data upload and invalidation (#3186)

* buffer_cache: Fix various thread races on data upload and invalidation

* memory_tracker: Improve locking more on invalidation

---------

Co-authored-by: georgemoralis <giorgosmrls@gmail.com>
Authored by TheTurtle on 2025-07-03 23:36:01 +03:00, committed by GitHub
parent df22c4225e
commit b22da77f9a
5 changed files with 84 additions and 84 deletions
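The invalidation race this commit closes comes from the old check-then-mark sequence in BufferCache::InvalidateMemory: querying IsRegionGpuModified and then calling MarkRegionAsCpuModified took the per-region lock twice, leaving a window where the GPU thread could mark the same pages GPU-modified in between. The new MemoryTracker::InvalidateRegion performs both steps under one lock. A minimal standalone sketch of that pattern follows; the Region struct and field names are illustrative only, not the emulator's actual API.

#include <functional>
#include <mutex>

// Simplified stand-in for one tracked region; names are illustrative only.
struct Region {
    std::mutex lock;
    bool cpu_modified = false;
    bool gpu_modified = false;
};

// Racy (old) shape: the GPU-modified check and the CPU-dirty mark each took the
// lock on their own, so the GPU thread could set gpu_modified in the gap and that
// data would be lost once the page was marked CPU-dirty.
//
// Fixed shape: decide whether to flush and update CPU state under a single lock.
void InvalidateRegion(Region& region, bool try_flush, const std::function<void()>& on_flush) {
    const bool should_flush = [&] {
        std::scoped_lock lk{region.lock};
        if (try_flush && region.gpu_modified) {
            return true; // the flush callback performs the CPU state change itself
        }
        region.cpu_modified = true;
        return false;
    }();
    if (should_flush) {
        on_flush(); // e.g. a readback such as ReadMemory(addr, size, /*is_write=*/true)
    }
}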

buffer_cache.cpp

@@ -2,8 +2,8 @@
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include <algorithm>
-#include <mutex>
 #include "common/alignment.h"
+#include "common/config.h"
 #include "common/debug.h"
 #include "common/scope_exit.h"
 #include "common/types.h"
@@ -136,20 +136,19 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
     if (!IsRegionRegistered(device_addr, size)) {
         return;
     }
-    if (Config::readbacks() && memory_tracker->IsRegionGpuModified(device_addr, size)) {
-        ReadMemory(device_addr, size);
-    }
-    memory_tracker->MarkRegionAsCpuModified(device_addr, size);
+    memory_tracker->InvalidateRegion(
+        device_addr, size, Config::readbacks(),
+        [this, device_addr, size] { ReadMemory(device_addr, size, true); });
 }

-void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
-    liverpool->SendCommand<true>([this, device_addr, size] {
+void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
+    liverpool->SendCommand<true>([this, device_addr, size, is_write] {
         Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
-        DownloadBufferMemory(buffer, device_addr, size);
+        DownloadBufferMemory(buffer, device_addr, size, is_write);
     });
 }

-void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
+void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write) {
     boost::container::small_vector<vk::BufferCopy, 1> copies;
     u64 total_size_bytes = 0;
     memory_tracker->ForEachDownloadRange<false>(
@@ -192,6 +191,9 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
                           copy.size);
     }
     memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
+    if (is_write) {
+        memory_tracker->MarkRegionAsCpuModified(device_addr, size);
+    }
 }

 void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
@@ -359,7 +361,7 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds,
         // Avoid using ObtainBuffer here as that might give us the stream buffer.
         const BufferId buffer_id = FindBuffer(src, num_bytes);
         auto& buffer = slot_buffers[buffer_id];
-        SynchronizeBuffer(buffer, src, num_bytes, false);
+        SynchronizeBuffer(buffer, src, num_bytes, false, false);
         return buffer;
     }();
     auto& dst_buffer = [&] -> const Buffer& {
@@ -441,9 +443,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
         buffer_id = FindBuffer(device_addr, size);
     }
     Buffer& buffer = slot_buffers[buffer_id];
-    SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
+    SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer);
     if (is_written) {
-        memory_tracker->MarkRegionAsGpuModified(device_addr, size);
         gpu_modified_ranges.Add(device_addr, size);
     }
     return {&buffer, buffer.Offset(device_addr)};
@@ -454,7 +455,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 si
     const BufferId buffer_id = page_table[gpu_addr >> CACHING_PAGEBITS].buffer_id;
     if (buffer_id) {
         if (Buffer& buffer = slot_buffers[buffer_id]; buffer.IsInBounds(gpu_addr, size)) {
-            SynchronizeBuffer(buffer, gpu_addr, size, false);
+            SynchronizeBuffer(buffer, gpu_addr, size, false, false);
             return {&buffer, buffer.Offset(gpu_addr)};
         }
     }
@@ -813,56 +814,27 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
     }
 }

-void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
+void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
                                     bool is_texel_buffer) {
     boost::container::small_vector<vk::BufferCopy, 4> copies;
-    u64 total_size_bytes = 0;
     VAddr buffer_start = buffer.CpuAddr();
-    memory_tracker->ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
-        copies.push_back(vk::BufferCopy{
-            .srcOffset = total_size_bytes,
-            .dstOffset = device_addr_out - buffer_start,
-            .size = range_size,
-        });
-        total_size_bytes += range_size;
-    });
+    memory_tracker->ForEachUploadRange(
+        device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) {
+            const u64 offset = staging_buffer.Copy(device_addr_out, range_size);
+            copies.push_back(vk::BufferCopy{
+                .srcOffset = offset,
+                .dstOffset = device_addr_out - buffer_start,
+                .size = range_size,
+            });
+        });
     SCOPE_EXIT {
         if (is_texel_buffer) {
             SynchronizeBufferFromImage(buffer, device_addr, size);
         }
     };
-    if (total_size_bytes == 0) {
+    if (copies.empty()) {
         return;
     }
-    vk::Buffer src_buffer = staging_buffer.Handle();
-    if (total_size_bytes < StagingBufferSize) {
-        const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
-        for (auto& copy : copies) {
-            u8* const src_pointer = staging + copy.srcOffset;
-            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
-            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
-            // Apply the staging offset
-            copy.srcOffset += offset;
-        }
-        staging_buffer.Commit();
-    } else {
-        // For large one time transfers use a temporary host buffer.
-        // RenderDoc can lag quite a bit if the stream buffer is too large.
-        Buffer temp_buffer{instance,
-                           scheduler,
-                           MemoryUsage::Upload,
-                           0,
-                           vk::BufferUsageFlagBits::eTransferSrc,
-                           total_size_bytes};
-        src_buffer = temp_buffer.Handle();
-        u8* const staging = temp_buffer.mapped_data.data();
-        for (auto& copy : copies) {
-            u8* const src_pointer = staging + copy.srcOffset;
-            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
-            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
-        }
-        scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
-    }
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
     const vk::BufferMemoryBarrier2 pre_barrier = {
@@ -889,7 +861,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
         .bufferMemoryBarrierCount = 1,
         .pBufferMemoryBarriers = &pre_barrier,
     });
-    cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
+    cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies);
     cmdbuf.pipelineBarrier2(vk::DependencyInfo{
         .dependencyFlags = vk::DependencyFlagBits::eByRegion,
         .bufferMemoryBarrierCount = 1,
@@ -1020,7 +992,7 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
         VAddr start = std::max(buffer.CpuAddr(), device_addr);
         VAddr end = std::min(buffer.CpuAddr() + buffer.SizeBytes(), device_addr_end);
         u32 size = static_cast<u32>(end - start);
-        SynchronizeBuffer(buffer, start, size, false);
+        SynchronizeBuffer(buffer, start, size, false, false);
     });
 }

buffer_cache.h

@@ -113,7 +113,7 @@
     void InvalidateMemory(VAddr device_addr, u64 size);

     /// Waits on pending downloads in the logical page range.
-    void ReadMemory(VAddr device_addr, u64 size);
+    void ReadMemory(VAddr device_addr, u64 size, bool is_write = false);

     /// Binds host vertex buffers for the current draw.
     void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@@ -176,7 +176,7 @@
         return !buffer_id || slot_buffers[buffer_id].is_deleted;
     }

-    void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
+    void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write);

     [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@@ -191,7 +191,8 @@
     template <bool insert>
     void ChangeRegister(BufferId buffer_id);

-    void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_texel_buffer);
+    void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
+                           bool is_texel_buffer);

     bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);

memory_tracker.h

@@ -27,6 +27,7 @@
     bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
         return IteratePages<true>(
             query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
+                std::scoped_lock lk{manager->lock};
                 return manager->template IsRegionModified<Type::CPU>(offset, size);
             });
     }
@@ -35,6 +36,7 @@
     bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
         return IteratePages<false>(
             query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
+                std::scoped_lock lk{manager->lock};
                 return manager->template IsRegionModified<Type::GPU>(offset, size);
             });
     }
@@ -43,34 +45,57 @@
     void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
         IteratePages<false>(dirty_cpu_addr, query_size,
                             [](RegionManager* manager, u64 offset, size_t size) {
+                                std::scoped_lock lk{manager->lock};
                                 manager->template ChangeRegionState<Type::CPU, true>(
                                     manager->GetCpuAddr() + offset, size);
                             });
     }

-    /// Mark region as modified from the host GPU
-    void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
-        IteratePages<false>(dirty_cpu_addr, query_size,
-                            [](RegionManager* manager, u64 offset, size_t size) {
-                                manager->template ChangeRegionState<Type::GPU, true>(
-                                    manager->GetCpuAddr() + offset, size);
-                            });
-    }
-
+    /// Unmark region as modified from the host GPU
     void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
         IteratePages<false>(dirty_cpu_addr, query_size,
                             [](RegionManager* manager, u64 offset, size_t size) {
+                                std::scoped_lock lk{manager->lock};
                                 manager->template ChangeRegionState<Type::GPU, false>(
                                     manager->GetCpuAddr() + offset, size);
                             });
     }

+    /// Removes all protection from a page and ensures GPU data has been flushed if requested
+    void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept {
+        IteratePages<false>(
+            cpu_addr, size,
+            [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) {
+                const bool should_flush = [&] {
+                    // Perform both the GPU modification check and CPU state change with the lock
+                    // in case we are racing with GPU thread trying to mark the page as GPU
+                    // modified. If we need to flush the flush function is going to perform CPU
+                    // state change.
+                    std::scoped_lock lk{manager->lock};
+                    if (try_flush && manager->template IsRegionModified<Type::GPU>(offset, size)) {
+                        return true;
+                    }
+                    manager->template ChangeRegionState<Type::CPU, true>(
+                        manager->GetCpuAddr() + offset, size);
+                    return false;
+                }();
+                if (should_flush) {
+                    on_flush();
+                }
+            });
+    }
+
     /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
-    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
+    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) {
         IteratePages<true>(query_cpu_range, query_size,
-                           [&func](RegionManager* manager, u64 offset, size_t size) {
+                           [&func, is_written](RegionManager* manager, u64 offset, size_t size) {
+                               std::scoped_lock lk{manager->lock};
                                manager->template ForEachModifiedRange<Type::CPU, true>(
                                    manager->GetCpuAddr() + offset, size, func);
+                               if (is_written) {
+                                   manager->template ChangeRegionState<Type::GPU, true>(
+                                       manager->GetCpuAddr() + offset, size);
+                               }
                            });
     }
@@ -79,6 +104,7 @@
     void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
         IteratePages<false>(query_cpu_range, query_size,
                             [&func](RegionManager* manager, u64 offset, size_t size) {
+                                std::scoped_lock lk{manager->lock};
                                 manager->template ForEachModifiedRange<Type::GPU, clear>(
                                     manager->GetCpuAddr() + offset, size, func);
                             });

region_manager.h

@@ -3,9 +3,9 @@
 #pragma once

-#include <mutex>
 #include "common/config.h"
 #include "common/div_ceil.h"
+#include "common/logging/log.h"

 #ifdef __linux__
 #include "common/adaptive_mutex.h"
@@ -19,6 +19,12 @@
 namespace VideoCore {

+#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+using LockType = Common::AdaptiveMutex;
+#else
+using LockType = Common::SpinLock;
+#endif
+
 /**
  * Allows tracking CPU and GPU modification of pages in a contigious 16MB virtual address region.
  * Information is stored in bitsets for spacial locality and fast update of single pages.
@@ -80,7 +86,6 @@
         if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
             return;
         }
-        std::scoped_lock lk{lock};
         RegionBits& bits = GetRegionBits<type>();

         if constexpr (enable) {
@@ -113,15 +118,10 @@
         if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
             return;
         }
-        std::scoped_lock lk{lock};
         RegionBits& bits = GetRegionBits<type>();
         RegionBits mask(bits, start_page, end_page);

-        for (const auto& [start, end] : mask) {
-            func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
-        }
-
         if constexpr (clear) {
             bits.UnsetRange(start_page, end_page);
             if constexpr (type == Type::CPU) {
@@ -130,6 +130,10 @@
                 UpdateProtection<false, true>();
             }
         }
+
+        for (const auto& [start, end] : mask) {
+            func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
+        }
     }

     /**
@@ -147,13 +151,14 @@
         if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
             return false;
         }
-        std::scoped_lock lk{lock};
         const RegionBits& bits = GetRegionBits<type>();
         RegionBits test(bits, start_page, end_page);
         return test.Any();
     }

+    LockType lock;
+
 private:
     /**
      * Notify tracker about changes in the CPU tracking state of a word in the buffer
@@ -179,11 +184,6 @@
         tracker->UpdatePageWatchersForRegion<track, is_read>(cpu_addr, mask);
     }

-#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
-    Common::AdaptiveMutex lock;
-#else
-    Common::SpinLock lock;
-#endif
-
     PageManager* tracker;
     VAddr cpu_addr = 0;
     RegionBits cpu;

page_manager.cpp

@@ -201,16 +201,17 @@ struct PageManager::Impl {
         RENDERER_TRACE;
         auto* memory = Core::Memory::Instance();
         auto& impl = memory->GetAddressSpace();
-        // ASSERT(perms != Core::MemoryPermission::Write);
+        ASSERT_MSG(perms != Core::MemoryPermission::Write,
+                   "Attempted to protect region as write-only which is not a valid permission");
         impl.Protect(address, size, perms);
     }

     static bool GuestFaultSignalHandler(void* context, void* fault_address) {
         const auto addr = reinterpret_cast<VAddr>(fault_address);
         if (Common::IsWriteError(context)) {
-            return rasterizer->InvalidateMemory(addr, 1);
+            return rasterizer->InvalidateMemory(addr, 8);
         } else {
-            return rasterizer->ReadMemory(addr, 1);
+            return rasterizer->ReadMemory(addr, 8);
         }
         return false;
     }
} }