buffer_cache: Fix various thread races on data upload and invalidation (#3186)

* buffer_cache: Fix various thread races on data upload and invalidation

* memory_tracker: Improve locking more on invalidation

---------

Co-authored-by: georgemoralis <giorgosmrls@gmail.com>
TheTurtle, 2025-07-03 23:36:01 +03:00 (committed by GitHub)
parent df22c4225e
commit b22da77f9a
5 changed files with 84 additions and 84 deletions
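The races fixed here sit between the CPU page-fault path (BufferCache::InvalidateMemory) and the GPU thread marking pages as GPU-modified. Below is a minimal, self-contained sketch of the before/after pattern; FakeRegion and its two flags are illustrative stand-ins for the tracker's per-page bitsets, not project code.

#include <functional>
#include <mutex>

// Stand-in for a RegionManager: one flag per direction instead of page bitsets.
struct FakeRegion {
    std::mutex lock;
    bool cpu_modified = false;
    bool gpu_modified = false;
};

// Old shape (racy): the GPU-modified check and the CPU state change were two
// separate tracker calls, each locking on its own, so the GPU thread could set
// gpu_modified in the window between them and the readback would be skipped.
// New shape, mirroring MemoryTracker::InvalidateRegion in the diff below:
void InvalidateRegion(FakeRegion& region, bool try_flush,
                      const std::function<void()>& on_flush) {
    const bool should_flush = [&] {
        std::scoped_lock lk{region.lock};
        if (try_flush && region.gpu_modified) {
            return true; // the flush callback performs the CPU state change itself
        }
        region.cpu_modified = true;
        return false;
    }();
    if (should_flush) {
        on_flush(); // e.g. ReadMemory(addr, size, /*is_write=*/true)
    }
}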

buffer_cache.cpp

@@ -2,8 +2,8 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <mutex>
#include "common/alignment.h"
#include "common/config.h"
#include "common/debug.h"
#include "common/scope_exit.h"
#include "common/types.h"
@@ -136,20 +136,19 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
if (!IsRegionRegistered(device_addr, size)) {
return;
}
if (Config::readbacks() && memory_tracker->IsRegionGpuModified(device_addr, size)) {
ReadMemory(device_addr, size);
}
memory_tracker->MarkRegionAsCpuModified(device_addr, size);
memory_tracker->InvalidateRegion(
device_addr, size, Config::readbacks(),
[this, device_addr, size] { ReadMemory(device_addr, size, true); });
}
void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
liverpool->SendCommand<true>([this, device_addr, size] {
void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
liverpool->SendCommand<true>([this, device_addr, size, is_write] {
Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
DownloadBufferMemory(buffer, device_addr, size);
DownloadBufferMemory(buffer, device_addr, size, is_write);
});
}
void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write) {
boost::container::small_vector<vk::BufferCopy, 1> copies;
u64 total_size_bytes = 0;
memory_tracker->ForEachDownloadRange<false>(
@@ -192,6 +191,9 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
copy.size);
}
memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
if (is_write) {
memory_tracker->MarkRegionAsCpuModified(device_addr, size);
}
}
void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
@@ -359,7 +361,7 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds,
// Avoid using ObtainBuffer here as that might give us the stream buffer.
const BufferId buffer_id = FindBuffer(src, num_bytes);
auto& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, src, num_bytes, false);
SynchronizeBuffer(buffer, src, num_bytes, false, false);
return buffer;
}();
auto& dst_buffer = [&] -> const Buffer& {
@@ -441,9 +443,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
buffer_id = FindBuffer(device_addr, size);
}
Buffer& buffer = slot_buffers[buffer_id];
SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer);
if (is_written) {
memory_tracker->MarkRegionAsGpuModified(device_addr, size);
gpu_modified_ranges.Add(device_addr, size);
}
return {&buffer, buffer.Offset(device_addr)};
@@ -454,7 +455,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 si
const BufferId buffer_id = page_table[gpu_addr >> CACHING_PAGEBITS].buffer_id;
if (buffer_id) {
if (Buffer& buffer = slot_buffers[buffer_id]; buffer.IsInBounds(gpu_addr, size)) {
SynchronizeBuffer(buffer, gpu_addr, size, false);
SynchronizeBuffer(buffer, gpu_addr, size, false, false);
return {&buffer, buffer.Offset(gpu_addr)};
}
}
@@ -813,56 +814,27 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
}
}
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer) {
boost::container::small_vector<vk::BufferCopy, 4> copies;
u64 total_size_bytes = 0;
VAddr buffer_start = buffer.CpuAddr();
memory_tracker->ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
copies.push_back(vk::BufferCopy{
.srcOffset = total_size_bytes,
.dstOffset = device_addr_out - buffer_start,
.size = range_size,
memory_tracker->ForEachUploadRange(
device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) {
const u64 offset = staging_buffer.Copy(device_addr_out, range_size);
copies.push_back(vk::BufferCopy{
.srcOffset = offset,
.dstOffset = device_addr_out - buffer_start,
.size = range_size,
});
});
total_size_bytes += range_size;
});
SCOPE_EXIT {
if (is_texel_buffer) {
SynchronizeBufferFromImage(buffer, device_addr, size);
}
};
if (total_size_bytes == 0) {
if (copies.empty()) {
return;
}
vk::Buffer src_buffer = staging_buffer.Handle();
if (total_size_bytes < StagingBufferSize) {
const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
u8* const src_pointer = staging + copy.srcOffset;
const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
// Apply the staging offset
copy.srcOffset += offset;
}
staging_buffer.Commit();
} else {
// For large one time transfers use a temporary host buffer.
// RenderDoc can lag quite a bit if the stream buffer is too large.
Buffer temp_buffer{instance,
scheduler,
MemoryUsage::Upload,
0,
vk::BufferUsageFlagBits::eTransferSrc,
total_size_bytes};
src_buffer = temp_buffer.Handle();
u8* const staging = temp_buffer.mapped_data.data();
for (auto& copy : copies) {
u8* const src_pointer = staging + copy.srcOffset;
const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
}
scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
}
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
@@ -889,7 +861,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
@@ -1020,7 +992,7 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
VAddr start = std::max(buffer.CpuAddr(), device_addr);
VAddr end = std::min(buffer.CpuAddr() + buffer.SizeBytes(), device_addr_end);
u32 size = static_cast<u32>(end - start);
SynchronizeBuffer(buffer, start, size, false);
SynchronizeBuffer(buffer, start, size, false, false);
});
}
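On the upload side, SynchronizeBuffer no longer gathers copy regions first and fills a staging (or temporary) buffer afterwards: the guest data is snapshotted into the staging buffer from inside the tracker callback, while the per-region lock is still held. A rough sketch of that shape, with Tracker and Staging as assumed stand-ins rather than the project's real types:

#include <cstdint>
#include <vector>

struct Copy {
    uint64_t src_offset;
    uint64_t dst_offset;
    uint64_t size;
};

// Tracker::ForEachUploadRange(addr, size, is_written, func) is assumed to call
// func(range_addr, range_size) with the region lock held; Staging::Copy is
// assumed to memcpy guest memory into the stream buffer and return its offset.
template <typename Tracker, typename Staging>
std::vector<Copy> GatherUploads(Tracker& tracker, Staging& staging, uint64_t device_addr,
                                uint64_t size, uint64_t buffer_start, bool is_written) {
    std::vector<Copy> copies;
    tracker.ForEachUploadRange(device_addr, size, is_written,
                               [&](uint64_t range_addr, uint64_t range_size) {
                                   // Snapshot now, under the lock, so a concurrent CPU write
                                   // cannot land between range collection and the memcpy.
                                   const uint64_t offset = staging.Copy(range_addr, range_size);
                                   copies.push_back({offset, range_addr - buffer_start, range_size});
                               });
    return copies; // consumed by a single copyBuffer from the staging buffer
}

This is also why the old large-transfer path that allocated a temporary host buffer disappears from the diff: every upload now goes through the staging buffer handle passed to copyBuffer.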

buffer_cache.h

@@ -113,7 +113,7 @@ public:
void InvalidateMemory(VAddr device_addr, u64 size);
/// Waits on pending downloads in the logical page range.
void ReadMemory(VAddr device_addr, u64 size);
void ReadMemory(VAddr device_addr, u64 size, bool is_write = false);
/// Binds host vertex buffers for the current draw.
void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@@ -176,7 +176,7 @@ private:
return !buffer_id || slot_buffers[buffer_id].is_deleted;
}
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write);
[[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@@ -191,7 +191,8 @@ private:
template <bool insert>
void ChangeRegister(BufferId buffer_id);
void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_texel_buffer);
void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer);
bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
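Taken together with the .cpp changes, the new is_write parameter distinguishes readbacks triggered by a CPU write fault from plain reads. A hypothetical, simplified sketch of how the flag propagates; the three helpers are stand-ins for the tracker and download machinery, not real API:

#include <cstdint>

void CopyGpuToGuest(uint64_t, uint64_t) { /* issue the Vulkan download */ }
void UnmarkGpuModified(uint64_t, uint64_t) { /* tracker: clear GPU-dirty bits */ }
void MarkCpuModified(uint64_t, uint64_t) { /* tracker: set CPU-dirty bits */ }

void DownloadBufferMemory(uint64_t addr, uint64_t size, bool is_write) {
    CopyGpuToGuest(addr, size);
    UnmarkGpuModified(addr, size);
    if (is_write) {
        // The readback was caused by a CPU write fault, so the pages become
        // CPU-dirty again as soon as the download lands; InvalidateRegion
        // relies on this instead of changing the CPU state itself.
        MarkCpuModified(addr, size);
    }
}

void ReadMemory(uint64_t addr, uint64_t size, bool is_write = false) {
    DownloadBufferMemory(addr, size, is_write);
}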

memory_tracker.h

@@ -27,6 +27,7 @@ public:
bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<true>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
return manager->template IsRegionModified<Type::CPU>(offset, size);
});
}
@@ -35,6 +36,7 @@ public:
bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<false>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
return manager->template IsRegionModified<Type::GPU>(offset, size);
});
}
@@ -43,34 +45,57 @@ public:
void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) {
IteratePages<false>(dirty_cpu_addr, query_size,
[](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ChangeRegionState<Type::CPU, true>(
manager->GetCpuAddr() + offset, size);
});
}
/// Mark region as modified from the host GPU
void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
IteratePages<false>(dirty_cpu_addr, query_size,
[](RegionManager* manager, u64 offset, size_t size) {
manager->template ChangeRegionState<Type::GPU, true>(
manager->GetCpuAddr() + offset, size);
});
}
/// Unmark region as modified from the host GPU
void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
IteratePages<false>(dirty_cpu_addr, query_size,
[](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ChangeRegionState<Type::GPU, false>(
manager->GetCpuAddr() + offset, size);
});
}
/// Removes all protection from a page and ensures GPU data has been flushed if requested
void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept {
IteratePages<false>(
cpu_addr, size,
[try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) {
const bool should_flush = [&] {
// Perform both the GPU modification check and CPU state change with the lock
// in case we are racing with GPU thread trying to mark the page as GPU
// modified. If we need to flush the flush function is going to perform CPU
// state change.
std::scoped_lock lk{manager->lock};
if (try_flush && manager->template IsRegionModified<Type::GPU>(offset, size)) {
return true;
}
manager->template ChangeRegionState<Type::CPU, true>(
manager->GetCpuAddr() + offset, size);
return false;
}();
if (should_flush) {
on_flush();
}
});
}
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) {
IteratePages<true>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
[&func, is_written](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ForEachModifiedRange<Type::CPU, true>(
manager->GetCpuAddr() + offset, size, func);
if (is_written) {
manager->template ChangeRegionState<Type::GPU, true>(
manager->GetCpuAddr() + offset, size);
}
});
}
@@ -79,6 +104,7 @@ public:
void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
IteratePages<false>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
std::scoped_lock lk{manager->lock};
manager->template ForEachModifiedRange<Type::GPU, clear>(
manager->GetCpuAddr() + offset, size, func);
});

region_manager.h

@@ -3,9 +3,9 @@
#pragma once
#include <mutex>
#include "common/config.h"
#include "common/div_ceil.h"
#include "common/logging/log.h"
#ifdef __linux__
#include "common/adaptive_mutex.h"
@@ -19,6 +19,12 @@
namespace VideoCore {
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
using LockType = Common::AdaptiveMutex;
#else
using LockType = Common::SpinLock;
#endif
/**
* Allows tracking CPU and GPU modification of pages in a contigious 16MB virtual address region.
* Information is stored in bitsets for spacial locality and fast update of single pages.
@@ -80,7 +86,6 @@ public:
if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
return;
}
std::scoped_lock lk{lock};
RegionBits& bits = GetRegionBits<type>();
if constexpr (enable) {
@@ -113,15 +118,10 @@ public:
if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
return;
}
std::scoped_lock lk{lock};
RegionBits& bits = GetRegionBits<type>();
RegionBits mask(bits, start_page, end_page);
for (const auto& [start, end] : mask) {
func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
}
if constexpr (clear) {
bits.UnsetRange(start_page, end_page);
if constexpr (type == Type::CPU) {
@@ -130,6 +130,10 @@ public:
UpdateProtection<false, true>();
}
}
for (const auto& [start, end] : mask) {
func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
}
}
/**
@@ -147,13 +151,14 @@ public:
if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
return false;
}
std::scoped_lock lk{lock};
const RegionBits& bits = GetRegionBits<type>();
RegionBits test(bits, start_page, end_page);
return test.Any();
}
LockType lock;
private:
/**
* Notify tracker about changes in the CPU tracking state of a word in the buffer
@@ -179,11 +184,6 @@ private:
tracker->UpdatePageWatchersForRegion<track, is_read>(cpu_addr, mask);
}
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
Common::AdaptiveMutex lock;
#else
Common::SpinLock lock;
#endif
PageManager* tracker;
VAddr cpu_addr = 0;
RegionBits cpu;
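The region lock itself also moves: it becomes a public member typed through the LockType alias introduced above, and the RegionManager methods stop locking internally, so MemoryTracker can hold one critical section across a compound operation. A small sketch of the compile-time lock selection, using stand-in types in place of Common::AdaptiveMutex and Common::SpinLock:

#include <atomic>
#include <mutex>
#ifdef __linux__
#include <pthread.h> // exposes PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP on glibc
#endif

// Stand-in spinlock; the project provides Common::SpinLock.
class SpinLockSketch {
public:
    void lock() {
        while (flag.test_and_set(std::memory_order_acquire)) {
        }
    }
    void unlock() {
        flag.clear(std::memory_order_release);
    }

private:
    std::atomic_flag flag = ATOMIC_FLAG_INIT;
};

// Adaptive mutexes (spin briefly, then sleep) are used where glibc advertises
// them; otherwise these short critical sections fall back to plain spinning.
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
using LockType = std::mutex; // stand-in for Common::AdaptiveMutex
#else
using LockType = SpinLockSketch; // stand-in for Common::SpinLock
#endif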

page_manager.cpp

@@ -201,16 +201,17 @@ struct PageManager::Impl {
RENDERER_TRACE;
auto* memory = Core::Memory::Instance();
auto& impl = memory->GetAddressSpace();
// ASSERT(perms != Core::MemoryPermission::Write);
ASSERT_MSG(perms != Core::MemoryPermission::Write,
"Attempted to protect region as write-only which is not a valid permission");
impl.Protect(address, size, perms);
}
static bool GuestFaultSignalHandler(void* context, void* fault_address) {
const auto addr = reinterpret_cast<VAddr>(fault_address);
if (Common::IsWriteError(context)) {
return rasterizer->InvalidateMemory(addr, 1);
return rasterizer->InvalidateMemory(addr, 8);
} else {
return rasterizer->ReadMemory(addr, 1);
return rasterizer->ReadMemory(addr, 8);
}
return false;
}