buffer_cache: Bring back upload batching and temporary buffer (#3211)

* buffer_cache: Bring back upload batching and temporary buffer

A previous PR fused the write and read protections under a single function call, which made it necessary to move the actual memory copy inside the lambda so that it happens before the read protection kicks in. However, on certain large data transfers this could corrupt data. For example, if an upload consisted of two copies, one of 400MB and one of 300MB, the first would fit in the staging buffer, very likely with an induced stall. The second would not have room alongside the existing data, yet it is small enough for the buffer to hold it on its own, so the staging buffer would flush, wait, and copy it, overwriting the previous transfer.
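
As a rough illustration of the failure mode (a deliberately simplified linear staging allocator with hypothetical names, not the actual StreamBuffer code):

    #include <cstddef>
    #include <vector>

    // Simplified sketch assuming a 512 MB staging buffer; it only models the
    // wrap-around behaviour described above, not the real watch/tick tracking.
    struct RingStaging {
        std::vector<unsigned char> data = std::vector<unsigned char>(512ull << 20);
        std::size_t offset = 0;

        // Hands out `size` bytes; when the tail cannot hold the request, the real
        // buffer flushes/waits on the GPU and wraps back to the start.
        std::size_t Map(std::size_t size) {
            if (offset + size > data.size()) {
                offset = 0; // wrap: reuses space still backing copies not yet submitted to the GPU
            }
            const std::size_t result = offset;
            offset += size;
            return result;
        }
    };

    // With a 512 MB buffer: Map(400 MB) returns offset 0; Map(300 MB) cannot fit
    // behind it, wraps to offset 0, and the memcpy into it clobbers the 400 MB
    // payload before the batched GPU copy that references it has been submitted.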

To address this, the upload function has been reworked to allow batching as before while keeping the new locking behavior. In addition, the condition for using a temporary buffer has been expanded to also cover cases where the staging buffer would stall, which should slightly improve performance in some cases.
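
Condensed from the SynchronizeBuffer/ForEachUploadRange changes in the diff below (names abbreviated), the new two-phase flow looks roughly like this:

    // Phase 1 runs per modified range with the region lock held and only records the
    // copy; phase 2 runs once afterwards, still before the pages are re-marked as
    // GPU-modified, so the CPU data is copied out before read protection kicks in.
    boost::container::small_vector<vk::BufferCopy, 4> copies;
    size_t total_size_bytes = 0;
    vk::Buffer src_buffer = VK_NULL_HANDLE;
    memory_tracker->ForEachUploadRange(
        device_addr, size, is_written,
        [&](u64 range_addr, u64 range_size) {
            copies.emplace_back(total_size_bytes, range_addr - buffer_start, range_size);
            total_size_bytes += range_size;
        },
        // UploadCopies picks the shared staging buffer when it can serve the request,
        // or a temporary host-visible buffer otherwise (see UploadCopies below).
        [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); });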

* buffer_cache: Move buffer barriers and copy outside of lock range
TheTurtle 2025-07-08 10:32:39 +03:00 committed by GitHub
parent ddede4a52d
commit 2d1a2982df
6 changed files with 96 additions and 35 deletions

View file

@@ -137,12 +137,15 @@ StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler&
                    size_bytes);
 }
 
-std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
+std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment, bool allow_wait) {
     if (!is_coherent && usage == MemoryUsage::Stream) {
         size = Common::AlignUp(size, instance->NonCoherentAtomSize());
     }
 
-    ASSERT(size <= this->size_bytes);
+    if (size > this->size_bytes) {
+        return {nullptr, 0};
+    }
+
     mapped_size = size;
 
     if (alignment > 0) {
@@ -162,8 +165,11 @@ std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
     }
 
     const u64 mapped_upper_bound = offset + size;
-    WaitPendingOperations(mapped_upper_bound);
-    return std::make_pair(mapped_data.data() + offset, offset);
+    if (!WaitPendingOperations(mapped_upper_bound, allow_wait)) {
+        return {nullptr, 0};
+    }
+
+    return {mapped_data.data() + offset, offset};
 }
 
 void StreamBuffer::Commit() {
@@ -177,6 +183,12 @@ void StreamBuffer::Commit() {
     }
 
     offset += mapped_size;
+
+    if (current_watch_cursor != 0 &&
+        current_watches[current_watch_cursor].tick == scheduler->CurrentTick()) {
+        current_watches[current_watch_cursor].upper_bound = offset;
+        return;
+    }
     if (current_watch_cursor + 1 >= current_watches.size()) {
         // Ensure that there are enough watches.
         ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK);
@@ -191,16 +203,20 @@ void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_
     watches.resize(watches.size() + grow_size);
 }
 
-void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
+bool StreamBuffer::WaitPendingOperations(u64 requested_upper_bound, bool allow_wait) {
     if (!invalidation_mark) {
-        return;
+        return true;
     }
     while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
         auto& watch = previous_watches[wait_cursor];
-        wait_bound = watch.upper_bound;
+        if (!scheduler->IsFree(watch.tick) && !allow_wait) {
+            return false;
+        }
         scheduler->Wait(watch.tick);
+        wait_bound = watch.upper_bound;
         ++wait_cursor;
     }
+    return true;
 }
 
 } // namespace VideoCore

View file

@@ -168,7 +168,7 @@ public:
                  MemoryUsage usage, u64 size_bytes_);
 
     /// Reserves a region of memory from the stream buffer.
-    std::pair<u8*, u64> Map(u64 size, u64 alignment = 0);
+    std::pair<u8*, u64> Map(u64 size, u64 alignment = 0, bool allow_wait = true);
 
     /// Ensures that reserved bytes of memory are available to the GPU.
     void Commit();
@@ -181,10 +181,6 @@ public:
        return offset;
    }
 
-    u64 GetFreeSize() const {
-        return size_bytes - offset - mapped_size;
-    }
-
private:
    struct Watch {
        u64 tick{};
@@ -195,7 +191,7 @@ private:
    void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size);
 
    /// Waits pending watches until requested upper bound.
-    void WaitPendingOperations(u64 requested_upper_bound);
+    bool WaitPendingOperations(u64 requested_upper_bound, bool allow_wait);
 
private:
    u64 offset{};

View file

@@ -137,8 +137,7 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
         return;
     }
     memory_tracker->InvalidateRegion(
-        device_addr, size, Config::readbacks(),
-        [this, device_addr, size] { ReadMemory(device_addr, size, true); });
+        device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); });
 }
 
 void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
@@ -817,22 +816,22 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
 void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
                                     bool is_texel_buffer) {
     boost::container::small_vector<vk::BufferCopy, 4> copies;
+    size_t total_size_bytes = 0;
     VAddr buffer_start = buffer.CpuAddr();
+    vk::Buffer src_buffer = VK_NULL_HANDLE;
     memory_tracker->ForEachUploadRange(
-        device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) {
-            const u64 offset = staging_buffer.Copy(device_addr_out, range_size);
-            copies.push_back(vk::BufferCopy{
-                .srcOffset = offset,
-                .dstOffset = device_addr_out - buffer_start,
-                .size = range_size,
-            });
-        });
+        device_addr, size, is_written,
+        [&](u64 device_addr_out, u64 range_size) {
+            copies.emplace_back(total_size_bytes, device_addr_out - buffer_start, range_size);
+            total_size_bytes += range_size;
+        },
+        [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); });
     SCOPE_EXIT {
         if (is_texel_buffer) {
             SynchronizeBufferFromImage(buffer, device_addr, size);
         }
     };
-    if (copies.empty()) {
+    if (!src_buffer) {
         return;
     }
     scheduler.EndRendering();
@@ -861,7 +860,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
         .bufferMemoryBarrierCount = 1,
         .pBufferMemoryBarriers = &pre_barrier,
     });
-    cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies);
+    cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
     cmdbuf.pipelineBarrier2(vk::DependencyInfo{
         .dependencyFlags = vk::DependencyFlagBits::eByRegion,
         .bufferMemoryBarrierCount = 1,
@@ -869,6 +868,39 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
     });
 }
 
+vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span<vk::BufferCopy> copies,
+                                     size_t total_size_bytes) {
+    if (copies.empty()) {
+        return VK_NULL_HANDLE;
+    }
+    const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
+    if (staging) {
+        for (auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+            // Apply the staging offset
+            copy.srcOffset += offset;
+        }
+        staging_buffer.Commit();
+        return staging_buffer.Handle();
+    } else {
+        // For large one time transfers use a temporary host buffer.
+        auto temp_buffer =
+            std::make_unique<Buffer>(instance, scheduler, MemoryUsage::Upload, 0,
+                                     vk::BufferUsageFlagBits::eTransferSrc, total_size_bytes);
+        const vk::Buffer src_buffer = temp_buffer->Handle();
+        u8* const staging = temp_buffer->mapped_data.data();
+        for (const auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+        }
+        scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable { buffer.reset(); });
+        return src_buffer;
+    }
+}
+
 bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) {
     boost::container::small_vector<ImageId, 6> image_ids;
     texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) {

View file

@@ -194,6 +194,9 @@ private:
     void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
                            bool is_texel_buffer);
 
+    vk::Buffer UploadCopies(Buffer& buffer, std::span<vk::BufferCopy> copies,
+                            size_t total_size_bytes);
+
     bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
 
     void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);

View file

@@ -62,17 +62,17 @@ public:
     }
 
     /// Removes all protection from a page and ensures GPU data has been flushed if requested
-    void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept {
+    void InvalidateRegion(VAddr cpu_addr, u64 size, auto&& on_flush) noexcept {
         IteratePages<false>(
-            cpu_addr, size,
-            [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) {
+            cpu_addr, size, [&on_flush](RegionManager* manager, u64 offset, size_t size) {
                 const bool should_flush = [&] {
                     // Perform both the GPU modification check and CPU state change with the lock
                     // in case we are racing with GPU thread trying to mark the page as GPU
                     // modified. If we need to flush the flush function is going to perform CPU
                     // state change.
                     std::scoped_lock lk{manager->lock};
-                    if (try_flush && manager->template IsRegionModified<Type::GPU>(offset, size)) {
+                    if (Config::readbacks() &&
+                        manager->template IsRegionModified<Type::GPU>(offset, size)) {
                         return true;
                     }
                     manager->template ChangeRegionState<Type::CPU, true>(
@@ -86,16 +86,26 @@ public:
     }
 
     /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
-    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) {
+    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func,
+                            auto&& on_upload) {
         IteratePages<true>(query_cpu_range, query_size,
                            [&func, is_written](RegionManager* manager, u64 offset, size_t size) {
-                               std::scoped_lock lk{manager->lock};
+                               manager->lock.lock();
                                manager->template ForEachModifiedRange<Type::CPU, true>(
                                    manager->GetCpuAddr() + offset, size, func);
-                               if (is_written) {
-                                   manager->template ChangeRegionState<Type::GPU, true>(
-                                       manager->GetCpuAddr() + offset, size);
-                               }
+                               if (!is_written) {
+                                   manager->lock.unlock();
+                               }
                            });
+        on_upload();
+        if (!is_written) {
+            return;
+        }
+        IteratePages<false>(query_cpu_range, query_size,
+                           [&func, is_written](RegionManager* manager, u64 offset, size_t size) {
+                               manager->template ChangeRegionState<Type::GPU, true>(
+                                   manager->GetCpuAddr() + offset, size);
+                               manager->lock.unlock();
+                           });
     }

View file

@@ -347,7 +347,11 @@ public:
     }
 
     /// Returns true when a tick has been triggered by the GPU.
-    [[nodiscard]] bool IsFree(u64 tick) const noexcept {
+    [[nodiscard]] bool IsFree(u64 tick) noexcept {
+        if (master_semaphore.IsFree(tick)) {
+            return true;
+        }
+        master_semaphore.Refresh();
         return master_semaphore.IsFree(tick);
     }