Mirror of https://github.com/shadps4-emu/shadPS4.git, synced 2025-07-12 04:35:56 +00:00
buffer_cache: Bring back upload batching and temporary buffer (#3211)
* buffer_cache: Bring back upload batching and temporary buffer

  Because that PR fused the write and read protections under a single function call, the actual memory copy had to move inside the lambda so that it runs before the read protection kicks in. On certain large data transfers, however, this had the potential for data corruption. If, for example, an upload consisted of two copies, a 400 MB one and a 300 MB one, the first would fit in the staging buffer, very likely with an induced stall. The second would not have space to fit alongside the existing data, yet it is still small enough for the buffer to hold on its own, so the staging buffer would flush, wait, and copy it, overwriting the previous transfer. To address this, the upload function has been reworked to allow batching as before while keeping the new locking behavior. The condition for using temporary buffers has also been expanded to cover cases where the staging buffer would stall, which should slightly improve performance in some cases.

* buffer_cache: Move buffer barriers and copy outside of lock range
parent ddede4a52d
commit 2d1a2982df
6 changed files with 96 additions and 35 deletions
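To make the failure mode concrete, here is a small self-contained arithmetic sketch. The 512 MB ring capacity is a made-up figure for illustration (the real staging buffer size differs); the 400 MB and 300 MB copies are the ones from the description above.

#include <cstdio>

int main() {
    // Hypothetical numbers: a 512 MB staging ring (the real capacity differs),
    // with the 400 MB and 300 MB copies from the scenario above.
    const unsigned long long capacity = 512;
    unsigned long long offset = 0;      // next free position in the ring (MB)
    unsigned long long pending_end = 0; // end of copy A, still unconsumed

    // Copy A: 400 MB fits, possibly after an induced stall on older work.
    const unsigned long long first = 400;
    std::printf("copy A occupies [%llu, %llu)\n", offset, offset + first);
    pending_end = offset + first;
    offset += first;

    // Copy B: 300 MB does not fit in the remaining 112 MB, but it is smaller
    // than the whole ring, so the old path would flush, wait, and wrap to 0.
    const unsigned long long second = 300;
    if (offset + second > capacity && second <= capacity) {
        offset = 0;
    }
    std::printf("copy B occupies [%llu, %llu)\n", offset, offset + second);

    // Copy A's bytes have not been consumed by the GPU yet, so the wrap
    // lands on top of them and corrupts the first upload.
    if (offset < pending_end) {
        std::printf("copy B overlaps copy A's pending bytes: corruption\n");
    }
}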
@@ -137,12 +137,15 @@ StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler&
                   size_bytes);
 }
 
-std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
+std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment, bool allow_wait) {
     if (!is_coherent && usage == MemoryUsage::Stream) {
         size = Common::AlignUp(size, instance->NonCoherentAtomSize());
     }
 
-    ASSERT(size <= this->size_bytes);
+    if (size > this->size_bytes) {
+        return {nullptr, 0};
+    }
+
     mapped_size = size;
 
     if (alignment > 0) {
@@ -162,8 +165,11 @@ std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
     }
 
     const u64 mapped_upper_bound = offset + size;
-    WaitPendingOperations(mapped_upper_bound);
-    return std::make_pair(mapped_data.data() + offset, offset);
+    if (!WaitPendingOperations(mapped_upper_bound, allow_wait)) {
+        return {nullptr, 0};
+    }
+
+    return {mapped_data.data() + offset, offset};
 }
 
 void StreamBuffer::Commit() {
@@ -177,6 +183,12 @@ void StreamBuffer::Commit() {
     }
 
     offset += mapped_size;
+    if (current_watch_cursor != 0 &&
+        current_watches[current_watch_cursor].tick == scheduler->CurrentTick()) {
+        current_watches[current_watch_cursor].upper_bound = offset;
+        return;
+    }
+
     if (current_watch_cursor + 1 >= current_watches.size()) {
         // Ensure that there are enough watches.
         ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK);
@@ -191,16 +203,20 @@ void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_
     watches.resize(watches.size() + grow_size);
 }
 
-void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
+bool StreamBuffer::WaitPendingOperations(u64 requested_upper_bound, bool allow_wait) {
     if (!invalidation_mark) {
-        return;
+        return true;
     }
     while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
         auto& watch = previous_watches[wait_cursor];
-        wait_bound = watch.upper_bound;
+        if (!scheduler->IsFree(watch.tick) && !allow_wait) {
+            return false;
+        }
         scheduler->Wait(watch.tick);
+        wait_bound = watch.upper_bound;
         ++wait_cursor;
     }
+    return true;
 }
 
 } // namespace VideoCore
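A minimal caller-side sketch of the new non-blocking Map path, using a toy ring in place of the real StreamBuffer (ToyStreamBuffer and its sizes are illustrative, not shadPS4 code): when Map refuses rather than stalling, the caller can fall back to a dedicated upload buffer, which is what UploadCopies does in the buffer cache changes below.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Toy stand-in for StreamBuffer::Map(size, alignment, allow_wait): refuses the
// request instead of stalling when space has run out and allow_wait is false.
struct ToyStreamBuffer {
    std::vector<std::uint8_t> ring = std::vector<std::uint8_t>(64);
    std::uint64_t offset = 0;

    std::pair<std::uint8_t*, std::uint64_t> Map(std::uint64_t size, bool allow_wait) {
        if (size > ring.size()) {
            return {nullptr, 0}; // larger than the whole ring
        }
        if (offset + size > ring.size()) {
            if (!allow_wait) {
                return {nullptr, 0}; // would have to wait on the GPU: refuse
            }
            offset = 0; // pretend we waited for the GPU, then wrap around
        }
        const std::uint64_t at = offset;
        offset += size;
        return {ring.data() + at, at};
    }
};

int main() {
    ToyStreamBuffer staging;
    for (const std::uint64_t request : {48ull, 32ull}) {
        const auto [ptr, at] = staging.Map(request, /*allow_wait=*/false);
        if (ptr != nullptr) {
            std::printf("request %llu -> staging offset %llu\n",
                        static_cast<unsigned long long>(request),
                        static_cast<unsigned long long>(at));
        } else {
            // Fall back to a temporary upload buffer, as UploadCopies() does.
            std::printf("request %llu refused -> temporary buffer\n",
                        static_cast<unsigned long long>(request));
        }
    }
}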
@@ -168,7 +168,7 @@ public:
                  MemoryUsage usage, u64 size_bytes_);
 
     /// Reserves a region of memory from the stream buffer.
-    std::pair<u8*, u64> Map(u64 size, u64 alignment = 0);
+    std::pair<u8*, u64> Map(u64 size, u64 alignment = 0, bool allow_wait = true);
 
     /// Ensures that reserved bytes of memory are available to the GPU.
     void Commit();
@@ -181,10 +181,6 @@ public:
         return offset;
     }
 
-    u64 GetFreeSize() const {
-        return size_bytes - offset - mapped_size;
-    }
-
 private:
     struct Watch {
         u64 tick{};
@@ -195,7 +191,7 @@ private:
     void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size);
 
     /// Waits pending watches until requested upper bound.
-    void WaitPendingOperations(u64 requested_upper_bound);
+    bool WaitPendingOperations(u64 requested_upper_bound, bool allow_wait);
 
 private:
     u64 offset{};
@@ -137,8 +137,7 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
         return;
     }
     memory_tracker->InvalidateRegion(
-        device_addr, size, Config::readbacks(),
-        [this, device_addr, size] { ReadMemory(device_addr, size, true); });
+        device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); });
 }
 
 void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
@@ -817,22 +816,22 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
 void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
                                     bool is_texel_buffer) {
     boost::container::small_vector<vk::BufferCopy, 4> copies;
+    size_t total_size_bytes = 0;
     VAddr buffer_start = buffer.CpuAddr();
+    vk::Buffer src_buffer = VK_NULL_HANDLE;
     memory_tracker->ForEachUploadRange(
-        device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) {
-            const u64 offset = staging_buffer.Copy(device_addr_out, range_size);
-            copies.push_back(vk::BufferCopy{
-                .srcOffset = offset,
-                .dstOffset = device_addr_out - buffer_start,
-                .size = range_size,
-            });
-        });
+        device_addr, size, is_written,
+        [&](u64 device_addr_out, u64 range_size) {
+            copies.emplace_back(total_size_bytes, device_addr_out - buffer_start, range_size);
+            total_size_bytes += range_size;
+        },
+        [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); });
     SCOPE_EXIT {
         if (is_texel_buffer) {
             SynchronizeBufferFromImage(buffer, device_addr, size);
         }
     };
-    if (copies.empty()) {
+    if (!src_buffer) {
         return;
     }
     scheduler.EndRendering();
@@ -861,7 +860,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
         .bufferMemoryBarrierCount = 1,
         .pBufferMemoryBarriers = &pre_barrier,
     });
-    cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies);
+    cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
     cmdbuf.pipelineBarrier2(vk::DependencyInfo{
         .dependencyFlags = vk::DependencyFlagBits::eByRegion,
         .bufferMemoryBarrierCount = 1,
@@ -869,6 +868,39 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
     });
 }
 
+vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span<vk::BufferCopy> copies,
+                                     size_t total_size_bytes) {
+    if (copies.empty()) {
+        return VK_NULL_HANDLE;
+    }
+    const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
+    if (staging) {
+        for (auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+            // Apply the staging offset
+            copy.srcOffset += offset;
+        }
+        staging_buffer.Commit();
+        return staging_buffer.Handle();
+    } else {
+        // For large one time transfers use a temporary host buffer.
+        auto temp_buffer =
+            std::make_unique<Buffer>(instance, scheduler, MemoryUsage::Upload, 0,
+                                     vk::BufferUsageFlagBits::eTransferSrc, total_size_bytes);
+        const vk::Buffer src_buffer = temp_buffer->Handle();
+        u8* const staging = temp_buffer->mapped_data.data();
+        for (const auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+        }
+        scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable { buffer.reset(); });
+        return src_buffer;
+    }
+}
+
 bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) {
     boost::container::small_vector<ImageId, 6> image_ids;
     texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) {
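One detail worth calling out in the temporary-buffer path above: the host buffer has to outlive the GPU copy that reads from it, which is why ownership moves into scheduler.DeferOperation instead of dying at the end of UploadCopies. A toy sketch of that lifetime pattern, with made-up DeferRelease/OnTickFinished names standing in for the real scheduler API:

#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

struct ToyUploadBuffer {
    ~ToyUploadBuffer() { std::printf("temporary upload buffer released\n"); }
};

// Stand-in for the scheduler: resources parked here are only destroyed once
// the GPU work recorded in the current tick is known to be complete.
struct ToyScheduler {
    std::vector<std::unique_ptr<ToyUploadBuffer>> parked;

    void DeferRelease(std::unique_ptr<ToyUploadBuffer> buf) {
        parked.push_back(std::move(buf));
    }

    void OnTickFinished() {
        parked.clear(); // now it is safe to free what the GPU was reading
    }
};

int main() {
    ToyScheduler scheduler;
    {
        auto temp = std::make_unique<ToyUploadBuffer>();
        // Record the GPU copy that reads from *temp, then park the buffer.
        scheduler.DeferRelease(std::move(temp));
    } // leaving scope does not destroy the buffer
    std::printf("GPU copy still in flight...\n");
    scheduler.OnTickFinished(); // destructor runs here
}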
@@ -194,6 +194,9 @@ private:
     void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
                            bool is_texel_buffer);
 
+    vk::Buffer UploadCopies(Buffer& buffer, std::span<vk::BufferCopy> copies,
+                            size_t total_size_bytes);
+
     bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
 
     void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
@@ -62,17 +62,17 @@ public:
     }
 
     /// Removes all protection from a page and ensures GPU data has been flushed if requested
-    void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept {
+    void InvalidateRegion(VAddr cpu_addr, u64 size, auto&& on_flush) noexcept {
         IteratePages<false>(
-            cpu_addr, size,
-            [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) {
+            cpu_addr, size, [&on_flush](RegionManager* manager, u64 offset, size_t size) {
                 const bool should_flush = [&] {
                     // Perform both the GPU modification check and CPU state change with the lock
                     // in case we are racing with GPU thread trying to mark the page as GPU
                     // modified. If we need to flush the flush function is going to perform CPU
                     // state change.
                     std::scoped_lock lk{manager->lock};
-                    if (try_flush && manager->template IsRegionModified<Type::GPU>(offset, size)) {
+                    if (Config::readbacks() &&
+                        manager->template IsRegionModified<Type::GPU>(offset, size)) {
                         return true;
                     }
                     manager->template ChangeRegionState<Type::CPU, true>(
@@ -86,16 +86,26 @@ public:
     }
 
     /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
-    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) {
+    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func,
+                            auto&& on_upload) {
         IteratePages<true>(query_cpu_range, query_size,
                            [&func, is_written](RegionManager* manager, u64 offset, size_t size) {
-                               std::scoped_lock lk{manager->lock};
+                               manager->lock.lock();
                                manager->template ForEachModifiedRange<Type::CPU, true>(
                                    manager->GetCpuAddr() + offset, size, func);
-                               if (is_written) {
+                               if (!is_written) {
+                                   manager->lock.unlock();
+                               }
+                           });
+        on_upload();
+        if (!is_written) {
+            return;
+        }
+        IteratePages<false>(query_cpu_range, query_size,
+                            [&func, is_written](RegionManager* manager, u64 offset, size_t size) {
                                manager->template ChangeRegionState<Type::GPU, true>(
                                    manager->GetCpuAddr() + offset, size);
-                               }
+                               manager->lock.unlock();
                            });
     }
 
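For reference, the reworked ForEachUploadRange now takes two callables: 'func' runs once per CPU-modified range while the region locks are held, and 'on_upload' runs once after all ranges are gathered, before written regions are unlocked and marked GPU-modified. A reduced sketch of that call shape, with a toy tracker in place of the real MemoryTracker:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Toy tracker: pretends two CPU-modified ranges exist, reports them to 'func'
// first, then invokes 'on_upload' once, mirroring the call order of the
// reworked ForEachUploadRange.
struct ToyTracker {
    template <typename Func, typename OnUpload>
    void ForEachUploadRange(Func&& func, OnUpload&& on_upload) {
        // Phase 1: report ranges while (conceptually) holding the region locks.
        func(/*addr=*/0x1000, /*size=*/0x200);
        func(/*addr=*/0x4000, /*size=*/0x100);
        // Phase 2: let the caller copy everything in one batch before the
        // (conceptual) read protection is applied to written regions.
        on_upload();
    }
};

int main() {
    ToyTracker tracker;
    std::vector<std::pair<std::uint64_t, std::uint64_t>> copies;
    std::uint64_t total = 0;
    tracker.ForEachUploadRange(
        [&](std::uint64_t addr, std::uint64_t size) {
            copies.emplace_back(addr, size); // just record; no memcpy yet
            total += size;
        },
        [&] {
            // One batched upload for all recorded ranges.
            std::printf("uploading %zu ranges, %llu bytes total\n", copies.size(),
                        static_cast<unsigned long long>(total));
        });
}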
@@ -347,7 +347,11 @@ public:
     }
 
     /// Returns true when a tick has been triggered by the GPU.
-    [[nodiscard]] bool IsFree(u64 tick) const noexcept {
+    [[nodiscard]] bool IsFree(u64 tick) noexcept {
+        if (master_semaphore.IsFree(tick)) {
+            return true;
+        }
+        master_semaphore.Refresh();
         return master_semaphore.IsFree(tick);
     }
 
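IsFree drops its const qualifier above because it may now call master_semaphore.Refresh(), updating the cached timeline value from the driver before declaring a tick busy. A toy model of that check-then-refresh pattern (ToySemaphore and ToyScheduler are illustrative stand-ins, not the real classes):

#include <cstdint>
#include <cstdio>

// Stand-in for the timeline semaphore: keeps a cached counter value that can
// lag behind what the GPU has actually signaled.
struct ToySemaphore {
    std::uint64_t cached_value = 0;
    std::uint64_t gpu_value = 5; // pretend the GPU already reached tick 5

    bool IsFree(std::uint64_t tick) const { return tick <= cached_value; }
    void Refresh() { cached_value = gpu_value; } // query the driver
};

struct ToyScheduler {
    ToySemaphore master_semaphore;

    // Mirrors the new Scheduler::IsFree: cheap cached check first, then one
    // refresh before reporting the tick as still pending.
    bool IsFree(std::uint64_t tick) {
        if (master_semaphore.IsFree(tick)) {
            return true;
        }
        master_semaphore.Refresh();
        return master_semaphore.IsFree(tick);
    }
};

int main() {
    ToyScheduler scheduler;
    // The cached counter says tick 3 is pending, but the refresh reveals the
    // GPU has already passed it, so no CPU stall is needed.
    std::printf("tick 3 free: %s\n", scheduler.IsFree(3) ? "yes" : "no");
}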