Readbacks proof of concept rebased (#3178)

* Readbacks proof of concept
* liverpool: Use span for acb too
* config: Add readbacks config option
* config: Log readbacks
2025-07-12 04:35:56 +00:00 · 2025-07-01 23:41:00 +03:00 · 2025-07-01 23:41:00 +03:00 · 0594dac405
commit 0594dac405
parent 5789fd881c
17 changed files with 375 additions and 186 deletions
--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@ -51,6 +51,7 @@ static bool isShowSplash = false;
 static std::string isSideTrophy = "right";
 static bool isNullGpu = false;
 static bool shouldCopyGPUBuffers = false;
+static bool readbacksEnabled = false;
 static bool shouldDumpShaders = false;
 static bool shouldPatchShaders = true;
 static u32 vblankDivider = 1;
@ -240,6 +241,10 @@ bool copyGPUCmdBuffers() {
    return shouldCopyGPUBuffers;
 }

+bool readbacks() { // Reports whether the GPU "readbacks" config option is enabled.
+    return readbacksEnabled;
+}
+
 bool dumpShaders() {
    return shouldDumpShaders;
 }
@ -344,6 +349,10 @@ void setCopyGPUCmdBuffers(bool enable) {
    shouldCopyGPUBuffers = enable;
 }

+void setReadbacks(bool enable) { // Updates the in-memory flag only; save() writes it to GPU.readbacks.
+    readbacksEnabled = enable;
+}
+
 void setDumpShaders(bool enable) {
    shouldDumpShaders = enable;
 }
@ -586,6 +595,7 @@ void load(const std::filesystem::path& path) {
        screenHeight = toml::find_or<int>(gpu, "screenHeight", screenHeight);
        isNullGpu = toml::find_or<bool>(gpu, "nullGpu", false);
        shouldCopyGPUBuffers = toml::find_or<bool>(gpu, "copyGPUBuffers", false);
+        readbacksEnabled = toml::find_or<bool>(gpu, "readbacks", false);
        shouldDumpShaders = toml::find_or<bool>(gpu, "dumpShaders", false);
        shouldPatchShaders = toml::find_or<bool>(gpu, "patchShaders", true);
        vblankDivider = toml::find_or<int>(gpu, "vblankDivider", 1);
@ -735,6 +745,7 @@ void save(const std::filesystem::path& path) {
    data["GPU"]["screenHeight"] = screenHeight;
    data["GPU"]["nullGpu"] = isNullGpu;
    data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers;
+    data["GPU"]["readbacks"] = readbacksEnabled;
    data["GPU"]["dumpShaders"] = shouldDumpShaders;
    data["GPU"]["patchShaders"] = shouldPatchShaders;
    data["GPU"]["vblankDivider"] = vblankDivider;
--- a/src/common/config.h
+++ b/src/common/config.h
@ -45,6 +45,8 @@ bool nullGpu();
 void setNullGpu(bool enable);
 bool copyGPUCmdBuffers();
 void setCopyGPUCmdBuffers(bool enable);
+bool readbacks();
+void setReadbacks(bool enable);
 bool dumpShaders();
 void setDumpShaders(bool enable);
 u32 vblankDiv();
--- a/src/core/address_space.cpp
+++ b/src/core/address_space.cpp
@ -302,14 +302,15 @@ struct AddressSpace::Impl {
            new_flags = PAGE_READWRITE;
        } else if (read && !write) {
            new_flags = PAGE_READONLY;
-        } else if (execute && !read && not write) {
+        } else if (execute && !read && !write) {
            new_flags = PAGE_EXECUTE;
        } else if (!read && !write && !execute) {
            new_flags = PAGE_NOACCESS;
        } else {
            LOG_CRITICAL(Common_Memory,
-                         "Unsupported protection flag combination for address {:#x}, size {}",
-                         virtual_addr, size);
+                         "Unsupported protection flag combination for address {:#x}, size {}, "
+                         "read={}, write={}, execute={}",
+                         virtual_addr, size, read, write, execute);
            return;
        }

--- a/src/core/address_space.h
+++ b/src/core/address_space.h
@ -11,6 +11,7 @@
 namespace Core {

 enum class MemoryPermission : u32 {
+    None = 0,
    Read = 1 << 0,
    Write = 1 << 1,
    ReadWrite = Read | Write,
--- a/src/core/libraries/gnmdriver/gnmdriver.cpp
+++ b/src/core/libraries/gnmdriver/gnmdriver.cpp
@ -2834,7 +2834,7 @@ void RegisterlibSceGnmDriver(Core::Loader::SymbolsResolver* sym) {
    }

    if (Config::copyGPUCmdBuffers()) {
-        liverpool->reserveCopyBufferSpace();
+        liverpool->ReserveCopyBufferSpace();
    }

    Platform::IrqC::Instance()->Register(Platform::InterruptId::GpuIdle, ResetSubmissionLock,
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@ -132,6 +132,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector<std::string> ar
    LOG_INFO(Config, "General LogType: {}", Config::getLogType());
    LOG_INFO(Config, "General isNeo: {}", Config::isNeoModeConsole());
    LOG_INFO(Config, "GPU isNullGpu: {}", Config::nullGpu());
+    LOG_INFO(Config, "GPU readbacks: {}", Config::readbacks());
    LOG_INFO(Config, "GPU shouldDumpShaders: {}", Config::dumpShaders());
    LOG_INFO(Config, "GPU vblankDivider: {}", Config::vblankDiv());
    LOG_INFO(Config, "Vulkan gpuId: {}", Config::getGpuId());
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@ -72,8 +72,23 @@ Liverpool::~Liverpool() {
    process_thread.join();
 }

+void Liverpool::ProcessCommands() { // Drains command_queue, invoking each queued callback in FIFO order.
+    // Process incoming commands with high priority
+    while (num_commands) {
+        Common::UniqueFunction<void> callback{};
+        {
+            std::scoped_lock lk{submit_mutex}; // Held only while dequeuing one entry.
+            callback = std::move(command_queue.front());
+            command_queue.pop();
+            --num_commands;
+        }
+        callback(); // Invoked after the lock scope closed, so submit_mutex is not held here.
+    }
+}
+
 void Liverpool::Process(std::stop_token stoken) {
    Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
+    gpu_id = std::this_thread::get_id();

    while (!stoken.stop_requested()) {
        {
@ -90,18 +105,7 @@ void Liverpool::Process(std::stop_token stoken) {
        curr_qid = -1;

        while (num_submits || num_commands) {
-
-            // Process incoming commands with high priority
-            while (num_commands) {
-                Common::UniqueFunction<void> callback{};
-                {
-                    std::unique_lock lk{submit_mutex};
-                    callback = std::move(command_queue.front());
-                    command_queue.pop();
-                    --num_commands;
-                }
-                callback();
-            }
+            ProcessCommands();

            curr_qid = (curr_qid + 1) % num_mapped_queues;

@ -147,6 +151,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
    FIBER_ENTER(ccb_task_name);

    while (!ccb.empty()) {
+        ProcessCommands();
+
        const auto* header = reinterpret_cast<const PM4Header*>(ccb.data());
        const u32 type = header->type;
        if (type != 3) {
@ -224,6 +230,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c

    const auto base_addr = reinterpret_cast<uintptr_t>(dcb.data());
    while (!dcb.empty()) {
+        ProcessCommands();
+
        const auto* header = reinterpret_cast<const PM4Header*>(dcb.data());
        const u32 type = header->type;

@ -638,9 +646,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
                } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                            dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                           dma_data->dst_sel == DmaDataDst::Gds) {
-                    rasterizer->InlineData(dma_data->dst_addr_lo,
-                                           dma_data->SrcAddress<const void*>(),
-                                           dma_data->NumBytes(), true);
+                    rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
+                                           dma_data->NumBytes(), true, false);
                } else if (dma_data->src_sel == DmaDataSrc::Data &&
                           (dma_data->dst_sel == DmaDataDst::Memory ||
                            dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@ -649,14 +656,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
                } else if (dma_data->src_sel == DmaDataSrc::Gds &&
                           (dma_data->dst_sel == DmaDataDst::Memory ||
                            dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                    // LOG_WARNING(Render_Vulkan, "GDS memory read");
+                    rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
+                                           dma_data->NumBytes(), false, true);
                } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                            dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                           (dma_data->dst_sel == DmaDataDst::Memory ||
                            dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                    rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
-                                           dma_data->SrcAddress<const void*>(),
-                                           dma_data->NumBytes(), false);
+                    rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(),
+                                           dma_data->SrcAddress<VAddr>(), dma_data->NumBytes(),
+                                           false, false);
                } else {
                    UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
                                    u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@ -702,6 +710,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
                break;
            }
            case PM4ItOpcode::Rewind: {
+                if (!rasterizer) {
+                    break;
+                }
                const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
                while (!rewind->Valid()) {
                    YIELD_GFX();
@ -801,29 +812,32 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
 }

 template <bool is_indirect>
-Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid) {
+Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
    FIBER_ENTER(acb_task_name[vqid]);
    auto& queue = asc_queues[{vqid}];

-    auto base_addr = reinterpret_cast<VAddr>(acb);
-    while (acb_dwords > 0) {
-        auto* header = reinterpret_cast<const PM4Header*>(acb);
+    auto base_addr = reinterpret_cast<VAddr>(acb.data());
+    while (!acb.empty()) {
+        ProcessCommands();
+
+        auto* header = reinterpret_cast<const PM4Header*>(acb.data());
        u32 next_dw_off = header->type3.NumWords() + 1;

        // If we have a buffered packet, use it.
        if (queue.tmp_dwords > 0) [[unlikely]] {
            header = reinterpret_cast<const PM4Header*>(queue.tmp_packet.data());
            next_dw_off = header->type3.NumWords() + 1 - queue.tmp_dwords;
-            std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb, next_dw_off * sizeof(u32));
+            std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb.data(),
+                        next_dw_off * sizeof(u32));
            queue.tmp_dwords = 0;
        }

        // If the packet is split across ring boundary, buffer until next submission
-        if (next_dw_off > acb_dwords) [[unlikely]] {
-            std::memcpy(queue.tmp_packet.data(), acb, acb_dwords * sizeof(u32));
-            queue.tmp_dwords = acb_dwords;
+        if (next_dw_off > acb.size()) [[unlikely]] {
+            std::memcpy(queue.tmp_packet.data(), acb.data(), acb.size_bytes());
+            queue.tmp_dwords = acb.size();
            if constexpr (!is_indirect) {
-                *queue.read_addr += acb_dwords;
+                *queue.read_addr += acb.size();
                *queue.read_addr %= queue.ring_size_dw;
            }
            break;
@ -832,9 +846,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
        if (header->type == 2) {
            // Type-2 packet are used for padding purposes
            next_dw_off = 1;
-            acb += next_dw_off;
-            acb_dwords -= next_dw_off;
-
+            acb = NextPacket(acb, next_dw_off);
            if constexpr (!is_indirect) {
                *queue.read_addr += next_dw_off;
                *queue.read_addr %= queue.ring_size_dw;
@ -856,8 +868,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
        }
        case PM4ItOpcode::IndirectBuffer: {
            const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
-            auto task = ProcessCompute<true>(indirect_buffer->Address<const u32>(),
-                                             indirect_buffer->ib_size, vqid);
+            auto task = ProcessCompute<true>(
+                {indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, vqid);
            RESUME_ASC(task, vqid);

            while (!task.handle.done()) {
@ -876,8 +888,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
            } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                        dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                       dma_data->dst_sel == DmaDataDst::Gds) {
-                rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress<const void*>(),
-                                       dma_data->NumBytes(), true);
+                rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
+                                       dma_data->NumBytes(), true, false);
            } else if (dma_data->src_sel == DmaDataSrc::Data &&
                       (dma_data->dst_sel == DmaDataDst::Memory ||
                        dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@ -886,14 +898,14 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
            } else if (dma_data->src_sel == DmaDataSrc::Gds &&
                       (dma_data->dst_sel == DmaDataDst::Memory ||
                        dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                // LOG_WARNING(Render_Vulkan, "GDS memory read");
+                rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
+                                       dma_data->NumBytes(), false, true);
            } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                        dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                       (dma_data->dst_sel == DmaDataDst::Memory ||
                        dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
-                                       dma_data->SrcAddress<const void*>(), dma_data->NumBytes(),
-                                       false);
+                rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->SrcAddress<VAddr>(),
+                                       dma_data->NumBytes(), false, false);
            } else {
                UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
                                u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@ -904,6 +916,9 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
            break;
        }
        case PM4ItOpcode::Rewind: {
+            if (!rasterizer) {
+                break;
+            }
            const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
            while (!rewind->Valid()) {
                YIELD_ASC(vqid);
@ -1016,8 +1031,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
                            static_cast<u32>(opcode), header->type3.NumWords());
        }

-        acb += next_dw_off;
-        acb_dwords -= next_dw_off;
+        acb = NextPacket(acb, next_dw_off);

        if constexpr (!is_indirect) {
            *queue.read_addr += next_dw_off;
@ -1087,7 +1101,7 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span<const u32> acb) {
    auto& queue = mapped_queues[gnm_vqid];

    const auto vqid = gnm_vqid - 1;
-    const auto& task = ProcessCompute(acb.data(), acb.size(), vqid);
+    const auto& task = ProcessCompute(acb, vqid);
    {
        std::scoped_lock lock{queue.m_access};
        queue.submits.emplace(task.handle);
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@ -8,6 +8,7 @@
 #include <coroutine>
 #include <exception>
 #include <mutex>
+#include <semaphore>
 #include <span>
 #include <thread>
 #include <vector>
@ -1512,14 +1513,32 @@ public:
        rasterizer = rasterizer_;
    }

-    void SendCommand(Common::UniqueFunction<void>&& func) {
-        std::scoped_lock lk{submit_mutex};
-        command_queue.emplace(std::move(func));
-        ++num_commands;
-        submit_cv.notify_one();
+    template <bool wait_done = false> // wait_done = true blocks the caller until func has run.
+    void SendCommand(auto&& func) { // Runs func inline on the gpu_id thread, else queues it.
+        if (std::this_thread::get_id() == gpu_id) { // gpu_id is assigned at the top of Process().
+            return func(); // NOTE(review): presumably avoids waiting on our own queue - confirm.
+        }
+        if constexpr (wait_done) {
+            std::binary_semaphore sem{0}; // Starts unavailable; released by the queued wrapper.
+            {
+                std::scoped_lock lk{submit_mutex};
+                command_queue.emplace([&sem, &func] { // By-ref capture relies on acquire() below.
+                    func();
+                    sem.release();
+                });
+                ++num_commands;
+                submit_cv.notify_one();
+            }
+            sem.acquire(); // Blocks until the wrapper above has executed func.
+        } else {
+            std::scoped_lock lk{submit_mutex};
+            command_queue.emplace(std::move(func)); // NOTE(review): also moves from lvalue args.
+            ++num_commands;
+            submit_cv.notify_one();
+        }
    }

-    void reserveCopyBufferSpace() {
+    void ReserveCopyBufferSpace() {
        GpuQueue& gfx_queue = mapped_queues[GfxQueueId];
        std::scoped_lock<std::mutex> lk(gfx_queue.m_access);

@ -1581,8 +1600,9 @@ private:
    Task ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb);
    Task ProcessCeUpdate(std::span<const u32> ccb);
    template <bool is_indirect = false>
-    Task ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid);
+    Task ProcessCompute(std::span<const u32> acb, u32 vqid);

+    void ProcessCommands();
    void Process(std::stop_token stoken);

    struct GpuQueue {
@ -1626,6 +1646,7 @@ private:
    std::mutex submit_mutex;
    std::condition_variable_any submit_cv;
    std::queue<Common::UniqueFunction<void>> command_queue{};
+    std::thread::id gpu_id;
    int curr_qid{-1};
 };

--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@ -3,12 +3,14 @@

 #include <algorithm>
 #include "common/alignment.h"
+#include "common/config.h"
 #include "common/debug.h"
 #include "common/scope_exit.h"
 #include "common/types.h"
 #include "core/memory.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/buffer_cache/memory_tracker.h"
 #include "video_core/host_shaders/fault_buffer_process_comp.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
@ -27,10 +29,10 @@ static constexpr size_t DeviceBufferSize = 128_MB;
 static constexpr size_t MaxPageFaults = 1024;

 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
-                         Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
-                         TextureCache& texture_cache_, PageManager& tracker_)
-    : instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_},
-      memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_},
+                         AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
+                         PageManager& tracker)
+    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
+      memory{Core::Memory::Instance()}, texture_cache{texture_cache_},
      staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
      stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
      download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize},
@ -38,13 +40,14 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
      gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
      bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal,
                           0,        AllFlags,  BDA_PAGETABLE_SIZE},
-      fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE),
-      memory_tracker{tracker} {
+      fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE) {
    Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
    Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(),
                          "BDA Page Table Buffer");
    Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer");

+    memory_tracker = std::make_unique<MemoryTracker>(tracker);
+
    // Ensure the first slot is used for the null buffer
    const auto null_id =
        slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16);
@ -129,22 +132,27 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s

 BufferCache::~BufferCache() = default;

-void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
-    const bool is_tracked = IsRegionRegistered(device_addr, size);
-    if (is_tracked) {
-        // Mark the page as CPU modified to stop tracking writes.
-        memory_tracker.MarkRegionAsCpuModified(device_addr, size);
-
-        if (unmap) {
-            return;
-        }
+void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { // Marks a registered region CPU modified.
+    if (!IsRegionRegistered(device_addr, size)) { // Unregistered regions are left untouched.
+        return;
    }
+    if (Config::readbacks() && memory_tracker->IsRegionGpuModified(device_addr, size)) { // Opt-in path.
+        ReadMemory(device_addr, size); // Download GPU-modified data before the region is re-marked.
+    }
+    memory_tracker->MarkRegionAsCpuModified(device_addr, size);
+}
+
+void BufferCache::ReadMemory(VAddr device_addr, u64 size) { // Downloads the region via a liverpool command.
+    liverpool->SendCommand<true>([this, device_addr, size] { // <true>: wait until the command has run.
+        Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
+        DownloadBufferMemory(buffer, device_addr, size);
+    });
 }

 void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
    boost::container::small_vector<vk::BufferCopy, 1> copies;
    u64 total_size_bytes = 0;
-    memory_tracker.ForEachDownloadRange<true>(
+    memory_tracker->ForEachDownloadRange<false>(
        device_addr, size, [&](u64 device_addr_out, u64 range_size) {
            const VAddr buffer_addr = buffer.CpuAddr();
            const auto add_download = [&](VAddr start, VAddr end) {
@ -155,7 +163,10 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
                    .dstOffset = total_size_bytes,
                    .size = new_size,
                });
-                total_size_bytes += new_size;
+                // Align up to avoid cache conflicts
+                constexpr u64 align = 64ULL;
+                constexpr u64 mask = ~(align - 1ULL);
+                total_size_bytes += (new_size + align - 1) & mask;
            };
            gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download);
            gpu_modified_ranges.Subtract(device_addr_out, range_size);
@ -173,11 +184,14 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
    const auto cmdbuf = scheduler.CommandBuffer();
    cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
    scheduler.Finish();
+    auto* memory = Core::Memory::Instance();
    for (const auto& copy : copies) {
        const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
        const u64 dst_offset = copy.dstOffset - offset;
-        std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
+        memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
+                                copy.size);
    }
+    memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
 }

 void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
@ -296,9 +310,11 @@ void BufferCache::BindIndexBuffer(u32 index_offset) {

 void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
    ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
-    if (!is_gds && !IsRegionGpuModified(address, num_bytes)) {
-        memcpy(std::bit_cast<void*>(address), value, num_bytes);
-        return;
+    if (!is_gds) {
+        ASSERT(memory->TryWriteBacking(std::bit_cast<void*>(address), value, num_bytes));
+        if (!IsRegionRegistered(address, num_bytes)) {
+            return;
+        }
    }
    Buffer* buffer = [&] {
        if (is_gds) {
@ -326,25 +342,108 @@ void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, boo
    WriteDataBuffer(*buffer, address, value, num_bytes);
 }

+void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) { // *_gds selects gds_buffer.
+    if (!dst_gds && !IsRegionGpuModified(dst, num_bytes)) {
+        if (!src_gds && !IsRegionGpuModified(src, num_bytes)) {
+            // Both buffers were not transferred to GPU yet. Can safely copy in host memory.
+            memcpy(std::bit_cast<void*>(dst), std::bit_cast<void*>(src), num_bytes); // NOTE(review): confirm no overlap.
+            return;
+        }
+        // Without a readback there's nothing we can do with this
+        // Fallback to creating dst buffer on GPU to at least have this data there
+    }
+    auto& src_buffer = [&] -> const Buffer& { // NOTE(review): omitting () here is C++23 syntax - confirm toolchain.
+        if (src_gds) {
+            return gds_buffer;
+        }
+        // Avoid using ObtainBuffer here as that might give us the stream buffer.
+        const BufferId buffer_id = FindBuffer(src, num_bytes);
+        auto& buffer = slot_buffers[buffer_id];
+        SynchronizeBuffer(buffer, src, num_bytes, false);
+        return buffer;
+    }();
+    auto& dst_buffer = [&] -> const Buffer& {
+        if (dst_gds) {
+            return gds_buffer;
+        }
+        // Prefer using ObtainBuffer here as that will auto-mark the region as GPU modified.
+        const auto [buffer, offset] = ObtainBuffer(dst, num_bytes, true);
+        return *buffer;
+    }();
+    vk::BufferCopy region{
+        .srcOffset = src_buffer.Offset(src),
+        .dstOffset = dst_buffer.Offset(dst),
+        .size = num_bytes,
+    };
+    const vk::BufferMemoryBarrier2 buf_barriers_before[2] = { // [0] dst: read -> transfer write, [1] src: write -> transfer read.
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+            .buffer = dst_buffer.Handle(),
+            .offset = dst_buffer.Offset(dst),
+            .size = num_bytes,
+        },
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eTransferRead,
+            .buffer = src_buffer.Handle(),
+            .offset = src_buffer.Offset(src),
+            .size = num_bytes,
+        },
+    };
+    scheduler.EndRendering(); // NOTE(review): presumably because the copy must be recorded outside a render pass - confirm.
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = buf_barriers_before,
+    });
+    cmdbuf.copyBuffer(src_buffer.Handle(), dst_buffer.Handle(), region);
+    const vk::BufferMemoryBarrier2 buf_barriers_after[2] = { // [0] dst: transfer write -> read, [1] src: transfer read -> write.
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
+            .buffer = dst_buffer.Handle(),
+            .offset = dst_buffer.Offset(dst),
+            .size = num_bytes,
+        },
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eTransferRead,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eMemoryWrite,
+            .buffer = src_buffer.Handle(),
+            .offset = src_buffer.Offset(src),
+            .size = num_bytes,
+        },
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = buf_barriers_after,
+    });
+}
+
 std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
                                                  bool is_texel_buffer, BufferId buffer_id) {
-    // For small uniform buffers that have not been modified by gpu
-    // use device local stream buffer to reduce renderpass breaks.
-    // Maybe we want to modify the threshold now that the page size is 16KB?
-    static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
-    const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
-    if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
+    // For read-only buffers use device local stream buffer to reduce renderpass breaks.
+    if (!is_written && size <= CACHING_PAGESIZE && !IsRegionGpuModified(device_addr, size)) {
        const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
        return {&stream_buffer, offset};
    }
-
-    if (!buffer_id || slot_buffers[buffer_id].is_deleted) {
+    if (IsBufferInvalid(buffer_id)) {
        buffer_id = FindBuffer(device_addr, size);
    }
    Buffer& buffer = slot_buffers[buffer_id];
    SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
    if (is_written) {
-        memory_tracker.MarkRegionAsGpuModified(device_addr, size);
+        memory_tracker->MarkRegionAsGpuModified(device_addr, size);
        gpu_modified_ranges.Add(device_addr, size);
    }
    return {&buffer, buffer.Offset(device_addr)};
@ -352,21 +451,17 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b

 std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) {
    // Check if any buffer contains the full requested range.
-    const u64 page = gpu_addr >> CACHING_PAGEBITS;
-    const BufferId buffer_id = page_table[page].buffer_id;
+    const BufferId buffer_id = page_table[gpu_addr >> CACHING_PAGEBITS].buffer_id;
    if (buffer_id) {
-        Buffer& buffer = slot_buffers[buffer_id];
-        if (buffer.IsInBounds(gpu_addr, size)) {
+        if (Buffer& buffer = slot_buffers[buffer_id]; buffer.IsInBounds(gpu_addr, size)) {
            SynchronizeBuffer(buffer, gpu_addr, size, false);
            return {&buffer, buffer.Offset(gpu_addr)};
        }
    }
-    // If no buffer contains the full requested range but some buffer within was GPU-modified,
-    // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
-    if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+    // If some buffer within was GPU modified create a full buffer to avoid losing GPU data.
+    if (IsRegionGpuModified(gpu_addr, size)) {
        return ObtainBuffer(gpu_addr, size, false, false);
    }
-
    // In all other cases, just do a CPU copy to the staging buffer.
    const auto [data, offset] = staging_buffer.Map(size, 16);
    memory->CopySparseMemory(gpu_addr, data, size);
@ -380,11 +475,11 @@ bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
 }

 bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionCpuModified(addr, size);
+    return memory_tracker->IsRegionCpuModified(addr, size);
 }

 bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionGpuModified(addr, size);
+    return memory_tracker->IsRegionGpuModified(addr, size);
 }

 BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
@ -723,7 +818,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
    boost::container::small_vector<vk::BufferCopy, 4> copies;
    u64 total_size_bytes = 0;
    VAddr buffer_start = buffer.CpuAddr();
-    memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
+    memory_tracker->ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
        copies.push_back(vk::BufferCopy{
            .srcOffset = total_size_bytes,
            .dstOffset = device_addr_out - buffer_start,
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@ -9,7 +9,6 @@
 #include "common/slot_vector.h"
 #include "common/types.h"
 #include "video_core/buffer_cache/buffer.h"
-#include "video_core/buffer_cache/memory_tracker.h"
 #include "video_core/buffer_cache/range_set.h"
 #include "video_core/multi_level_page_table.h"

@ -21,13 +20,6 @@ namespace Core {
 class MemoryManager;
 }

-namespace Shader {
-namespace Gcn {
-struct FetchShaderData;
-}
-struct Info;
-} // namespace Shader
-
 namespace Vulkan {
 class GraphicsPipeline;
 }
@ -39,6 +31,8 @@ using BufferId = Common::SlotId;
 static constexpr BufferId NULL_BUFFER_ID{0};

 class TextureCache;
+class MemoryTracker;
+class PageManager;

 class BufferCache {
 public:
@ -69,10 +63,16 @@ public:
        bool has_stream_leap = false;
    };

+    using IntervalSet =
+        boost::icl::interval_set<VAddr, std::less,
+                                 ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
+                                 RangeSetsAllocator>;
+    using IntervalType = typename IntervalSet::interval_type;
+
 public:
    explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
-                         Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool,
-                         TextureCache& texture_cache, PageManager& tracker);
+                         AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
+                         PageManager& tracker);
    ~BufferCache();

    /// Returns a pointer to GDS device local buffer.
@ -110,7 +110,10 @@ public:
    }

    /// Invalidates any buffer in the logical page range.
-    void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
+    void InvalidateMemory(VAddr device_addr, u64 size);
+
+    /// Waits on pending downloads in the logical page range.
+    void ReadMemory(VAddr device_addr, u64 size);

    /// Binds host vertex buffers for the current draw.
    void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@ -124,6 +127,9 @@ public:
    /// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data)
    void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);

+    /// Performs buffer to buffer data copy on the GPU.
+    void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
+
    /// Obtains a buffer for the specified region.
    [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
                                                       bool is_texel_buffer = false,
@ -166,6 +172,10 @@ private:
                                     });
    }

+    /// Returns true when buffer_id evaluates to false (null id) or the buffer stored in its
+    /// slot is flagged as deleted.
+    inline bool IsBufferInvalid(BufferId buffer_id) const {
+        return !buffer_id || slot_buffers[buffer_id].is_deleted;
+    }
+
    void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);

    [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@ -193,11 +203,10 @@ private:

    const Vulkan::Instance& instance;
    Vulkan::Scheduler& scheduler;
-    Vulkan::Rasterizer& rasterizer;
    AmdGpu::Liverpool* liverpool;
    Core::MemoryManager* memory;
    TextureCache& texture_cache;
-    PageManager& tracker;
+    std::unique_ptr<MemoryTracker> memory_tracker;
    StreamBuffer staging_buffer;
    StreamBuffer stream_buffer;
    StreamBuffer download_buffer;
@ -209,7 +218,6 @@ private:
    Common::SlotVector<Buffer> slot_buffers;
    RangeSet gpu_modified_ranges;
    SplitRangeMap<BufferId> buffer_ranges;
-    MemoryTracker memory_tracker;
    PageTable page_table;
    vk::UniqueDescriptorSetLayout fault_process_desc_layout;
    vk::UniquePipeline fault_process_pipeline;
--- a/src/video_core/buffer_cache/memory_tracker.h
+++ b/src/video_core/buffer_cache/memory_tracker.h
@ -57,6 +57,14 @@ public:
                            });
    }

+    /// Clears the GPU-modified state for the given range: every region manager visited by
+    /// IteratePages gets ChangeRegionState<Type::GPU, false> applied to its sub-range.
+    void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
+        IteratePages<false>(dirty_cpu_addr, query_size,
+                            [](RegionManager* manager, u64 offset, size_t size) {
+                                manager->template ChangeRegionState<Type::GPU, false>(
+                                    manager->GetCpuAddr() + offset, size);
+                            });
+    }
+
    /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
        IteratePages<true>(query_cpu_range, query_size,
--- a/src/video_core/buffer_cache/region_definitions.h
+++ b/src/video_core/buffer_cache/region_definitions.h
@ -3,7 +3,6 @@

 #pragma once

-#include <array>
 #include "common/bit_array.h"
 #include "common/types.h"

@ -20,7 +19,6 @@ constexpr u64 NUM_PAGES_PER_REGION = TRACKER_HIGHER_PAGE_SIZE / TRACKER_BYTES_PE
 enum class Type {
    CPU,
    GPU,
-    Writeable,
 };

 using RegionBits = Common::BitArray<NUM_PAGES_PER_REGION>;
--- a/src/video_core/buffer_cache/region_manager.h
+++ b/src/video_core/buffer_cache/region_manager.h
@ -4,7 +4,7 @@
 #pragma once

 #include <mutex>
-#include <utility>
+#include "common/config.h"
 #include "common/div_ceil.h"

 #ifdef __linux__
@ -20,7 +20,7 @@
 namespace VideoCore {

 /**
- * Allows tracking CPU and GPU modification of pages in a contigious 4MB virtual address region.
+ * Allows tracking CPU and GPU modification of pages in a contiguous 16MB virtual address region.
 * Information is stored in bitsets for spacial locality and fast update of single pages.
 */
 class RegionManager {
@ -30,6 +30,7 @@ public:
        cpu.Fill();
        gpu.Clear();
        writeable.Fill();
+        readable.Fill();
    }
    explicit RegionManager() = default;

@ -47,29 +48,19 @@ public:

    template <Type type>
    RegionBits& GetRegionBits() noexcept {
-        static_assert(type != Type::Writeable);
        if constexpr (type == Type::CPU) {
            return cpu;
        } else if constexpr (type == Type::GPU) {
            return gpu;
-        } else if constexpr (type == Type::Writeable) {
-            return writeable;
-        } else {
-            static_assert(false, "Invalid type");
        }
    }

    template <Type type>
    const RegionBits& GetRegionBits() const noexcept {
-        static_assert(type != Type::Writeable);
        if constexpr (type == Type::CPU) {
            return cpu;
        } else if constexpr (type == Type::GPU) {
            return gpu;
-        } else if constexpr (type == Type::Writeable) {
-            return writeable;
-        } else {
-            static_assert(false, "Invalid type");
        }
    }

@ -90,7 +81,6 @@ public:
            return;
        }
        std::scoped_lock lk{lock};
-        static_assert(type != Type::Writeable);

        RegionBits& bits = GetRegionBits<type>();
        if constexpr (enable) {
@ -99,7 +89,9 @@ public:
            bits.UnsetRange(start_page, end_page);
        }
        if constexpr (type == Type::CPU) {
-            UpdateProtection<!enable>();
+            UpdateProtection<!enable, false>();
+        } else if (Config::readbacks()) {
+            UpdateProtection<enable, true>();
        }
    }

@ -122,16 +114,10 @@ public:
            return;
        }
        std::scoped_lock lk{lock};
-        static_assert(type != Type::Writeable);

        RegionBits& bits = GetRegionBits<type>();
        RegionBits mask(bits, start_page, end_page);

-        // TODO: this will not be needed once we handle readbacks
-        if constexpr (type == Type::GPU) {
-            mask &= ~writeable;
-        }
-
        for (const auto& [start, end] : mask) {
            func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
        }
@ -139,7 +125,9 @@ public:
        if constexpr (clear) {
            bits.UnsetRange(start_page, end_page);
            if constexpr (type == Type::CPU) {
-                UpdateProtection<true>();
+                UpdateProtection<true, false>();
+            } else if (Config::readbacks()) {
+                UpdateProtection<false, true>();
            }
        }
    }
@ -151,7 +139,7 @@ public:
     * @param size   Size in bytes of the region to query for modifications
     */
    template <Type type>
-    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
+    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) noexcept {
        RENDERER_TRACE;
        const size_t start_page = SanitizeAddress(offset) / TRACKER_BYTES_PER_PAGE;
        const size_t end_page =
@ -159,17 +147,10 @@ public:
        if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
            return false;
        }
-        // std::scoped_lock lk{lock}; // Is this needed?
-        static_assert(type != Type::Writeable);
+        std::scoped_lock lk{lock};

        const RegionBits& bits = GetRegionBits<type>();
        RegionBits test(bits, start_page, end_page);
-
-        // TODO: this will not be needed once we handle readbacks
-        if constexpr (type == Type::GPU) {
-            test &= ~writeable;
-        }
-
        return test.Any();
    }

@ -181,19 +162,21 @@ private:
     * @param current_bits Current state of the word
     * @param new_bits     New state of the word
     *
-     * @tparam add_to_tracker True when the tracker should start tracking the new pages
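+     * @tparam track   True when the tracker should start tracking the new pages
+     * @tparam is_read True to update read protection (readable bits, synced to ~gpu),
+     *                 false to update write protection (writeable bits, synced to cpu)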
     */
-    template <bool add_to_tracker>
+    template <bool track, bool is_read>
    void UpdateProtection() {
        RENDERER_TRACE;
-        RegionBits mask = cpu ^ writeable;
-
+        RegionBits mask = is_read ? (~gpu ^ readable) : (cpu ^ writeable);
        if (mask.None()) {
-            return; // No changes to the CPU tracking state
+            return;
        }
-
-        writeable = cpu;
-        tracker->UpdatePageWatchersForRegion<add_to_tracker>(cpu_addr, mask);
+        if constexpr (is_read) {
+            readable = ~gpu;
+        } else {
+            writeable = cpu;
+        }
+        tracker->UpdatePageWatchersForRegion<track, is_read>(cpu_addr, mask);
    }

 #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
@ -206,6 +189,7 @@ private:
    RegionBits cpu;
    RegionBits gpu;
    RegionBits writeable;
+    RegionBits readable;
 };

 } // namespace VideoCore
--- a/src/video_core/page_manager.cpp
+++ b/src/video_core/page_manager.cpp
@ -13,6 +13,7 @@

 #ifndef _WIN64
 #include <sys/mman.h>
+#include "common/adaptive_mutex.h"
 #ifdef ENABLE_USERFAULTFD
 #include <thread>
 #include <fcntl.h>
@ -23,6 +24,7 @@
 #endif
 #else
 #include <windows.h>
+#include "common/spin_lock.h"
 #endif

 #ifdef __linux__
@ -38,22 +40,45 @@ constexpr size_t PAGE_BITS = 12;

 struct PageManager::Impl {
    struct PageState {
-        u8 num_watchers{};
+        u8 num_write_watchers : 7;
+        // At the moment only buffer cache can request read watchers.
+        // And buffers cannot overlap, thus only 1 can exist per page.
+        u8 num_read_watchers : 1;

-        Core::MemoryPermission Perm() const noexcept {
-            return num_watchers == 0 ? Core::MemoryPermission::ReadWrite
-                                     : Core::MemoryPermission::Read;
+        Core::MemoryPermission WritePerm() const noexcept {
+            return num_write_watchers == 0 ? Core::MemoryPermission::Write
+                                           : Core::MemoryPermission::None;
        }

-        template <s32 delta>
+        Core::MemoryPermission ReadPerm() const noexcept {
+            return num_read_watchers == 0 ? Core::MemoryPermission::Read
+                                          : Core::MemoryPermission::None;
+        }
+
+        Core::MemoryPermission Perms() const noexcept {
+            return ReadPerm() | WritePerm();
+        }
+
+        /// Applies delta (+1, -1 or 0) to the read watcher count when is_read is true,
+        /// otherwise to the write watcher count, and returns the resulting count.
+        /// Every branch is selected at compile time, so a delta of 0 only reads the counter.
+        template <s32 delta, bool is_read>
         u8 AddDelta() {
-            if constexpr (delta == 1) {
-                return ++num_watchers;
-            } else if constexpr (delta == -1) {
-                ASSERT_MSG(num_watchers > 0, "Not enough watchers");
-                return --num_watchers;
+            if constexpr (is_read) {
+                if constexpr (delta == 1) {
+                    return ++num_read_watchers;
+                } else if constexpr (delta == -1) {
+                    ASSERT_MSG(num_read_watchers > 0, "Not enough watchers");
+                    return --num_read_watchers;
+                } else {
+                    return num_read_watchers;
+                }
            } else {
-                return num_watchers;
+                if constexpr (delta == 1) {
+                    return ++num_write_watchers;
+                } else if constexpr (delta == -1) {
+                    ASSERT_MSG(num_write_watchers > 0, "Not enough watchers");
+                    return --num_write_watchers;
+                } else {
+                    return num_write_watchers;
+                }
            }
        }
    };
@ -176,6 +201,7 @@ struct PageManager::Impl {
        RENDERER_TRACE;
        auto* memory = Core::Memory::Instance();
        auto& impl = memory->GetAddressSpace();
+        // ASSERT(perms != Core::MemoryPermission::Write);
        impl.Protect(address, size, perms);
    }

@ -183,12 +209,14 @@ struct PageManager::Impl {
        const auto addr = reinterpret_cast<VAddr>(fault_address);
        if (Common::IsWriteError(context)) {
            return rasterizer->InvalidateMemory(addr, 1);
+        } else {
+            return rasterizer->ReadMemory(addr, 1);
        }
        return false;
    }
-
 #endif
-    template <bool track>
+
+    template <bool track, bool is_read>
    void UpdatePageWatchers(VAddr addr, u64 size) {
        RENDERER_TRACE;

@ -200,7 +228,7 @@ struct PageManager::Impl {
        const auto lock_end = locks.begin() + Common::DivCeil(page_end, PAGES_PER_LOCK);
        Common::RangeLockGuard lk(lock_start, lock_end);

-        auto perms = cached_pages[page].Perm();
+        auto perms = cached_pages[page].Perms();
        u64 range_begin = 0;
        u64 range_bytes = 0;
        u64 potential_range_bytes = 0;
@ -226,9 +254,9 @@ struct PageManager::Impl {
            PageState& state = cached_pages[page];

            // Apply the change to the page state
-            const u8 new_count = state.AddDelta<track ? 1 : -1>();
+            const u8 new_count = state.AddDelta<track ? 1 : -1, is_read>();

-            if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
+            if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] {
                // If the protection changed add pending (un)protect action
                release_pending();
                perms = new_perms;
@ -253,25 +281,23 @@ struct PageManager::Impl {
        release_pending();
    }

-    template <bool track>
+    template <bool track, bool is_read>
    void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) {
        RENDERER_TRACE;
        auto start_range = mask.FirstRange();
        auto end_range = mask.LastRange();

        if (start_range.second == end_range.second) {
-            // Optimization: if all pages are contiguous, use the regular UpdatePageWatchers
+            // If all pages are contiguous, use the regular UpdatePageWatchers
            const VAddr start_addr = base_addr + (start_range.first << PAGE_BITS);
            const u64 size = (start_range.second - start_range.first) << PAGE_BITS;
-
-            UpdatePageWatchers<track>(start_addr, size);
-            return;
+            return UpdatePageWatchers<track, is_read>(start_addr, size);
        }

        size_t base_page = (base_addr >> PAGE_BITS);
        ASSERT(base_page % PAGES_PER_LOCK == 0);
        std::scoped_lock lk(locks[base_page / PAGES_PER_LOCK]);
-        auto perms = cached_pages[base_page + start_range.first].Perm();
+        auto perms = cached_pages[base_page + start_range.first].Perms();
        u64 range_begin = 0;
        u64 range_bytes = 0;
        u64 potential_range_bytes = 0;
@ -292,9 +318,10 @@ struct PageManager::Impl {
            const bool update = mask.Get(page);

            // Apply the change to the page state
-            const u8 new_count = update ? state.AddDelta<track ? 1 : -1>() : state.AddDelta<0>();
+            const u8 new_count =
+                update ? state.AddDelta<track ? 1 : -1, is_read>() : state.AddDelta<0, is_read>();

-            if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
+            if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] {
                // If the protection changed add pending (un)protect action
                release_pending();
                perms = new_perms;
@ -348,19 +375,23 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) {

 template <bool track>
 void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const {
-    impl->UpdatePageWatchers<track>(addr, size);
+    impl->UpdatePageWatchers<track, false>(addr, size);
 }

-template <bool track>
+template <bool track, bool is_read>
 void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const {
-    impl->UpdatePageWatchersForRegion<track>(base_addr, mask);
+    impl->UpdatePageWatchersForRegion<track, is_read>(base_addr, mask);
 }

 template void PageManager::UpdatePageWatchers<true>(VAddr addr, u64 size) const;
 template void PageManager::UpdatePageWatchers<false>(VAddr addr, u64 size) const;
-template void PageManager::UpdatePageWatchersForRegion<true>(VAddr base_addr,
-                                                             RegionBits& mask) const;
-template void PageManager::UpdatePageWatchersForRegion<false>(VAddr base_addr,
-                                                              RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<true, true>(VAddr base_addr,
+                                                                   RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<true, false>(VAddr base_addr,
+                                                                    RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<false, true>(VAddr base_addr,
+                                                                    RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<false, false>(VAddr base_addr,
+                                                                     RegionBits& mask) const;

 } // namespace VideoCore
--- a/src/video_core/page_manager.h
+++ b/src/video_core/page_manager.h
@ -37,9 +37,8 @@ public:
    template <bool track>
    void UpdatePageWatchers(VAddr addr, u64 size) const;

-    /// Updates watches in the pages touching the specified region
-    /// using a mask.
-    template <bool track>
+    /// Updates watches in the pages touching the specified region using a mask.
+    template <bool track, bool is_read = false>
    void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const;

    /// Returns page aligned address.
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
 Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
                       AmdGpu::Liverpool* liverpool_)
    : instance{instance_}, scheduler{scheduler_}, page_manager{this},
-      buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager},
+      buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
      texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
      memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
    if (!Config::nullGpu()) {
@ -945,6 +945,10 @@ void Rasterizer::InlineData(VAddr address, const void* value, u32 num_bytes, boo
    buffer_cache.InlineData(address, value, num_bytes, is_gds);
 }

+void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
+    buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds);
+}
+
 u32 Rasterizer::ReadDataFromGds(u32 gds_offset) {
    auto* gds_buf = buffer_cache.GetGdsBuffer();
    u32 value;
@ -957,11 +961,20 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
        // Not GPU mapped memory, can skip invalidation logic entirely.
        return false;
    }
-    buffer_cache.InvalidateMemory(addr, size, false);
+    buffer_cache.InvalidateMemory(addr, size);
    texture_cache.InvalidateMemory(addr, size);
    return true;
 }

+/// Forwards a read of [addr, addr + size) to the buffer cache.
+/// Returns false when the range is not GPU mapped, true once the buffer cache has been called.
+bool Rasterizer::ReadMemory(VAddr addr, u64 size) {
+    if (!IsMapped(addr, size)) {
+        // Not GPU mapped memory, can skip the buffer cache read entirely.
+        return false;
+    }
+    buffer_cache.ReadMemory(addr, size);
+    return true;
+}
+
 bool Rasterizer::IsMapped(VAddr addr, u64 size) {
    if (size == 0) {
        // There is no memory, so not mapped.
@ -982,7 +995,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) {
 }

 void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
-    buffer_cache.InvalidateMemory(addr, size, true);
+    buffer_cache.InvalidateMemory(addr, size);
    texture_cache.UnmapMemory(addr, size);
    page_manager.OnGpuUnmap(addr, size);
    {
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@ -56,8 +56,10 @@ public:
                                 bool from_guest = false);

    void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
+    void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
    u32 ReadDataFromGds(u32 gsd_offset);
    bool InvalidateMemory(VAddr addr, u64 size);
+    bool ReadMemory(VAddr addr, u64 size);
    bool IsMapped(VAddr addr, u64 size);
    void MapMemory(VAddr addr, u64 size);
    void UnmapMemory(VAddr addr, u64 size);