diff --git a/src/common/config.cpp b/src/common/config.cpp
index 9c316949a..0b5f11200 100644
--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@@ -51,6 +51,7 @@ static bool isShowSplash = false;
 static std::string isSideTrophy = "right";
 static bool isNullGpu = false;
 static bool shouldCopyGPUBuffers = false;
+static bool readbacksEnabled = false;
 static bool shouldDumpShaders = false;
 static bool shouldPatchShaders = true;
 static u32 vblankDivider = 1;
@@ -240,6 +241,10 @@ bool copyGPUCmdBuffers() {
     return shouldCopyGPUBuffers;
 }
 
+bool readbacks() {
+    return readbacksEnabled;
+}
+
 bool dumpShaders() {
     return shouldDumpShaders;
 }
@@ -344,6 +349,10 @@ void setCopyGPUCmdBuffers(bool enable) {
     shouldCopyGPUBuffers = enable;
 }
 
+void setReadbacks(bool enable) {
+    readbacksEnabled = enable;
+}
+
 void setDumpShaders(bool enable) {
     shouldDumpShaders = enable;
 }
@@ -586,6 +595,7 @@ void load(const std::filesystem::path& path) {
     screenHeight = toml::find_or(gpu, "screenHeight", screenHeight);
     isNullGpu = toml::find_or(gpu, "nullGpu", false);
     shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", false);
+    readbacksEnabled = toml::find_or(gpu, "readbacks", false);
     shouldDumpShaders = toml::find_or(gpu, "dumpShaders", false);
     shouldPatchShaders = toml::find_or(gpu, "patchShaders", true);
     vblankDivider = toml::find_or(gpu, "vblankDivider", 1);
@@ -735,6 +745,7 @@ void save(const std::filesystem::path& path) {
     data["GPU"]["screenHeight"] = screenHeight;
     data["GPU"]["nullGpu"] = isNullGpu;
     data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers;
+    data["GPU"]["readbacks"] = readbacksEnabled;
     data["GPU"]["dumpShaders"] = shouldDumpShaders;
     data["GPU"]["patchShaders"] = shouldPatchShaders;
     data["GPU"]["vblankDivider"] = vblankDivider;
diff --git a/src/common/config.h b/src/common/config.h
index 38114983f..219461e7e 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -45,6 +45,8 @@ bool nullGpu();
 void setNullGpu(bool enable);
 bool copyGPUCmdBuffers();
 void setCopyGPUCmdBuffers(bool enable);
+bool readbacks();
+void setReadbacks(bool enable);
 bool dumpShaders();
 void setDumpShaders(bool enable);
 u32 vblankDiv();
diff --git a/src/core/address_space.cpp b/src/core/address_space.cpp
index 2e66bdf83..2e29f70ee 100644
--- a/src/core/address_space.cpp
+++ b/src/core/address_space.cpp
@@ -302,14 +302,15 @@ struct AddressSpace::Impl {
             new_flags = PAGE_READWRITE;
         } else if (read && !write) {
             new_flags = PAGE_READONLY;
-        } else if (execute && !read && not write) {
+        } else if (execute && !read && !write) {
             new_flags = PAGE_EXECUTE;
         } else if (!read && !write && !execute) {
             new_flags = PAGE_NOACCESS;
         } else {
             LOG_CRITICAL(Common_Memory,
-                         "Unsupported protection flag combination for address {:#x}, size {}",
-                         virtual_addr, size);
+                         "Unsupported protection flag combination for address {:#x}, size {}, "
+                         "read={}, write={}, execute={}",
+                         virtual_addr, size, read, write, execute);
             return;
         }
diff --git a/src/core/address_space.h b/src/core/address_space.h
index d7f3efc75..85b4c36ac 100644
--- a/src/core/address_space.h
+++ b/src/core/address_space.h
@@ -11,6 +11,7 @@ namespace Core {
 
 enum class MemoryPermission : u32 {
+    None = 0,
     Read = 1 << 0,
     Write = 1 << 1,
    ReadWrite = Read | Write,
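The new `None = 0` entry makes `MemoryPermission` usable as a genuine bit-flag set: the read and write contributions of a page can be computed independently and OR-ed together, which is exactly what the page manager's `Perms()` does further down. A self-contained sketch of that composition — the `operator|` here is illustrative, standing in for the project's own enum-flag helpers:

```cpp
#include <cstdint>
#include <cstdio>

// Sketch only: the real project generates these operators with its own
// helper macros; they are written out by hand here for self-containment.
enum class MemoryPermission : uint32_t {
    None = 0,
    Read = 1 << 0,
    Write = 1 << 1,
    ReadWrite = Read | Write, // legal: with a fixed underlying type, the
                              // enumerators have that type inside the body
};

constexpr MemoryPermission operator|(MemoryPermission a, MemoryPermission b) {
    return static_cast<MemoryPermission>(static_cast<uint32_t>(a) |
                                         static_cast<uint32_t>(b));
}

int main() {
    // A watched direction contributes None instead of its bit, so OR-ing the
    // per-direction results yields the effective host protection for the page.
    const bool write_watched = true;
    const bool read_watched = false;
    const MemoryPermission perms =
        (read_watched ? MemoryPermission::None : MemoryPermission::Read) |
        (write_watched ? MemoryPermission::None : MemoryPermission::Write);
    std::printf("effective perms: %u\n", static_cast<unsigned>(perms)); // 1 == Read
    return 0;
}
```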
diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp
index 9cf340050..8c3ab1612 100644
--- a/src/core/libraries/gnmdriver/gnmdriver.cpp
+++ b/src/core/libraries/gnmdriver/gnmdriver.cpp
@@ -2834,7 +2834,7 @@ void RegisterlibSceGnmDriver(Core::Loader::SymbolsResolver* sym) {
     }
 
     if (Config::copyGPUCmdBuffers()) {
-        liverpool->reserveCopyBufferSpace();
+        liverpool->ReserveCopyBufferSpace();
     }
 
     Platform::IrqC::Instance()->Register(Platform::InterruptId::GpuIdle, ResetSubmissionLock,
diff --git a/src/emulator.cpp b/src/emulator.cpp
index 99fd50af5..d6d523fa0 100644
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@@ -132,6 +132,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector<std::string> ar
     LOG_INFO(Config, "General LogType: {}", Config::getLogType());
     LOG_INFO(Config, "General isNeo: {}", Config::isNeoModeConsole());
     LOG_INFO(Config, "GPU isNullGpu: {}", Config::nullGpu());
+    LOG_INFO(Config, "GPU readbacks: {}", Config::readbacks());
     LOG_INFO(Config, "GPU shouldDumpShaders: {}", Config::dumpShaders());
     LOG_INFO(Config, "GPU vblankDivider: {}", Config::vblankDiv());
     LOG_INFO(Config, "Vulkan gpuId: {}", Config::getGpuId());
diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index 464f02e3a..9b8c28b66 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -72,8 +72,23 @@ Liverpool::~Liverpool() {
     process_thread.join();
 }
 
+void Liverpool::ProcessCommands() {
+    // Process incoming commands with high priority
+    while (num_commands) {
+        Common::UniqueFunction<void> callback{};
+        {
+            std::scoped_lock lk{submit_mutex};
+            callback = std::move(command_queue.front());
+            command_queue.pop();
+            --num_commands;
+        }
+        callback();
+    }
+}
+
 void Liverpool::Process(std::stop_token stoken) {
     Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
+    gpu_id = std::this_thread::get_id();
 
     while (!stoken.stop_requested()) {
         {
@@ -90,18 +105,7 @@ void Liverpool::Process(std::stop_token stoken) {
         curr_qid = -1;
 
         while (num_submits || num_commands) {
-
-            // Process incoming commands with high priority
-            while (num_commands) {
-                Common::UniqueFunction<void> callback{};
-                {
-                    std::unique_lock lk{submit_mutex};
-                    callback = std::move(command_queue.front());
-                    command_queue.pop();
-                    --num_commands;
-                }
-                callback();
-            }
+            ProcessCommands();
 
             curr_qid = (curr_qid + 1) % num_mapped_queues;
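`ProcessCommands()` above drains the queue with the usual pop-then-run shape: the submit mutex is held only while one callback is moved out, never while it executes, so a callback can itself enqueue work or block without deadlocking the GPU thread. A minimal sketch of the same pattern with standard-library types (all names are illustrative):

```cpp
#include <atomic>
#include <functional>
#include <mutex>
#include <queue>

class CommandQueue {
public:
    void Push(std::function<void()> func) {
        std::scoped_lock lk{mutex};
        queue.push(std::move(func));
        ++num_commands;
    }

    void Drain() {
        while (num_commands > 0) {
            std::function<void()> callback;
            {
                // Lock held only long enough to move the callback out.
                std::scoped_lock lk{mutex};
                callback = std::move(queue.front());
                queue.pop();
                --num_commands;
            }
            // Running outside the lock means the callback may push new
            // commands (or take other locks) without deadlocking.
            callback();
        }
    }

private:
    std::mutex mutex;
    std::queue<std::function<void()>> queue;
    std::atomic<int> num_commands{0}; // mirrors Liverpool's num_commands counter
};
```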
@@ -147,6 +151,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
     FIBER_ENTER(ccb_task_name);
     while (!ccb.empty()) {
+        ProcessCommands();
+
         const auto* header = reinterpret_cast<const PM4Header*>(ccb.data());
         const u32 type = header->type;
         if (type != 3) {
@@ -224,6 +230,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
     const auto base_addr = reinterpret_cast<VAddr>(dcb.data());
     while (!dcb.empty()) {
+        ProcessCommands();
+
         const auto* header = reinterpret_cast<const PM4Header*>(dcb.data());
         const u32 type = header->type;
 
@@ -638,9 +646,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
             } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                         dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                        dma_data->dst_sel == DmaDataDst::Gds) {
-                rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress<const void*>(),
-                                       dma_data->NumBytes(), true);
+                rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
+                                       dma_data->NumBytes(), true, false);
             } else if (dma_data->src_sel == DmaDataSrc::Data &&
                        (dma_data->dst_sel == DmaDataDst::Memory ||
                         dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@@ -649,14 +656,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
             } else if (dma_data->src_sel == DmaDataSrc::Gds &&
                        (dma_data->dst_sel == DmaDataDst::Memory ||
                         dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                // LOG_WARNING(Render_Vulkan, "GDS memory read");
+                rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
+                                       dma_data->NumBytes(), false, true);
             } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                         dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                        (dma_data->dst_sel == DmaDataDst::Memory ||
                         dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
-                                       dma_data->SrcAddress<const void*>(),
-                                       dma_data->NumBytes(), false);
+                rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(),
+                                       dma_data->SrcAddress<VAddr>(), dma_data->NumBytes(),
+                                       false, false);
             } else {
                 UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
                                 u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@@ -702,6 +710,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
             break;
         }
         case PM4ItOpcode::Rewind: {
+            if (!rasterizer) {
+                break;
+            }
             const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
             while (!rewind->Valid()) {
                 YIELD_GFX();
@@ -801,29 +812,32 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<
 
 template <bool is_indirect>
-Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid) {
+Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
     FIBER_ENTER(acb_task_name[vqid]);
     auto& queue = asc_queues[{vqid}];
-    auto base_addr = reinterpret_cast<VAddr>(acb);
-    while (acb_dwords > 0) {
-        auto* header = reinterpret_cast<const PM4Header*>(acb);
+    auto base_addr = reinterpret_cast<VAddr>(acb.data());
+    while (!acb.empty()) {
+        ProcessCommands();
+
+        auto* header = reinterpret_cast<const PM4Header*>(acb.data());
         u32 next_dw_off = header->type3.NumWords() + 1;
 
         // If we have a buffered packet, use it.
         if (queue.tmp_dwords > 0) [[unlikely]] {
             header = reinterpret_cast<const PM4Header*>(queue.tmp_packet.data());
             next_dw_off = header->type3.NumWords() + 1 - queue.tmp_dwords;
-            std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb, next_dw_off * sizeof(u32));
+            std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb.data(),
+                        next_dw_off * sizeof(u32));
             queue.tmp_dwords = 0;
         }
 
         // If the packet is split across ring boundary, buffer until next submission
-        if (next_dw_off > acb_dwords) [[unlikely]] {
-            std::memcpy(queue.tmp_packet.data(), acb, acb_dwords * sizeof(u32));
-            queue.tmp_dwords = acb_dwords;
+        if (next_dw_off > acb.size()) [[unlikely]] {
+            std::memcpy(queue.tmp_packet.data(), acb.data(), acb.size_bytes());
+            queue.tmp_dwords = acb.size();
             if constexpr (!is_indirect) {
-                *queue.read_addr += acb_dwords;
+                *queue.read_addr += acb.size();
                 *queue.read_addr %= queue.ring_size_dw;
             }
             break;
@@ -832,9 +846,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
         if (header->type == 2) {
             // Type-2 packet are used for padding purposes
             next_dw_off = 1;
-            acb += next_dw_off;
-            acb_dwords -= next_dw_off;
-
+            acb = NextPacket(acb, next_dw_off);
             if constexpr (!is_indirect) {
                 *queue.read_addr += next_dw_off;
                 *queue.read_addr %= queue.ring_size_dw;
@@ -856,8 +868,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
         case PM4ItOpcode::IndirectBuffer: {
             const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
-            auto task = ProcessCompute<true>(indirect_buffer->Address<const u32*>(),
-                                             indirect_buffer->ib_size, vqid);
+            auto task = ProcessCompute<true>(
+                {indirect_buffer->Address<const u32*>(), indirect_buffer->ib_size}, vqid);
             RESUME_ASC(task, vqid);
 
             while (!task.handle.done()) {
@@ -876,8 +888,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
             } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                         dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                        dma_data->dst_sel == DmaDataDst::Gds) {
-                rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress<const void*>(),
-                                       dma_data->NumBytes(), true);
+                rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
+                                       dma_data->NumBytes(), true, false);
             } else if (dma_data->src_sel == DmaDataSrc::Data &&
                       (dma_data->dst_sel == DmaDataDst::Memory ||
                        dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@@ -886,14 +898,14 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
             } else if (dma_data->src_sel == DmaDataSrc::Gds &&
                        (dma_data->dst_sel == DmaDataDst::Memory ||
                         dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                // LOG_WARNING(Render_Vulkan, "GDS memory read");
+                rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
+                                       dma_data->NumBytes(), false, true);
             } else if ((dma_data->src_sel == DmaDataSrc::Memory ||
                         dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
                        (dma_data->dst_sel == DmaDataDst::Memory ||
                         dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
-                rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
-                                       dma_data->SrcAddress<const void*>(), dma_data->NumBytes(),
-                                       false);
+                rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->SrcAddress<VAddr>(),
+                                       dma_data->NumBytes(), false, false);
             } else {
                 UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
                                 u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@@ -904,6 +916,9 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
             break;
         }
         case PM4ItOpcode::Rewind: {
+            if (!rasterizer) {
+                break;
+            }
             const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
             while (!rewind->Valid()) {
                 YIELD_ASC(vqid);
@@ -1016,8 +1031,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
                             static_cast<u32>(opcode), header->type3.NumWords());
         }
 
-        acb += next_dw_off;
-        acb_dwords -= next_dw_off;
+        acb = NextPacket(acb, next_dw_off);
 
         if constexpr (!is_indirect) {
             *queue.read_addr += next_dw_off;
@@ -1087,7 +1101,7 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span<const u32> acb) {
     auto& queue = mapped_queues[gnm_vqid];
 
     const auto vqid = gnm_vqid - 1;
-    const auto& task = ProcessCompute(acb.data(), acb.size(), vqid);
+    const auto& task = ProcessCompute(acb, vqid);
     {
         std::scoped_lock lock{queue.m_access};
         queue.submits.emplace(task.handle);
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index d88a44375..0613823ab 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -8,6 +8,7 @@
 #include <exception>
 #include <mutex>
 #include <queue>
+#include <semaphore>
 #include <span>
 #include <thread>
 
@@ -1512,14 +1513,32 @@ public:
         rasterizer = rasterizer_;
     }
 
-    void SendCommand(Common::UniqueFunction<void>&& func) {
-        std::scoped_lock lk{submit_mutex};
-        command_queue.emplace(std::move(func));
-        ++num_commands;
-        submit_cv.notify_one();
+    template <bool wait_done = false>
+    void SendCommand(auto&& func) {
+        if (std::this_thread::get_id() == gpu_id) {
+            return func();
+        }
+        if constexpr (wait_done) {
+            std::binary_semaphore sem{0};
+            {
+                std::scoped_lock lk{submit_mutex};
+                command_queue.emplace([&sem, &func] {
+                    func();
+                    sem.release();
+                });
+                ++num_commands;
+                submit_cv.notify_one();
+            }
+            sem.acquire();
+        } else {
+            std::scoped_lock lk{submit_mutex};
+            command_queue.emplace(std::move(func));
+            ++num_commands;
+            submit_cv.notify_one();
+        }
     }
 
-    void reserveCopyBufferSpace() {
+    void ReserveCopyBufferSpace() {
         GpuQueue& gfx_queue = mapped_queues[GfxQueueId];
         std::scoped_lock lk(gfx_queue.m_access);
@@ -1581,8 +1600,9 @@ private:
     Task ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb);
     Task ProcessCeUpdate(std::span<const u32> ccb);
     template <bool is_indirect = false>
-    Task ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid);
+    Task ProcessCompute(std::span<const u32> acb, u32 vqid);
 
+    void ProcessCommands();
     void Process(std::stop_token stoken);
 
     struct GpuQueue {
@@ -1626,6 +1646,7 @@ private:
     std::mutex submit_mutex;
     std::condition_variable_any submit_cv;
     std::queue<Common::UniqueFunction<void>> command_queue{};
+    std::thread::id gpu_id;
     int curr_qid{-1};
 };
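The reworked `SendCommand` is the heart of the readback handshake: with `wait_done = true` a caller on another thread blocks on a `std::binary_semaphore` until the GPU thread has executed the function, which is also why capturing `sem` and `func` by reference is safe — the producer's stack frame outlives the callback. The executes-inline shortcut for `gpu_id` avoids self-deadlock when the GPU thread itself calls in. A reduced, runnable sketch of the waiting path, with a busy-polling worker standing in for Liverpool's condition-variable loop (names are illustrative):

```cpp
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <semaphore>
#include <thread>

std::mutex queue_mutex;
std::queue<std::function<void()>> commands;

template <bool wait_done>
void SendCommand(std::function<void()> func) {
    if constexpr (wait_done) {
        std::binary_semaphore sem{0};
        {
            std::scoped_lock lk{queue_mutex};
            // Safe to capture by reference: the producer blocks below until
            // the wrapper has run, so sem and func stay alive.
            commands.emplace([&sem, &func] {
                func();
                sem.release(); // signal the waiting producer
            });
        }
        sem.acquire(); // block until the worker ran the command
    } else {
        std::scoped_lock lk{queue_mutex};
        commands.emplace(std::move(func)); // fire and forget
    }
}

int main() {
    std::thread worker([] {
        for (;;) { // busy-poll; the real code sleeps on a condition variable
            std::function<void()> cb;
            {
                std::scoped_lock lk{queue_mutex};
                if (commands.empty()) {
                    continue;
                }
                cb = std::move(commands.front());
                commands.pop();
            }
            cb();
            return; // one command is enough for the demo
        }
    });
    SendCommand<true>([] { std::puts("ran on worker thread"); });
    std::puts("producer resumed only after completion");
    worker.join();
}
```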
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 23f9dc0bc..4a88c7ed4 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -3,12 +3,14 @@
 
 #include <algorithm>
 #include "common/alignment.h"
+#include "common/config.h"
 #include "common/debug.h"
 #include "common/scope_exit.h"
 #include "common/types.h"
 #include "core/memory.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/buffer_cache/memory_tracker.h"
 #include "video_core/host_shaders/fault_buffer_process_comp.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
@@ -27,10 +29,10 @@ static constexpr size_t DeviceBufferSize = 128_MB;
 static constexpr size_t MaxPageFaults = 1024;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
-                         Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
-                         TextureCache& texture_cache_, PageManager& tracker_)
-    : instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_},
-      memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_},
+                         AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
+                         PageManager& tracker)
+    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
+      memory{Core::Memory::Instance()}, texture_cache{texture_cache_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
       download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize},
@@ -38,13 +40,14 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
       bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags,
                            BDA_PAGETABLE_SIZE},
-      fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE),
-      memory_tracker{tracker} {
+      fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE) {
     Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
     Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(),
                           "BDA Page Table Buffer");
     Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer");
 
+    memory_tracker = std::make_unique<MemoryTracker>(tracker);
+
     // Ensure the first slot is used for the null buffer
     const auto null_id =
         slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16);
@@ -129,22 +132,27 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
 
 BufferCache::~BufferCache() = default;
 
-void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
-    const bool is_tracked = IsRegionRegistered(device_addr, size);
-    if (is_tracked) {
-        // Mark the page as CPU modified to stop tracking writes.
-        memory_tracker.MarkRegionAsCpuModified(device_addr, size);
-
-        if (unmap) {
-            return;
-        }
+void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
+    if (!IsRegionRegistered(device_addr, size)) {
+        return;
     }
+    if (Config::readbacks() && memory_tracker->IsRegionGpuModified(device_addr, size)) {
+        ReadMemory(device_addr, size);
+    }
+    memory_tracker->MarkRegionAsCpuModified(device_addr, size);
+}
+
+void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
+    liverpool->SendCommand<true>([this, device_addr, size] {
+        Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
+        DownloadBufferMemory(buffer, device_addr, size);
+    });
 }
 
 void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
     boost::container::small_vector<vk::BufferCopy, 4> copies;
     u64 total_size_bytes = 0;
-    memory_tracker.ForEachDownloadRange<true>(
+    memory_tracker->ForEachDownloadRange<true>(
         device_addr, size, [&](u64 device_addr_out, u64 range_size) {
             const VAddr buffer_addr = buffer.CpuAddr();
             const auto add_download = [&](VAddr start, VAddr end) {
@@ -155,7 +163,10 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
                     .dstOffset = total_size_bytes,
                     .size = new_size,
                 });
-                total_size_bytes += new_size;
+                // Align up to avoid cache conflicts
+                constexpr u64 align = 64ULL;
+                constexpr u64 mask = ~(align - 1ULL);
+                total_size_bytes += (new_size + align - 1) & mask;
             };
             gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download);
             gpu_modified_ranges.Subtract(device_addr_out, range_size);
@@ -173,11 +184,14 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
     const auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
     scheduler.Finish();
+    auto* memory = Core::Memory::Instance();
     for (const auto& copy : copies) {
         const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
         const u64 dst_offset = copy.dstOffset - offset;
-        std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
+        memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
+                                copy.size);
     }
+    memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
 }
 
 void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
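The `(new_size + align - 1) & ~(align - 1)` expression above is the standard round-up idiom for power-of-two alignments: the addition carries any partial remainder into the next aligned slot and the mask clears the low bits. With `align = 64`, each downloaded range starts on its own cache-line-sized slot in the staging buffer, which is the stated motivation of avoiding cache conflicts. A tiny checked sketch:

```cpp
#include <cstdint>

// Round value up to the next multiple of align (align must be a power of two).
constexpr uint64_t AlignUp(uint64_t value, uint64_t align) {
    return (value + align - 1) & ~(align - 1);
}

static_assert(AlignUp(1, 64) == 64);
static_assert(AlignUp(64, 64) == 64);  // already-aligned values are unchanged
static_assert(AlignUp(65, 64) == 128); // partial remainder carries over
static_assert(AlignUp(0, 64) == 0);

int main() {
    return 0;
}
```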
@@ -296,9 +310,11 @@ void BufferCache::BindIndexBuffer(u32 index_offset) {
 
 void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
     ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
-    if (!is_gds && !IsRegionGpuModified(address, num_bytes)) {
-        memcpy(std::bit_cast<void*>(address), value, num_bytes);
-        return;
+    if (!is_gds) {
+        ASSERT(memory->TryWriteBacking(std::bit_cast<void*>(address), value, num_bytes));
+        if (!IsRegionRegistered(address, num_bytes)) {
+            return;
+        }
     }
     Buffer* buffer = [&] {
         if (is_gds) {
@@ -326,25 +342,108 @@ void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, boo
     WriteDataBuffer(*buffer, address, value, num_bytes);
 }
 
+void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
+    if (!dst_gds && !IsRegionGpuModified(dst, num_bytes)) {
+        if (!src_gds && !IsRegionGpuModified(src, num_bytes)) {
+            // Both buffers were not transferred to GPU yet. Can safely copy in host memory.
+            memcpy(std::bit_cast<void*>(dst), std::bit_cast<void*>(src), num_bytes);
+            return;
+        }
+        // Without a readback there's nothing we can do with this
+        // Fallback to creating dst buffer on GPU to at least have this data there
+    }
+    auto& src_buffer = [&] -> const Buffer& {
+        if (src_gds) {
+            return gds_buffer;
+        }
+        // Avoid using ObtainBuffer here as that might give us the stream buffer.
+        const BufferId buffer_id = FindBuffer(src, num_bytes);
+        auto& buffer = slot_buffers[buffer_id];
+        SynchronizeBuffer(buffer, src, num_bytes, false);
+        return buffer;
+    }();
+    auto& dst_buffer = [&] -> const Buffer& {
+        if (dst_gds) {
+            return gds_buffer;
+        }
+        // Prefer using ObtainBuffer here as that will auto-mark the region as GPU modified.
+        const auto [buffer, offset] = ObtainBuffer(dst, num_bytes, true);
+        return *buffer;
+    }();
+    vk::BufferCopy region{
+        .srcOffset = src_buffer.Offset(src),
+        .dstOffset = dst_buffer.Offset(dst),
+        .size = num_bytes,
+    };
+    const vk::BufferMemoryBarrier2 buf_barriers_before[2] = {
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+            .buffer = dst_buffer.Handle(),
+            .offset = dst_buffer.Offset(dst),
+            .size = num_bytes,
+        },
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eTransferRead,
+            .buffer = src_buffer.Handle(),
+            .offset = src_buffer.Offset(src),
+            .size = num_bytes,
+        },
+    };
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = buf_barriers_before,
+    });
+    cmdbuf.copyBuffer(src_buffer.Handle(), dst_buffer.Handle(), region);
+    const vk::BufferMemoryBarrier2 buf_barriers_after[2] = {
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
+            .buffer = dst_buffer.Handle(),
+            .offset = dst_buffer.Offset(dst),
+            .size = num_bytes,
+        },
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eTransferRead,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eMemoryWrite,
+            .buffer = src_buffer.Handle(),
+            .offset = src_buffer.Offset(src),
+            .size = num_bytes,
+        },
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = buf_barriers_after,
+    });
+}
+
 std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
                                                   bool is_texel_buffer, BufferId buffer_id) {
-    // For small uniform buffers that have not been modified by gpu
-    // use device local stream buffer to reduce renderpass breaks.
-    // Maybe we want to modify the threshold now that the page size is 16KB?
-    static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
-    const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
-    if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
+    // For read-only buffers use device local stream buffer to reduce renderpass breaks.
+    if (!is_written && size <= CACHING_PAGESIZE && !IsRegionGpuModified(device_addr, size)) {
         const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
         return {&stream_buffer, offset};
     }
-
-    if (!buffer_id || slot_buffers[buffer_id].is_deleted) {
+    if (IsBufferInvalid(buffer_id)) {
         buffer_id = FindBuffer(device_addr, size);
     }
     Buffer& buffer = slot_buffers[buffer_id];
     SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
     if (is_written) {
-        memory_tracker.MarkRegionAsGpuModified(device_addr, size);
+        memory_tracker->MarkRegionAsGpuModified(device_addr, size);
         gpu_modified_ranges.Add(device_addr, size);
     }
     return {&buffer, buffer.Offset(device_addr)};
@@ -352,21 +451,17 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
 
 std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) {
     // Check if any buffer contains the full requested range.
-    const u64 page = gpu_addr >> CACHING_PAGEBITS;
-    const BufferId buffer_id = page_table[page].buffer_id;
+    const BufferId buffer_id = page_table[gpu_addr >> CACHING_PAGEBITS].buffer_id;
     if (buffer_id) {
-        Buffer& buffer = slot_buffers[buffer_id];
-        if (buffer.IsInBounds(gpu_addr, size)) {
+        if (Buffer& buffer = slot_buffers[buffer_id]; buffer.IsInBounds(gpu_addr, size)) {
             SynchronizeBuffer(buffer, gpu_addr, size, false);
             return {&buffer, buffer.Offset(gpu_addr)};
         }
     }
-    // If no buffer contains the full requested range but some buffer within was GPU-modified,
-    // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
-    if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+    // If some buffer within was GPU modified create a full buffer to avoid losing GPU data.
+    if (IsRegionGpuModified(gpu_addr, size)) {
         return ObtainBuffer(gpu_addr, size, false, false);
     }
-    // In all other cases, just do a CPU copy to the staging buffer.
     const auto [data, offset] = staging_buffer.Map(size, 16);
     memory->CopySparseMemory(gpu_addr, data, size);
 
@@ -380,11 +475,11 @@ bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
 }
 
 bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionCpuModified(addr, size);
+    return memory_tracker->IsRegionCpuModified(addr, size);
 }
 
 bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionGpuModified(addr, size);
+    return memory_tracker->IsRegionGpuModified(addr, size);
 }
 
 BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
@@ -723,7 +818,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
     boost::container::small_vector<vk::BufferCopy, 4> copies;
     u64 total_size_bytes = 0;
     VAddr buffer_start = buffer.CpuAddr();
-    memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
+    memory_tracker->ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
         copies.push_back(vk::BufferCopy{
             .srcOffset = total_size_bytes,
             .dstOffset = device_addr_out - buffer_start,
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 651ba84dc..5acb6ebd3 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -9,7 +9,6 @@
 
 #include "common/slot_vector.h"
 #include "common/types.h"
 #include "video_core/buffer_cache/buffer.h"
-#include "video_core/buffer_cache/memory_tracker.h"
 #include "video_core/buffer_cache/range_set.h"
 #include "video_core/multi_level_page_table.h"
 
@@ -21,13 +20,6 @@ namespace Core {
 class MemoryManager;
 }
 
-namespace Shader {
-namespace Gcn {
-struct FetchShaderData;
-}
-struct Info;
-} // namespace Shader
-
 namespace Vulkan {
 class GraphicsPipeline;
 }
@@ -39,6 +31,8 @@ using BufferId = Common::SlotId;
 static constexpr BufferId NULL_BUFFER_ID{0};
 
 class TextureCache;
+class MemoryTracker;
+class PageManager;
 
 class BufferCache {
 public:
@@ -69,10 +63,16 @@ public:
         bool has_stream_leap = false;
     };
 
+    using IntervalSet =
+        boost::icl::interval_set<VAddr, std::less,
+                                 boost::icl::right_open_interval<VAddr, std::less>>;
+    using IntervalType = typename IntervalSet::interval_type;
+
 public:
     explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
-                         Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool,
-                         TextureCache& texture_cache, PageManager& tracker);
+                         AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
+                         PageManager& tracker);
     ~BufferCache();
 
     /// Returns a pointer to GDS device local buffer.
@@ -110,7 +110,10 @@ public:
     }
 
     /// Invalidates any buffer in the logical page range.
-    void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
+    void InvalidateMemory(VAddr device_addr, u64 size);
+
+    /// Waits on pending downloads in the logical page range.
+    void ReadMemory(VAddr device_addr, u64 size);
 
     /// Binds host vertex buffers for the current draw.
     void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@@ -124,6 +127,9 @@ public:
     /// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data)
     void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
 
+    /// Performs buffer to buffer data copy on the GPU.
+    void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
+
     /// Obtains a buffer for the specified region.
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
                                                        bool is_texel_buffer = false,
@@ -166,6 +172,10 @@ private:
         });
     }
 
+    inline bool IsBufferInvalid(BufferId buffer_id) const {
+        return !buffer_id || slot_buffers[buffer_id].is_deleted;
+    }
+
     void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
 
     [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@@ -193,11 +203,10 @@ private:
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
-    Vulkan::Rasterizer& rasterizer;
     AmdGpu::Liverpool* liverpool;
     Core::MemoryManager* memory;
     TextureCache& texture_cache;
-    PageManager& tracker;
+    std::unique_ptr<MemoryTracker> memory_tracker;
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
     StreamBuffer download_buffer;
@@ -209,7 +218,6 @@ private:
     Common::SlotVector<Buffer> slot_buffers;
     RangeSet gpu_modified_ranges;
     SplitRangeMap<BufferId> buffer_ranges;
-    MemoryTracker memory_tracker;
     PageTable page_table;
     vk::UniqueDescriptorSetLayout fault_process_desc_layout;
     vk::UniquePipeline fault_process_pipeline;
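Replacing the by-value `MemoryTracker` member with a `std::unique_ptr` plus a forward declaration lets `buffer_cache.h` drop the `memory_tracker.h` include entirely; the only requirement is that the destructor is compiled where the type is complete, which is why `BufferCache::~BufferCache() = default;` stays in the .cpp. A condensed sketch of the idiom (class names here are illustrative, not the project's):

```cpp
// --- header (sketch) ---------------------------------------------------
#include <memory>

class MemoryTracker; // forward declaration replaces the heavy include

class BufferCacheSketch {
public:
    BufferCacheSketch();
    ~BufferCacheSketch(); // must be *defined* where MemoryTracker is complete

private:
    std::unique_ptr<MemoryTracker> memory_tracker; // OK with incomplete type
};

// --- source file (sketch): full definition visible here -----------------
class MemoryTracker {};

BufferCacheSketch::BufferCacheSketch()
    : memory_tracker(std::make_unique<MemoryTracker>()) {}
// unique_ptr's deleter is instantiated here, where the type is complete.
BufferCacheSketch::~BufferCacheSketch() = default;

int main() {
    BufferCacheSketch cache;
    return 0;
}
```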
diff --git a/src/video_core/buffer_cache/memory_tracker.h b/src/video_core/buffer_cache/memory_tracker.h
index 3dbffdabd..acc53b8f9 100644
--- a/src/video_core/buffer_cache/memory_tracker.h
+++ b/src/video_core/buffer_cache/memory_tracker.h
@@ -57,6 +57,14 @@ public:
         });
     }
 
+    void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept {
+        IteratePages(dirty_cpu_addr, query_size,
+                     [](RegionManager* manager, u64 offset, size_t size) {
+                         manager->template ChangeRegionState<Type::GPU, false>(
+                             manager->GetCpuAddr() + offset, size);
+                     });
+    }
+
     /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
     void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
         IteratePages(query_cpu_range, query_size,
diff --git a/src/video_core/buffer_cache/region_definitions.h b/src/video_core/buffer_cache/region_definitions.h
index f035704d9..76e7ee263 100644
--- a/src/video_core/buffer_cache/region_definitions.h
+++ b/src/video_core/buffer_cache/region_definitions.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include <bitset>
 #include "common/bit_array.h"
 #include "common/types.h"
 
@@ -20,9 +19,8 @@ constexpr u64 NUM_PAGES_PER_REGION = TRACKER_HIGHER_PAGE_SIZE / TRACKER_BYTES_PE
 enum class Type {
     CPU,
     GPU,
-    Writeable,
 };
 
 using RegionBits = Common::BitArray<NUM_PAGES_PER_REGION>;
 
-} // namespace VideoCore
\ No newline at end of file
+} // namespace VideoCore
diff --git a/src/video_core/buffer_cache/region_manager.h b/src/video_core/buffer_cache/region_manager.h
index e8ec21129..894809cd5 100644
--- a/src/video_core/buffer_cache/region_manager.h
+++ b/src/video_core/buffer_cache/region_manager.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include <mutex>
-#include <utility>
+#include "common/config.h"
 #include "common/div_ceil.h"
 
 #ifdef __linux__
@@ -20,7 +20,7 @@ namespace VideoCore {
 
 /**
- * Allows tracking CPU and GPU modification of pages in a contigious 4MB virtual address region.
+ * Allows tracking CPU and GPU modification of pages in a contiguous 16MB virtual address region.
  * Information is stored in bitsets for spacial locality and fast update of single pages.
  */
 class RegionManager {
 public:
@@ -30,6 +30,7 @@ public:
         cpu.Fill();
         gpu.Clear();
         writeable.Fill();
+        readable.Fill();
     }
     explicit RegionManager() = default;
@@ -47,29 +48,19 @@ public:
 
     template <Type type>
     RegionBits& GetRegionBits() noexcept {
-        static_assert(type != Type::Writeable);
         if constexpr (type == Type::CPU) {
             return cpu;
         } else if constexpr (type == Type::GPU) {
             return gpu;
-        } else if constexpr (type == Type::Writeable) {
-            return writeable;
-        } else {
-            static_assert(false, "Invalid type");
         }
     }
 
     template <Type type>
     const RegionBits& GetRegionBits() const noexcept {
-        static_assert(type != Type::Writeable);
         if constexpr (type == Type::CPU) {
             return cpu;
         } else if constexpr (type == Type::GPU) {
             return gpu;
-        } else if constexpr (type == Type::Writeable) {
-            return writeable;
-        } else {
-            static_assert(false, "Invalid type");
         }
     }
 
@@ -90,7 +81,6 @@ public:
             return;
         }
         std::scoped_lock lk{lock};
-        static_assert(type != Type::Writeable);
         RegionBits& bits = GetRegionBits<type>();
 
         if constexpr (enable) {
@@ -99,7 +89,9 @@ public:
             bits.UnsetRange(start_page, end_page);
         }
         if constexpr (type == Type::CPU) {
-            UpdateProtection<!enable>();
+            UpdateProtection<!enable, false>();
+        } else if (Config::readbacks()) {
+            UpdateProtection<enable, true>();
         }
     }
 
@@ -122,16 +114,10 @@ public:
             return;
         }
         std::scoped_lock lk{lock};
-        static_assert(type != Type::Writeable);
 
         RegionBits& bits = GetRegionBits<type>();
         RegionBits mask(bits, start_page, end_page);
 
-        // TODO: this will not be needed once we handle readbacks
-        if constexpr (type == Type::GPU) {
-            mask &= ~writeable;
-        }
-
         for (const auto& [start, end] : mask) {
             func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
         }
 
@@ -139,7 +125,9 @@ public:
         if constexpr (clear) {
             bits.UnsetRange(start_page, end_page);
             if constexpr (type == Type::CPU) {
-                UpdateProtection<true>();
+                UpdateProtection<true, false>();
+            } else if (Config::readbacks()) {
+                UpdateProtection<false, true>();
             }
         }
     }
 
@@ -151,7 +139,7 @@ public:
      * @param size Size in bytes of the region to query for modifications
      */
     template <Type type>
-    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
+    [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) noexcept {
         RENDERER_TRACE;
         const size_t start_page = SanitizeAddress(offset) / TRACKER_BYTES_PER_PAGE;
         const size_t end_page =
             Common::DivCeil(SanitizeAddress(offset + size), TRACKER_BYTES_PER_PAGE);
         if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) {
             return false;
         }
@@ -159,17 +147,10 @@ public:
-        // std::scoped_lock lk{lock}; // Is this needed?
-        static_assert(type != Type::Writeable);
+        std::scoped_lock lk{lock};
 
         const RegionBits& bits = GetRegionBits<type>();
         RegionBits test(bits, start_page, end_page);
-
-        // TODO: this will not be needed once we handle readbacks
-        if constexpr (type == Type::GPU) {
-            test &= ~writeable;
-        }
-
         return test.Any();
     }
 
@@ -181,19 +162,21 @@ private:
     /**
      * Notify tracker about changes in the CPU tracking state of a word in the buffer
      *
     * @param current_bits Current state of the word
     * @param new_bits New state of the word
      *
-     * @tparam add_to_tracker True when the tracker should start tracking the new pages
+     * @tparam track True when the tracker should start tracking the new pages
      */
-    template <bool add_to_tracker>
+    template <bool track, bool is_read = false>
     void UpdateProtection() {
         RENDERER_TRACE;
-        RegionBits mask = cpu ^ writeable;
-
+        RegionBits mask = is_read ? (~gpu ^ readable) : (cpu ^ writeable);
         if (mask.None()) {
-            return; // No changes to the CPU tracking state
+            return;
         }
-
-        writeable = cpu;
-        tracker->UpdatePageWatchersForRegion<add_to_tracker>(cpu_addr, mask);
+        if constexpr (is_read) {
+            readable = ~gpu;
+        } else {
+            writeable = cpu;
+        }
+        tracker->UpdatePageWatchersForRegion<track, is_read>(cpu_addr, mask);
     }
 
 #ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
@@ -206,6 +189,7 @@ private:
     RegionBits cpu;
     RegionBits gpu;
     RegionBits writeable;
+    RegionBits readable;
 };
 
 } // namespace VideoCore
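`UpdateProtection()` derives the set of pages whose host protection must actually change by XOR-ing the target state against the currently applied state: `cpu ^ writeable` for the write side, and `~gpu ^ readable` for the read side (a GPU-modified page holds stale CPU-visible data, so guest reads of it must fault and trigger a readback). Only the pages in the mask are handed to the page manager. A small worked example with `std::bitset` standing in for `RegionBits`:

```cpp
#include <bitset>
#include <cassert>

int main() {
    // Write path: pages the CPU owns (dirty) should be left writeable.
    std::bitset<8> cpu{0b0011'0101};
    std::bitset<8> writeable{0b0000'1111};
    std::bitset<8> mask = cpu ^ writeable; // only these need re-protection
    assert(mask == std::bitset<8>(0b0011'1010));
    writeable = cpu; // afterwards the applied state matches the target

    // Read path is analogous with inverted GPU bits: readable must track ~gpu.
    std::bitset<8> gpu{0b1100'0000};
    std::bitset<8> readable{0b1111'1111};
    std::bitset<8> read_mask = ~gpu ^ readable; // GPU-dirty pages lose read access
    assert(read_mask == std::bitset<8>(0b1100'0000));
    readable = ~gpu;
    return 0;
}
```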
diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp
index 15dbf909c..40ac9b5b4 100644
--- a/src/video_core/page_manager.cpp
+++ b/src/video_core/page_manager.cpp
@@ -13,6 +13,7 @@
 
 #ifndef _WIN64
 #include <sys/mman.h>
+#include "common/adaptive_mutex.h"
 #ifdef ENABLE_USERFAULTFD
 #include <fcntl.h>
 #include <linux/userfaultfd.h>
@@ -23,6 +24,7 @@
 #endif
 #else
 #include <windows.h>
+#include "common/spin_lock.h"
 #endif
 
 #ifdef __linux__
@@ -38,22 +40,45 @@ constexpr size_t PAGE_BITS = 12;
 
 struct PageManager::Impl {
     struct PageState {
-        u8 num_watchers{};
+        u8 num_write_watchers : 7;
+        // At the moment only buffer cache can request read watchers.
+        // And buffers cannot overlap, thus only 1 can exist per page.
+        u8 num_read_watchers : 1;
 
-        Core::MemoryPermission Perm() const noexcept {
-            return num_watchers == 0 ? Core::MemoryPermission::ReadWrite
-                                     : Core::MemoryPermission::Read;
+        Core::MemoryPermission WritePerm() const noexcept {
+            return num_write_watchers == 0 ? Core::MemoryPermission::Write
+                                           : Core::MemoryPermission::None;
         }
 
-        template <s32 delta>
+        Core::MemoryPermission ReadPerm() const noexcept {
+            return num_read_watchers == 0 ? Core::MemoryPermission::Read
+                                          : Core::MemoryPermission::None;
+        }
+
+        Core::MemoryPermission Perms() const noexcept {
+            return ReadPerm() | WritePerm();
+        }
+
+        template <s32 delta, bool is_read = false>
         u8 AddDelta() {
-            if constexpr (delta == 1) {
-                return ++num_watchers;
-            } else if constexpr (delta == -1) {
-                ASSERT_MSG(num_watchers > 0, "Not enough watchers");
-                return --num_watchers;
+            if constexpr (is_read) {
+                if constexpr (delta == 1) {
+                    return ++num_read_watchers;
+                } else if (delta == -1) {
+                    ASSERT_MSG(num_read_watchers > 0, "Not enough watchers");
+                    return --num_read_watchers;
+                } else {
+                    return num_read_watchers;
+                }
             } else {
-                return num_watchers;
+                if constexpr (delta == 1) {
+                    return ++num_write_watchers;
+                } else if (delta == -1) {
+                    ASSERT_MSG(num_write_watchers > 0, "Not enough watchers");
+                    return --num_write_watchers;
+                } else {
+                    return num_write_watchers;
+                }
             }
         }
     };
@@ -176,6 +201,7 @@ struct PageManager::Impl {
         RENDERER_TRACE;
         auto* memory = Core::Memory::Instance();
         auto& impl = memory->GetAddressSpace();
+        // ASSERT(perms != Core::MemoryPermission::Write);
         impl.Protect(address, size, perms);
     }
 
@@ -183,12 +209,14 @@ struct PageManager::Impl {
         const auto addr = reinterpret_cast<VAddr>(fault_address);
         if (Common::IsWriteError(context)) {
             return rasterizer->InvalidateMemory(addr, 1);
+        } else {
+            return rasterizer->ReadMemory(addr, 1);
         }
         return false;
     }
-
 #endif
-    template <s32 delta>
+
+    template <s32 delta, bool is_read = false>
     void UpdatePageWatchers(VAddr addr, u64 size) {
         RENDERER_TRACE;
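The split `PageState` above packs both watcher counts into a single byte: 7 bits for write watchers and 1 bit for read watchers, which suffices because (per the comment) only the buffer cache requests read watchers and buffers cannot overlap. A compilable sketch of the packing and the permission queries it supports:

```cpp
#include <cassert>
#include <cstdint>

// Sketch of the PageState layout; helper names are illustrative.
struct PageState {
    uint8_t num_write_watchers : 7; // many caches may write-watch a page
    uint8_t num_read_watchers : 1;  // at most one read watcher per page

    bool Readable() const { return num_read_watchers == 0; }
    bool Writeable() const { return num_write_watchers == 0; }
};

int main() {
    static_assert(sizeof(PageState) == 1, "both counters share one byte");
    PageState state{};
    ++state.num_write_watchers; // a cache starts tracking writes to this page
    assert(!state.Writeable() && state.Readable());
    ++state.num_read_watchers; // GPU modified the page: reads must fault too
    assert(!state.Readable());
    return 0;
}
```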
@@ -200,7 +228,7 @@ struct PageManager::Impl {
         const auto lock_end = locks.begin() + Common::DivCeil(page_end, PAGES_PER_LOCK);
         Common::RangeLockGuard lk(lock_start, lock_end);
 
-        auto perms = cached_pages[page].Perm();
+        auto perms = cached_pages[page].Perms();
         u64 range_begin = 0;
         u64 range_bytes = 0;
         u64 potential_range_bytes = 0;
@@ -226,9 +254,9 @@ struct PageManager::Impl {
             PageState& state = cached_pages[page];
 
             // Apply the change to the page state
-            const u8 new_count = state.AddDelta<delta>();
+            const u8 new_count = state.AddDelta<delta, is_read>();
 
-            if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
+            if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] {
                 // If the protection changed add pending (un)protect action
                 release_pending();
                 perms = new_perms;
@@ -253,25 +281,23 @@ struct PageManager::Impl {
         release_pending();
     }
 
-    template <bool add_to_tracker>
+    template <bool track, bool is_read>
     void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) {
         RENDERER_TRACE;
         auto start_range = mask.FirstRange();
         auto end_range = mask.LastRange();
 
         if (start_range.second == end_range.second) {
-            // Optimization: if all pages are contiguous, use the regular UpdatePageWatchers
+            // if all pages are contiguous, use the regular UpdatePageWatchers
             const VAddr start_addr = base_addr + (start_range.first << PAGE_BITS);
             const u64 size = (start_range.second - start_range.first) << PAGE_BITS;
-
-            UpdatePageWatchers<add_to_tracker ? 1 : -1>(start_addr, size);
-            return;
+            return UpdatePageWatchers<track ? 1 : -1, is_read>(start_addr, size);
         }
 
         size_t base_page = (base_addr >> PAGE_BITS);
         ASSERT(base_page % PAGES_PER_LOCK == 0);
         std::scoped_lock lk(locks[base_page / PAGES_PER_LOCK]);
-        auto perms = cached_pages[base_page + start_range.first].Perm();
+        auto perms = cached_pages[base_page + start_range.first].Perms();
         u64 range_begin = 0;
         u64 range_bytes = 0;
         u64 potential_range_bytes = 0;
@@ -292,9 +318,10 @@ struct PageManager::Impl {
             const bool update = mask.Get(page);
 
             // Apply the change to the page state
-            const u8 new_count = update ? state.AddDelta<add_to_tracker ? 1 : -1>() : state.AddDelta<0>();
+            const u8 new_count =
+                update ? state.AddDelta<track ? 1 : -1, is_read>() : state.AddDelta<0, is_read>();
 
-            if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
+            if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] {
                 // If the protection changed add pending (un)protect action
                 release_pending();
                 perms = new_perms;
@@ -348,19 +375,23 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) {
 
 template <s32 delta>
 void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const {
-    impl->UpdatePageWatchers<delta>(addr, size);
+    impl->UpdatePageWatchers<delta, false>(addr, size);
 }
 
-template <bool add_to_tracker>
+template <bool track, bool is_read>
 void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const {
-    impl->UpdatePageWatchersForRegion<add_to_tracker>(base_addr, mask);
+    impl->UpdatePageWatchersForRegion<track, is_read>(base_addr, mask);
 }
 
 template void PageManager::UpdatePageWatchers<1>(VAddr addr, u64 size) const;
 template void PageManager::UpdatePageWatchers<-1>(VAddr addr, u64 size) const;
-template void PageManager::UpdatePageWatchersForRegion<true>(VAddr base_addr,
-                                                             RegionBits& mask) const;
-template void PageManager::UpdatePageWatchersForRegion<false>(VAddr base_addr,
-                                                              RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<true, true>(VAddr base_addr,
+                                                                   RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<true, false>(VAddr base_addr,
+                                                                    RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<false, true>(VAddr base_addr,
+                                                                    RegionBits& mask) const;
+template void PageManager::UpdatePageWatchersForRegion<false, false>(VAddr base_addr,
+                                                                     RegionBits& mask) const;
 
 } // namespace VideoCore
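Because `UpdatePageWatchersForRegion`'s body lives in the .cpp, every template-argument combination used from other translation units has to be instantiated explicitly — hence the four instantiation lines above replacing the previous two. A minimal illustration of the idiom (function name is illustrative):

```cpp
#include <cstdio>

// Imagine this template body lives in one .cpp file while callers live in
// other translation units that only see its declaration.
template <bool track, bool is_read>
void UpdateWatchers(int page) {
    std::printf("page %d: track=%d is_read=%d\n", page, track, is_read);
}

// Without these explicit instantiations, out-of-TU callers would fail to
// link: the compiler never sees a use of the template next to its body.
template void UpdateWatchers<true, true>(int page);
template void UpdateWatchers<true, false>(int page);
template void UpdateWatchers<false, true>(int page);
template void UpdateWatchers<false, false>(int page);

int main() {
    UpdateWatchers<true, false>(42);
    return 0;
}
```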
diff --git a/src/video_core/page_manager.h b/src/video_core/page_manager.h
index 561087ead..4ca41cb43 100644
--- a/src/video_core/page_manager.h
+++ b/src/video_core/page_manager.h
@@ -37,9 +37,8 @@ public:
     template <s32 delta>
     void UpdatePageWatchers(VAddr addr, u64 size) const;
 
-    /// Updates watches in the pages touching the specified region
-    /// using a mask.
-    template <bool add_to_tracker>
+    /// Updates watches in the pages touching the specified region using a mask.
+    template <bool track, bool is_read = false>
     void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const;
 
     /// Returns page aligned address.
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 86adfcaa5..0aad0f047 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
 Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
                        AmdGpu::Liverpool* liverpool_)
     : instance{instance_}, scheduler{scheduler_}, page_manager{this},
-      buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager},
+      buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
       texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
       memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
     if (!Config::nullGpu()) {
@@ -945,6 +945,10 @@ void Rasterizer::InlineData(VAddr address, const void* value, u32 num_bytes, boo
     buffer_cache.InlineData(address, value, num_bytes, is_gds);
 }
 
+void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
+    buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds);
+}
+
 u32 Rasterizer::ReadDataFromGds(u32 gds_offset) {
     auto* gds_buf = buffer_cache.GetGdsBuffer();
     u32 value;
@@ -957,11 +961,20 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
         // Not GPU mapped memory, can skip invalidation logic entirely.
         return false;
     }
-    buffer_cache.InvalidateMemory(addr, size, false);
+    buffer_cache.InvalidateMemory(addr, size);
     texture_cache.InvalidateMemory(addr, size);
     return true;
 }
 
+bool Rasterizer::ReadMemory(VAddr addr, u64 size) {
+    if (!IsMapped(addr, size)) {
+        // Not GPU mapped memory, can skip invalidation logic entirely.
+        return false;
+    }
+    buffer_cache.ReadMemory(addr, size);
+    return true;
+}
+
 bool Rasterizer::IsMapped(VAddr addr, u64 size) {
     if (size == 0) {
         // There is no memory, so not mapped.
@@ -982,7 +995,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) {
 }
 
 void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
-    buffer_cache.InvalidateMemory(addr, size, true);
+    buffer_cache.InvalidateMemory(addr, size);
     texture_cache.UnmapMemory(addr, size);
     page_manager.OnGpuUnmap(addr, size);
     {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index fb9ca4bbe..c570ea368 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -56,8 +56,10 @@ public:
                      bool from_guest = false);
     void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
+    void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
     u32 ReadDataFromGds(u32 gsd_offset);
     bool InvalidateMemory(VAddr addr, u64 size);
+    bool ReadMemory(VAddr addr, u64 size);
     bool IsMapped(VAddr addr, u64 size);
     void MapMemory(VAddr addr, u64 size);
     void UnmapMemory(VAddr addr, u64 size);