From 7fedbd52e0629c037be631068f609e77a2b27615 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 7 Jul 2025 16:23:20 +0300 Subject: [PATCH] texture_cache: Async download of GPU modified linear images (#3204) * texture_cache: Async download of GPU modified linear images * liverpool: Back to less submits * texture_cache: Don't download depth images * config: Add option for linear image readback --- src/common/config.cpp | 11 +++- src/common/config.h | 1 + src/video_core/amdgpu/liverpool.cpp | 3 +- src/video_core/buffer_cache/buffer_cache.h | 2 +- .../renderer_vulkan/vk_rasterizer.cpp | 16 +++-- .../renderer_vulkan/vk_rasterizer.h | 2 +- .../renderer_vulkan/vk_scheduler.cpp | 13 ++-- src/video_core/renderer_vulkan/vk_scheduler.h | 3 + .../texture_cache/texture_cache.cpp | 61 ++++++++++++++++++- src/video_core/texture_cache/texture_cache.h | 10 ++- 10 files changed, 106 insertions(+), 16 deletions(-) diff --git a/src/common/config.cpp b/src/common/config.cpp index d3a5fa6a1..010fecf95 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -65,6 +65,7 @@ static u32 screenHeight = 720; static bool isNullGpu = false; static bool shouldCopyGPUBuffers = false; static bool readbacksEnabled = false; +static bool readbackLinearImagesEnabled = false; static bool directMemoryAccessEnabled = false; static bool shouldDumpShaders = false; static bool shouldPatchShaders = false; @@ -103,7 +104,7 @@ u32 m_language = 1; // english static std::string trophyKey = ""; // Expected number of items in the config file -static constexpr u64 total_entries = 51; +static constexpr u64 total_entries = 52; bool allowHDR() { return isHDRAllowed; @@ -262,6 +263,10 @@ bool readbacks() { return readbacksEnabled; } +bool readbackLinearImages() { + return readbackLinearImagesEnabled; +} + bool directMemoryAccess() { return directMemoryAccessEnabled; } @@ -631,6 +636,8 @@ void load(const std::filesystem::path& path) { isNullGpu = toml::find_or(gpu, "nullGpu", isNullGpu); shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", shouldCopyGPUBuffers); readbacksEnabled = toml::find_or(gpu, "readbacks", readbacksEnabled); + readbackLinearImagesEnabled = + toml::find_or(gpu, "readbackLinearImages", readbackLinearImagesEnabled); directMemoryAccessEnabled = toml::find_or(gpu, "directMemoryAccess", directMemoryAccessEnabled); shouldDumpShaders = toml::find_or(gpu, "dumpShaders", shouldDumpShaders); @@ -802,6 +809,7 @@ void save(const std::filesystem::path& path) { data["GPU"]["nullGpu"] = isNullGpu; data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers; data["GPU"]["readbacks"] = readbacksEnabled; + data["GPU"]["readbackLinearImages"] = readbackLinearImagesEnabled; data["GPU"]["directMemoryAccess"] = directMemoryAccessEnabled; data["GPU"]["dumpShaders"] = shouldDumpShaders; data["GPU"]["patchShaders"] = shouldPatchShaders; @@ -902,6 +910,7 @@ void setDefaultValues() { isNullGpu = false; shouldCopyGPUBuffers = false; readbacksEnabled = false; + readbackLinearImagesEnabled = false; directMemoryAccessEnabled = false; shouldDumpShaders = false; shouldPatchShaders = false; diff --git a/src/common/config.h b/src/common/config.h index 931fa68e2..2ed08198a 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -47,6 +47,7 @@ bool copyGPUCmdBuffers(); void setCopyGPUCmdBuffers(bool enable); bool readbacks(); void setReadbacks(bool enable); +bool readbackLinearImages(); bool directMemoryAccess(); void setDirectMemoryAccess(bool enable); bool dumpShaders(); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 9b8c28b66..e264de74a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -135,9 +135,8 @@ void Liverpool::Process(std::stop_token stoken) { if (submit_done) { VideoCore::EndCapture(); - if (rasterizer) { - rasterizer->ProcessFaults(); + rasterizer->EndCommandList(); rasterizer->Flush(); } submit_done = false; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 900a27aee..354d01431 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -112,7 +112,7 @@ public: /// Invalidates any buffer in the logical page range. void InvalidateMemory(VAddr device_addr, u64 size); - /// Waits on pending downloads in the logical page range. + /// Flushes any GPU modified buffer in the logical page range back to CPU memory. void ReadMemory(VAddr device_addr, u64 size, bool is_write = false); /// Binds host vertex buffers for the current draw. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index e4e026485..cca193831 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -272,6 +272,8 @@ void Rasterizer::EliminateFastClear() { void Rasterizer::Draw(bool is_indexed, u32 index_offset) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + if (!FilterDraw()) { return; } @@ -317,6 +319,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 u32 max_count, VAddr count_address) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + if (!FilterDraw()) { return; } @@ -380,6 +384,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 void Rasterizer::DispatchDirect() { RENDERER_TRACE; + scheduler.PopPendingOperations(); + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { @@ -407,6 +413,8 @@ void Rasterizer::DispatchDirect() { void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { @@ -439,11 +447,12 @@ void Rasterizer::Finish() { scheduler.Finish(); } -void Rasterizer::ProcessFaults() { +void Rasterizer::EndCommandList() { if (fault_process_pending) { fault_process_pending = false; buffer_cache.ProcessFaultBuffer(); } + texture_cache.ProcessDownloadImages(); } bool Rasterizer::BindResources(const Pipeline* pipeline) { @@ -649,8 +658,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin if (instance.IsNullDescriptorSupported()) { image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); } else { - auto& null_image_view = - texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc.view_info); + auto& null_image_view = texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc); image_infos.emplace_back(VK_NULL_HANDLE, *null_image_view.image_view, vk::ImageLayout::eGeneral); } @@ -664,7 +672,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin bound_images.emplace_back(image_id); auto& image = texture_cache.GetImage(image_id); - auto& image_view = texture_cache.FindTexture(image_id, desc.view_info); + auto& image_view = texture_cache.FindTexture(image_id, desc); if (image.binding.force_general || image.binding.is_target) { image.Transit(vk::ImageLayout::eGeneral, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 4a978746c..1e1680258 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -68,7 +68,7 @@ public: void CpSync(); u64 Flush(); void Finish(); - void ProcessFaults(); + void EndCommandList(); PipelineCache& GetPipelineCache() { return pipeline_cache; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index e75a69924..4c4e17fe4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -101,6 +101,14 @@ void Scheduler::Wait(u64 tick) { } } +void Scheduler::PopPendingOperations() { + master_semaphore.Refresh(); + while (!pending_ops.empty() && master_semaphore.IsFree(pending_ops.front().gpu_tick)) { + pending_ops.front().callback(); + pending_ops.pop(); + } +} + void Scheduler::AllocateWorkerCommandBuffers() { const vk::CommandBufferBeginInfo begin_info = { .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, @@ -175,10 +183,7 @@ void Scheduler::SubmitExecution(SubmitInfo& info) { AllocateWorkerCommandBuffers(); // Apply pending operations - while (!pending_ops.empty() && IsFree(pending_ops.front().gpu_tick)) { - pending_ops.front().callback(); - pending_ops.pop(); - } + PopPendingOperations(); } void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 8ddf00f6a..36fd9c055 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -317,6 +317,9 @@ public: /// Waits for the given tick to trigger on the GPU. void Wait(u64 tick); + /// Attempts to execute operations whose tick the GPU has caught up with. + void PopPendingOperations(); + /// Starts a new rendering scope with provided state. void BeginRendering(const RenderState& new_state); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index aa6563a84..723b95892 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -5,7 +5,9 @@ #include #include "common/assert.h" +#include "common/config.h" #include "common/debug.h" +#include "core/memory.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_instance.h" @@ -58,6 +60,50 @@ ImageId TextureCache::GetNullImage(const vk::Format format) { return null_id; } +void TextureCache::ProcessDownloadImages() { + for (const ImageId image_id : download_images) { + DownloadImageMemory(image_id); + } + download_images.clear(); +} + +void TextureCache::DownloadImageMemory(ImageId image_id) { + Image& image = slot_images[image_id]; + if (False(image.flags & ImageFlagBits::GpuModified)) { + return; + } + auto& download_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::Download); + const u32 download_size = image.info.pitch * image.info.size.height * + image.info.resources.layers * (image.info.num_bits / 8); + ASSERT(download_size <= image.info.guest_size); + const auto [download, offset] = download_buffer.Map(download_size); + download_buffer.Commit(); + const vk::BufferImageCopy image_download = { + .bufferOffset = offset, + .bufferRowLength = image.info.pitch, + .bufferImageHeight = image.info.size.height, + .imageSubresource = + { + .aspectMask = image.info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = image.info.resources.layers, + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {image.info.size.width, image.info.size.height, 1}, + }; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, + download_buffer.Handle(), image_download); + scheduler.DeferOperation([device_addr = image.info.guest_address, download, download_size] { + auto* memory = Core::Memory::Instance(); + memory->TryWriteBacking(std::bit_cast(device_addr), download, download_size); + }); +} + void TextureCache::MarkAsMaybeDirty(ImageId image_id, Image& image) { if (image.hash == 0) { // Initialize hash @@ -437,16 +483,27 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo return slot_image_views[view_id]; } -ImageView& TextureCache::FindTexture(ImageId image_id, const ImageViewInfo& view_info) { +ImageView& TextureCache::FindTexture(ImageId image_id, const BaseDesc& desc) { Image& image = slot_images[image_id]; + if (desc.type == BindingType::Storage) { + image.flags |= ImageFlagBits::GpuModified; + if (Config::readbackLinearImages() && + image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + download_images.emplace(image_id); + } + } UpdateImage(image_id); - return RegisterImageView(image_id, view_info); + return RegisterImageView(image_id, desc.view_info); } ImageView& TextureCache::FindRenderTarget(BaseDesc& desc) { const ImageId image_id = FindImage(desc); Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; + if (Config::readbackLinearImages() && + image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + download_images.emplace(image_id); + } image.usage.render_target = 1u; UpdateImage(image_id); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 87228b84f..ff8ffb61c 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -105,11 +106,14 @@ public: /// Evicts any images that overlap the unmapped range. void UnmapMemory(VAddr cpu_addr, size_t size); + /// Schedules a copy of pending images for download back to CPU memory. + void ProcessDownloadImages(); + /// Retrieves the image handle of the image with the provided attributes. [[nodiscard]] ImageId FindImage(BaseDesc& desc, FindFlags flags = {}); /// Retrieves an image view with the properties of the specified image id. - [[nodiscard]] ImageView& FindTexture(ImageId image_id, const ImageViewInfo& view_info); + [[nodiscard]] ImageView& FindTexture(ImageId image_id, const BaseDesc& desc); /// Retrieves the render target with specified properties [[nodiscard]] ImageView& FindRenderTarget(BaseDesc& desc); @@ -252,6 +256,9 @@ private: /// Gets or creates a null image for a particular format. ImageId GetNullImage(vk::Format format); + /// Copies image memory back to CPU. + void DownloadImageMemory(ImageId image_id); + /// Create an image from the given parameters [[nodiscard]] ImageId InsertImage(const ImageInfo& info, VAddr cpu_addr); @@ -293,6 +300,7 @@ private: Common::SlotVector slot_image_views; tsl::robin_map samplers; tsl::robin_map null_images; + std::unordered_set download_images; PageTable page_table; std::mutex mutex;