From dedf6de2ac13b6543339ee5cdedc44ee0efd963c Mon Sep 17 00:00:00 2001
From: TheTurtle
Date: Wed, 11 Jun 2025 11:34:37 +0300
Subject: [PATCH] texture_cache: Implement color<->depth copies (#3079)

* texture_cache: Implement color to depth copies and vice versa

* ir_passes: Adjust shared memory barrier pass to cover more cases

* texture_cache: Remove unused code

* review comment
---
 .../ir/passes/shared_memory_barrier_pass.cpp  |  35 ++++--
 src/video_core/buffer_cache/buffer_cache.cpp  |  10 +-
 src/video_core/buffer_cache/buffer_cache.h    |  23 ++--
 .../renderer_vulkan/vk_rasterizer.cpp         |   4 +-
 src/video_core/texture_cache/image.cpp        | 113 +++++++++++++++---
 src/video_core/texture_cache/image.h          |   3 +-
 .../texture_cache/texture_cache.cpp           |  23 ++--
 7 files changed, 157 insertions(+), 54 deletions(-)

diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
index 10d6a285c..11713d099 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
@@ -1,6 +1,7 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <unordered_set>
 #include "shader_recompiler/ir/breadth_first_search.h"
 #include "shader_recompiler/ir/ir_emitter.h"
 #include "shader_recompiler/ir/program.h"
@@ -51,11 +52,14 @@ static void EmitBarrierInBlock(IR::Block* block) {
     }
 }
 
+using NodeSet = std::unordered_set<IR::Block*>;
+
 // Inserts a barrier after divergent conditional blocks to avoid undefined
 // behavior when some threads write and others read from shared memory.
-static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
+static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data,
+                                    NodeSet& divergence_end, u32& divergence_depth) {
     const IR::U1 cond = data.if_node.cond;
-    const auto insert_barrier =
+    const auto is_divergent_cond =
         IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
             if (inst->GetOpcode() == IR::Opcode::GetAttributeU32 &&
                 inst->Arg(0).Attribute() == IR::Attribute::LocalInvocationId) {
@@ -63,11 +67,15 @@ static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
             }
             return std::nullopt;
         });
-    if (insert_barrier) {
-        IR::Block* const merge = data.if_node.merge;
-        auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
-        IR::IREmitter ir{*merge, insert_point};
-        ir.Barrier();
+    if (is_divergent_cond) {
+        if (divergence_depth == 0) {
+            IR::Block* const merge = data.if_node.merge;
+            auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
+            IR::IREmitter ir{*merge, insert_point};
+            ir.Barrier();
+        }
+        ++divergence_depth;
+        divergence_end.emplace(data.if_node.merge);
     }
 }
 
@@ -89,19 +97,22 @@ void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_in
         return;
     }
     using Type = IR::AbstractSyntaxNode::Type;
-    u32 branch_depth{};
+    u32 divergence_depth{};
+    NodeSet divergence_end;
     for (const IR::AbstractSyntaxNode& node : program.syntax_list) {
         if (node.type == Type::EndIf) {
-            --branch_depth;
+            if (divergence_end.contains(node.data.end_if.merge)) {
+                --divergence_depth;
+            }
             continue;
         }
         // Check if branch depth is zero, we don't want to insert barrier in potentially divergent
         // code.
-        if (node.type == Type::If && branch_depth++ == 0) {
-            EmitBarrierInMergeBlock(node.data);
+        if (node.type == Type::If) {
+            EmitBarrierInMergeBlock(node.data, divergence_end, divergence_depth);
             continue;
         }
-        if (node.type == Type::Block && branch_depth == 0) {
+        if (node.type == Type::Block && divergence_depth == 0) {
             EmitBarrierInBlock(node.data.block);
         }
     }
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index e470f8e77..ffa744b31 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -23,6 +23,7 @@ static constexpr size_t DataShareBufferSize = 64_KB;
 static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 128_MB;
 static constexpr size_t DownloadBufferSize = 128_MB;
+static constexpr size_t DeviceBufferSize = 16_MB;
 static constexpr size_t MaxPageFaults = 1024;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
@@ -32,7 +33,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
-      download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize),
+      download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize},
+      device_buffer{instance, scheduler, MemoryUsage::DeviceLocal, DeviceBufferSize},
       gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
       bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags,
                            BDA_PAGETABLE_SIZE},
@@ -348,7 +350,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     return {&buffer, buffer.Offset(device_addr)};
 }
 
-std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
+std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) {
     // Check if any buffer contains the full requested range.
     const u64 page = gpu_addr >> CACHING_PAGEBITS;
     const BufferId buffer_id = page_table[page].buffer_id;
@@ -361,10 +363,10 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size,
     }
     // If no buffer contains the full requested range but some buffer within was GPU-modified,
     // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
-    // This is only done if the request prefers to use GPU memory, otherwise we can skip it.
-    if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+    if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
        return ObtainBuffer(gpu_addr, size, false, false);
     }
+
     // In all other cases, just do a CPU copy to the staging buffer.
     const auto [data, offset] = staging_buffer.Map(size, 16);
     memory->CopySparseMemory(gpu_addr, data, size);
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index c2faf12c8..d7d753213 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -80,11 +80,6 @@ public:
         return &gds_buffer;
     }
 
-    /// Retrieves the host visible device local stream buffer.
-    [[nodiscard]] StreamBuffer& GetStreamBuffer() noexcept {
-        return stream_buffer;
-    }
-
     /// Retrieves the device local DBA page table buffer.
     [[nodiscard]] Buffer* GetBdaPageTableBuffer() noexcept {
         return &bda_pagetable_buffer;
@@ -100,6 +95,20 @@ public:
         return slot_buffers[id];
     }
 
+    /// Retrieves a utility buffer optimized for specified memory usage.
+    StreamBuffer& GetUtilityBuffer(MemoryUsage usage) noexcept {
+        switch (usage) {
+        case MemoryUsage::Stream:
+            return stream_buffer;
+        case MemoryUsage::Download:
+            return download_buffer;
+        case MemoryUsage::Upload:
+            return staging_buffer;
+        case MemoryUsage::DeviceLocal:
+            return device_buffer;
+        }
+    }
+
     /// Invalidates any buffer in the logical page range.
     void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
 
@@ -121,8 +130,7 @@ public:
                                    BufferId buffer_id = {});
 
     /// Attempts to obtain a buffer without modifying the cache contents.
-    [[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size,
-                                                           bool prefer_gpu);
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainBufferForImage(VAddr gpu_addr, u32 size);
 
     /// Return true when a region is registered on the cache
     [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
@@ -193,6 +201,7 @@ private:
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
     StreamBuffer download_buffer;
+    StreamBuffer device_buffer;
     Buffer gds_buffer;
     Buffer bda_pagetable_buffer;
     Buffer fault_buffer;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index dff4e5a5f..9dea5ceea 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -549,7 +549,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
             const auto* gds_buf = buffer_cache.GetGdsBuffer();
             buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes());
         } else if (desc.buffer_type == Shader::BufferType::Flatbuf) {
-            auto& vk_buffer = buffer_cache.GetStreamBuffer();
+            auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
             const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
             const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size,
                                               instance.UniformMinAlignment());
@@ -561,7 +561,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
             const auto* fault_buffer = buffer_cache.GetFaultBuffer();
             buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes());
         } else if (desc.buffer_type == Shader::BufferType::SharedMemory) {
-            auto& lds_buffer = buffer_cache.GetStreamBuffer();
+            auto& lds_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
             const auto& cs_program = liverpool->GetCsRegs();
             const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups();
             const auto [data, offset] =
diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp
index 6241100a0..ab9111e6b 100644
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@@ -312,43 +312,121 @@ void Image::Upload(vk::Buffer buffer, u64 offset) {
                 vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {});
 }
 
-void Image::CopyImage(const Image& image) {
+void Image::CopyImage(const Image& src_image) {
     scheduler->EndRendering();
     Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {});
 
     auto cmdbuf = scheduler->CommandBuffer();
+    const auto& src_info = src_image.info;
 
     boost::container::small_vector image_copy{};
-    const u32 num_mips = std::min(image.info.resources.levels, info.resources.levels);
+    const u32 num_mips =
std::min(src_info.resources.levels, info.resources.levels); for (u32 m = 0; m < num_mips; ++m) { - const auto mip_w = std::max(image.info.size.width >> m, 1u); - const auto mip_h = std::max(image.info.size.height >> m, 1u); - const auto mip_d = std::max(image.info.size.depth >> m, 1u); + const auto mip_w = std::max(src_info.size.width >> m, 1u); + const auto mip_h = std::max(src_info.size.height >> m, 1u); + const auto mip_d = std::max(src_info.size.depth >> m, 1u); image_copy.emplace_back(vk::ImageCopy{ .srcSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = m, .baseArrayLayer = 0, - .layerCount = image.info.resources.layers, + .layerCount = src_info.resources.layers, }, .dstSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = m, .baseArrayLayer = 0, - .layerCount = image.info.resources.layers, + .layerCount = src_info.resources.layers, }, .extent = {mip_w, mip_h, mip_d}, }); } - cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout, + cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, image_copy); Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); } -void Image::CopyMip(const Image& image, u32 mip, u32 slice) { +void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) { + const auto& src_info = src_image.info; + + vk::BufferImageCopy buffer_image_copy = { + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = + { + .aspectMask = src_info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = + { + .x = 0, + .y = 0, + .z = 0, + }, + .imageExtent = + { + .width = src_info.size.width, + .height = src_info.size.height, + .depth = src_info.size.depth, + }, + }; + + const vk::BufferMemoryBarrier2 pre_copy_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferRead, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer, + .offset = offset, + .size = VK_WHOLE_SIZE, + }; + + const vk::BufferMemoryBarrier2 post_copy_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferRead, + .buffer = buffer, + .offset = offset, + .size = VK_WHOLE_SIZE, + }; + + scheduler->EndRendering(); + src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); + + auto cmdbuf = scheduler->CommandBuffer(); + + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_copy_barrier, + }); + + cmdbuf.copyImageToBuffer(src_image.image, vk::ImageLayout::eTransferSrcOptimal, buffer, + buffer_image_copy); + + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_copy_barrier, + }); + + buffer_image_copy.imageSubresource.aspectMask = + info.IsDepthStencil() ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor; + + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + buffer_image_copy); +} + +void Image::CopyMip(const Image& src_image, u32 mip, u32 slice) { scheduler->EndRendering(); Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); @@ -358,26 +436,27 @@ void Image::CopyMip(const Image& image, u32 mip, u32 slice) { const auto mip_h = std::max(info.size.height >> mip, 1u); const auto mip_d = std::max(info.size.depth >> mip, 1u); - ASSERT(mip_w == image.info.size.width); - ASSERT(mip_h == image.info.size.height); + const auto& src_info = src_image.info; + ASSERT(mip_w == src_info.size.width); + ASSERT(mip_h == src_info.size.height); - const u32 num_layers = std::min(image.info.resources.layers, info.resources.layers); + const u32 num_layers = std::min(src_info.resources.layers, info.resources.layers); const vk::ImageCopy image_copy{ .srcSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = num_layers, }, .dstSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = mip, .baseArrayLayer = slice, .layerCount = num_layers, }, .extent = {mip_w, mip_h, mip_d}, }; - cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout, + cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, image_copy); Transit(vk::ImageLayout::eGeneral, diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index 404e25e88..31b67e021 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -104,7 +104,8 @@ struct Image { std::optional range, vk::CommandBuffer cmdbuf = {}); void Upload(vk::Buffer buffer, u64 offset); - void CopyImage(const Image& image); + void CopyImage(const Image& src_image); + void CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset); void CopyMip(const Image& src_image, u32 mip, u32 slice); bool IsTracked() { diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index cc244eb6b..a47e858ab 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -8,7 +8,6 @@ #include "common/debug.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" -#include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/texture_cache/host_compatibility.h" @@ -126,7 +125,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) { ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, BindingType binding, ImageId cache_image_id) { - const auto& cache_image = slot_images[cache_image_id]; + auto& cache_image = slot_images[cache_image_id]; if (!cache_image.info.IsDepthStencil() && !requested_info.IsDepthStencil()) { return {}; @@ -169,18 +168,21 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi } if (recreate) { - auto new_info{requested_info}; - new_info.resources = std::max(requested_info.resources, cache_image.info.resources); - new_info.UpdateSize(); + auto new_info = requested_info; + new_info.resources = std::min(requested_info.resources, cache_image.info.resources); const auto new_image_id = 
slot_images.insert(instance, scheduler, new_info); RegisterImage(new_image_id); // Inherit image usage - auto& new_image = GetImage(new_image_id); + auto& new_image = slot_images[new_image_id]; new_image.usage = cache_image.usage; + new_image.flags &= ~ImageFlagBits::Dirty; - // TODO: perform a depth copy here + // Perform depth<->color copy using the intermediate copy buffer. + const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal); + new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0); + // Free the cache image. FreeImage(cache_image_id); return new_image_id; } @@ -584,12 +586,11 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const VAddr image_addr = image.info.guest_address; const size_t image_size = image.info.guest_size; - const auto [vk_buffer, buf_offset] = - buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty); + const auto [vk_buffer, buf_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size); const auto cmdbuf = sched_ptr->CommandBuffer(); - // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW - // hazard + + // The obtained buffer may be GPU modified so we need to emit a barrier to prevent RAW hazard if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead, vk::PipelineStageFlagBits2::eTransfer)) { cmdbuf.pipelineBarrier2(vk::DependencyInfo{
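
Note (illustrative sketch, not part of the patch): Vulkan does not allow vkCmdCopyImage between a color image and a depth/stencil image (when either format is depth/stencil, both formats must match exactly), which is why CopyImageWithBuffer round-trips the texels through the device-local utility buffer. Below is a minimal caller-side sketch of the path taken in ResolveDepthOverlap; the ids `overlap_id` and `recreated_id` are hypothetical stand-ins, not identifiers introduced by the patch.

    // Sketch only, assuming images registered as in ResolveDepthOverlap above.
    Image& cache_image = slot_images[overlap_id];   // existing color (or depth) image
    Image& new_image = slot_images[recreated_id];   // recreated depth (or color) image
    const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal);
    // Mip 0 of the source must fit in the intermediate buffer; with the fixed
    // 16_MB DeviceBufferSize this holds for e.g. a 1920x1080 32-bit depth target
    // (1920 * 1080 * 4 bytes, well under 16_MB).
    new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0);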