From 54907409c76d3383a17cbfd3dc207b109ffb1a25 Mon Sep 17 00:00:00 2001 From: Lander Gallastegi Date: Sun, 6 Jul 2025 02:34:51 +0200 Subject: [PATCH] Initial implementation (untested) --- src/video_core/buffer_cache/buffer_cache.cpp | 57 ++++++++++- src/video_core/buffer_cache/buffer_cache.h | 8 +- src/video_core/buffer_cache/memory_tracker.h | 97 ++++++++++++------- src/video_core/buffer_cache/region_manager.h | 18 ++++ .../renderer_vulkan/vk_rasterizer.cpp | 8 +- .../renderer_vulkan/vk_rasterizer.h | 6 +- 6 files changed, 145 insertions(+), 49 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 1aadf34a8..dfc3f2050 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -29,9 +29,9 @@ static constexpr size_t DeviceBufferSize = 128_MB; static constexpr size_t MaxPageFaults = 1024; BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, - AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_, - PageManager& tracker) - : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, + Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_, + TextureCache& texture_cache_, PageManager& tracker) + : instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize}, stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, @@ -817,7 +817,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_texel_buffer) { boost::container::small_vector copies; VAddr buffer_start = buffer.CpuAddr(); - memory_tracker->ForEachUploadRange( + memory_tracker->ForEachUploadRange( device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) { const u64 offset = 
staging_buffer.Copy(device_addr_out, range_size); copies.push_back(vk::BufferCopy{ @@ -996,7 +996,54 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) { } void BufferCache::SynchronizeBuffersForDma() { - + RENDERER_TRACE; + boost::container::small_vector buffers; + boost::container::small_vector barriers; + boost::container::small_vector copies; + const auto& mapped_ranges = rasterizer.GetMappedRanges(); + memory_tracker->Lock(); + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + mapped_ranges.ForEach([&](VAddr device_addr, u64 size) { + ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) { + if (memory_tracker->IsRegionCpuModified(device_addr, size)) { + barriers.push_back(vk::BufferMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = + vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite | + vk::AccessFlagBits2::eTransferRead | vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer.Handle(), + .offset = 0, + .size = buffer.SizeBytes(), + }); + buffers.push_back(&buffer); + } + }); + }); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = static_cast(barriers.size()), + .pBufferMemoryBarriers = barriers.data(), + }); + for (auto* buffer : buffers) { + memory_tracker->ForEachUploadRange( + buffer->CpuAddr(), buffer->SizeBytes(), false, + [&](u64 device_addr_out, u64 range_size) { + const u64 offset = staging_buffer.Copy(device_addr_out, range_size); + copies.push_back(vk::BufferCopy{ + .srcOffset = offset, + .dstOffset = device_addr_out - buffer->CpuAddr(), + .size = range_size, + }); + }); + cmdbuf.copyBuffer(staging_buffer.Handle(), buffer->Handle(), copies); + copies.clear(); + } + memory_tracker->UnmarkAllRegionsAsCpuModified(); + 
MemoryBarrier(); + memory_tracker->Unlock(); } void BufferCache::MemoryBarrier() { diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 3ce5d2436..fd3470ae7 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -21,7 +21,8 @@ class MemoryManager; namespace Vulkan { class GraphicsPipeline; -} +class Rasterizer; +} // namespace Vulkan namespace VideoCore { @@ -70,8 +71,8 @@ public: public: explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - AmdGpu::Liverpool* liverpool, TextureCache& texture_cache, - PageManager& tracker); + Vulkan::Rasterizer& rasterizer, AmdGpu::Liverpool* liverpool, + TextureCache& texture_cache, PageManager& tracker); ~BufferCache(); /// Returns a pointer to GDS device local buffer. @@ -203,6 +204,7 @@ private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; + Vulkan::Rasterizer& rasterizer; AmdGpu::Liverpool* liverpool; Core::MemoryManager* memory; TextureCache& texture_cache; diff --git a/src/video_core/buffer_cache/memory_tracker.h b/src/video_core/buffer_cache/memory_tracker.h index 5ac129967..6a93ee31b 100644 --- a/src/video_core/buffer_cache/memory_tracker.h +++ b/src/video_core/buffer_cache/memory_tracker.h @@ -27,7 +27,7 @@ public: /// Returns true if a region has been modified from the CPU template bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { - return IteratePages( + return IterateRegions( query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) { std::scoped_lock lk{manager->lock}; return manager->template IsRegionModified(offset, size); @@ -37,7 +37,7 @@ public: /// Returns true if a region has been modified from the GPU template bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { - return IteratePages( + return IterateRegions( query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) 
{ std::scoped_lock lk{manager->lock}; return manager->template IsRegionModified(offset, size); @@ -47,29 +47,38 @@ public: /// Mark region as CPU modified, notifying the device_tracker about this change template void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { - IteratePages(dirty_cpu_addr, query_size, - [](RegionManager* manager, u64 offset, size_t size) { - std::scoped_lock lk{manager->lock}; - manager->template ChangeRegionState( - manager->GetCpuAddr() + offset, size); - }); + IterateRegions(dirty_cpu_addr, query_size, + [](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark all regions as CPU modified, notifying the device_tracker about this change + template + void UnmarkAllRegionsAsCpuModified() noexcept { + ForEachRegion([](RegionManager* manager) { + std::scoped_lock lk{manager->lock}; + manager->template ChangeAllRegionState(); + }); } /// Unmark region as modified from the host GPU template void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { - IteratePages(dirty_cpu_addr, query_size, - [](RegionManager* manager, u64 offset, size_t size) { - std::scoped_lock lk{manager->lock}; - manager->template ChangeRegionState( - manager->GetCpuAddr() + offset, size); - }); + IterateRegions(dirty_cpu_addr, query_size, + [](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); } /// Removes all protection from a page and ensures GPU data has been flushed if requested template void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept { - IteratePages( + IterateRegions( cpu_addr, size, [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) { const bool should_flush = [&] { @@ -92,29 +101,30 @@ public: } /// Call 
'func' for each CPU modified range and unmark those pages as CPU modified - template + template void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) { - IteratePages(query_cpu_range, query_size, - [&func, is_written](RegionManager* manager, u64 offset, size_t size) { - std::scoped_lock lk{manager->lock}; - manager->template ForEachModifiedRange( - manager->GetCpuAddr() + offset, size, func); - if (is_written) { - manager->template ChangeRegionState( - manager->GetCpuAddr() + offset, size); - } - }); + IterateRegions( + query_cpu_range, query_size, + [&func, is_written](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); + if (is_written && clear) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + } + }); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) { - IteratePages(query_cpu_range, query_size, - [&func](RegionManager* manager, u64 offset, size_t size) { - std::scoped_lock lk{manager->lock}; - manager->template ForEachModifiedRange( - manager->GetCpuAddr() + offset, size, func); - }); + IterateRegions(query_cpu_range, query_size, + [&func](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); + }); } /// Lck the memory tracker. @@ -127,7 +137,6 @@ public: global_lock.unlock(); } - private: /** * @brief IteratePages Iterates L2 word manager page table. 
@@ -137,7 +146,7 @@ private: * @return */ template - bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { + bool IterateRegions(VAddr cpu_address, size_t size, Func&& func) { RENDERER_TRACE; if constexpr (locking) { std::shared_lock lock{global_lock}; } @@ -177,6 +186,26 @@ private: return false; } + /** + * @brief Iterate through all regions in the memory tracker. + * @param func Callback for each region manager. + * @return + */ + template + void ForEachRegion(Func&& func) { + RENDERER_TRACE; + if constexpr (locking) { + std::shared_lock lock{global_lock}; + } + for (auto& pool : manager_pool) { + for (auto& manager : pool) { + if (manager.GetCpuAddr() != 0) { + func(&manager); + } + } + } + } + void CreateRegion(std::size_t page_index) { const VAddr base_cpu_addr = page_index << TRACKER_HIGHER_PAGE_BITS; if (free_managers.empty()) { diff --git a/src/video_core/buffer_cache/region_manager.h b/src/video_core/buffer_cache/region_manager.h index 608b16fb3..b21f4e406 100644 --- a/src/video_core/buffer_cache/region_manager.h +++ b/src/video_core/buffer_cache/region_manager.h @@ -136,6 +136,24 @@ public: } } + /** + * Changes state of all pages in the region + */ + template + void ChangeAllRegionState() noexcept { + RENDERER_TRACE; + if constexpr (enable) { + GetRegionBits().Fill(); + } else { + GetRegionBits().Clear(); + } + if constexpr (type == Type::CPU) { + UpdateProtection(); + } else if (Config::readbacks()) { + UpdateProtection(); + } + } + /** * Returns true when a region has been modified * diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index da15b7c5e..d95363af7 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) { Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, AmdGpu::Liverpool* liverpool_) :
instance{instance_}, scheduler{scheduler_}, page_manager{this}, - buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager}, + buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager}, texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} { if (!Config::nullGpu()) { @@ -475,12 +475,8 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { // We only use fault buffer for DMA right now. { Common::RecursiveSharedLock lock{mapped_ranges_mutex}; - mapped_ranges.ForEach( - [&](const VAddr addr, u64 size) { - buffer_cache.SynchronizeBuffersInRange(addr, size); - }); + buffer_cache.SynchronizeBuffersForDma(); } - buffer_cache.MemoryBarrier(); } fault_process_pending |= uses_dma; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 35e8284ae..a00df1093 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -7,9 +7,9 @@ #include "common/shared_first_mutex.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" +#include "video_core/range_set.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/texture_cache/texture_cache.h" -#include "video_core/range_set.h" namespace AmdGpu { struct Liverpool; @@ -43,6 +43,10 @@ public: return texture_cache; } + [[nodiscard]] const VideoCore::RangeSet& GetMappedRanges() const noexcept { + return mapped_ranges; + } + void Draw(bool is_indexed, u32 index_offset = 0); void DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 size, u32 max_count, VAddr count_address);