diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73fff2c73..53a2281ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -674,6 +674,8 @@ set(COMMON src/common/logging/backend.cpp
     src/common/polyfill_thread.h
     src/common/rdtsc.cpp
     src/common/rdtsc.h
+    src/common/recursive_lock.cpp
+    src/common/recursive_lock.h
     src/common/sha1.h
     src/common/signal_context.h
     src/common/signal_context.cpp
@@ -864,6 +866,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
     src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
+    src/shader_recompiler/ir/abstract_syntax_list.cpp
     src/shader_recompiler/ir/abstract_syntax_list.h
     src/shader_recompiler/ir/attribute.cpp
     src/shader_recompiler/ir/attribute.h
diff --git a/externals/sirit b/externals/sirit
index 09a1416ab..6b450704f 160000
--- a/externals/sirit
+++ b/externals/sirit
@@ -1 +1 @@
-Subproject commit 09a1416ab1b59ddfebd2618412f118f2004f3b2c
+Subproject commit 6b450704f6fedb9413d0c89a9eb59d028eb1e6c0
diff --git a/src/common/recursive_lock.cpp b/src/common/recursive_lock.cpp
new file mode 100644
index 000000000..2471a2ee0
--- /dev/null
+++ b/src/common/recursive_lock.cpp
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <unordered_map>
+#include "common/assert.h"
+#include "common/recursive_lock.h"
+
+namespace Common::Detail {
+
+struct RecursiveLockState {
+    RecursiveLockType type;
+    int count;
+};
+
+thread_local std::unordered_map<void*, RecursiveLockState> g_recursive_locks;
+
+bool IncrementRecursiveLock(void* mutex, RecursiveLockType type) {
+    auto& state = g_recursive_locks[mutex];
+    if (state.count == 0) {
+        ASSERT(state.type == RecursiveLockType::None);
+        state.type = type;
+    }
+    ASSERT(state.type == type);
+    return state.count++ == 0;
+}
+
+bool DecrementRecursiveLock(void* mutex, RecursiveLockType type) {
+    auto& state = g_recursive_locks[mutex];
+    ASSERT(state.type == type && state.count > 0);
+    if (--state.count == 0) {
+        g_recursive_locks.erase(mutex);
+        return true;
+    }
+    return false;
+}
+
+} // namespace Common::Detail
diff --git a/src/common/recursive_lock.h b/src/common/recursive_lock.h
new file mode 100644
index 000000000..5a5fc6658
--- /dev/null
+++ b/src/common/recursive_lock.h
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <mutex>
+#include <optional>
+#include <shared_mutex>
+
+namespace Common {
+
+namespace Detail {
+
+enum class RecursiveLockType { None, Shared, Exclusive };
+
+bool IncrementRecursiveLock(void* mutex, RecursiveLockType type);
+bool DecrementRecursiveLock(void* mutex, RecursiveLockType type);
+
+} // namespace Detail
+
+template <typename MutexType>
+class RecursiveScopedLock {
+public:
+    explicit RecursiveScopedLock(MutexType& mutex) : m_mutex(mutex), m_locked(false) {
+        if (Detail::IncrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Exclusive)) {
+            m_locked = true;
+            m_lock.emplace(m_mutex);
+        }
+    }
+
+    ~RecursiveScopedLock() {
+        Detail::DecrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Exclusive);
+        if (m_locked) {
+            m_lock.reset();
+        }
+    }
+
+private:
+    MutexType& m_mutex;
+    std::optional<std::unique_lock<MutexType>> m_lock;
+    bool m_locked = false;
+};
+
+template <typename MutexType>
+class RecursiveSharedLock {
+public:
+    explicit RecursiveSharedLock(MutexType& mutex) : m_mutex(mutex), m_locked(false) {
+        if (Detail::IncrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Shared)) {
+            m_locked = true;
+            m_lock.emplace(m_mutex);
+        }
+    }
+
+    ~RecursiveSharedLock() {
+        Detail::DecrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Shared);
+        if (m_locked) {
+            m_lock.reset();
+        }
+    }
+
+private:
+    MutexType& m_mutex;
+    std::optional<std::shared_lock<MutexType>> m_lock;
+    bool m_locked = false;
+};
+
+} // namespace Common
\ No newline at end of file
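Note (illustration, not part of the patch): the new guards make a non-recursive mutex safe to re-acquire on the same thread. IncrementRecursiveLock returns true only for the outermost acquisition, so only that guard actually locks; nested guards just bump a thread-local count. A minimal usage sketch with a hypothetical shared table:

    #include <shared_mutex>
    #include "common/recursive_lock.h"

    std::shared_mutex g_table_mutex; // hypothetical shared state

    void InnerUpdate() {
        // Same thread, same mutex: only the thread-local counter is bumped,
        // so this does not deadlock the non-recursive std::shared_mutex.
        Common::RecursiveScopedLock lock{g_table_mutex};
        // ... mutate the table ...
    }

    void OuterUpdate() {
        Common::RecursiveScopedLock lock{g_table_mutex}; // actually locks here
        InnerUpdate();
    } // unlocked once, when the outermost guard is destroyed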
diff --git a/src/common/slot_vector.h b/src/common/slot_vector.h
index d4ac51361..2f693fb28 100644
--- a/src/common/slot_vector.h
+++ b/src/common/slot_vector.h
@@ -14,6 +14,9 @@ namespace Common {
 struct SlotId {
     static constexpr u32 INVALID_INDEX = std::numeric_limits<u32>::max();
 
+    SlotId() noexcept = default;
+    constexpr SlotId(u32 index) noexcept : index(index) {}
+
     constexpr auto operator<=>(const SlotId&) const noexcept = default;
 
     constexpr explicit operator bool() const noexcept {
@@ -28,6 +31,63 @@ class SlotVector {
     constexpr static std::size_t InitialCapacity = 2048;
 
 public:
+    template <typename ValueType, typename Pointer, typename Reference>
+    class Iterator {
+    public:
+        using iterator_category = std::forward_iterator_tag;
+        using value_type = ValueType;
+        using difference_type = std::ptrdiff_t;
+        using pointer = Pointer;
+        using reference = Reference;
+
+        Iterator(SlotVector& vector_, SlotId index_) : vector(vector_), slot(index_) {
+            AdvanceToValid();
+        }
+
+        reference operator*() const {
+            return vector[slot];
+        }
+
+        pointer operator->() const {
+            return &vector[slot];
+        }
+
+        Iterator& operator++() {
+            ++slot.index;
+            AdvanceToValid();
+            return *this;
+        }
+
+        Iterator operator++(int) {
+            Iterator temp = *this;
+            ++(*this);
+            return temp;
+        }
+
+        bool operator==(const Iterator& other) const {
+            return slot == other.slot;
+        }
+
+        bool operator!=(const Iterator& other) const {
+            return !(*this == other);
+        }
+
+    private:
+        void AdvanceToValid() {
+            while (slot < vector.values_capacity && !vector.ReadStorageBit(slot.index)) {
+                ++slot.index;
+            }
+        }
+
+        SlotVector& vector;
+        SlotId slot;
+    };
+
+    using iterator = Iterator<T, T*, T&>;
+    using const_iterator = Iterator<const T, const T*, const T&>;
+    using reverse_iterator = std::reverse_iterator<iterator>;
+    using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
     SlotVector() {
         Reserve(InitialCapacity);
     }
@@ -60,7 +120,7 @@ public:
     }
 
     template <typename... Args>
-    [[nodiscard]] SlotId insert(Args&&... args) noexcept {
+    SlotId insert(Args&&... args) noexcept {
         const u32 index = FreeValueIndex();
         new (&values[index].object) T(std::forward<Args>(args)...);
         SetStorageBit(index);
@@ -78,6 +138,54 @@ public:
         return values_capacity - free_list.size();
     }
 
+    iterator begin() noexcept {
+        return iterator(*this, 0);
+    }
+
+    const_iterator begin() const noexcept {
+        return const_iterator(*this, 0);
+    }
+
+    const_iterator cbegin() const noexcept {
+        return begin();
+    }
+
+    iterator end() noexcept {
+        return iterator(*this, values_capacity);
+    }
+
+    const_iterator end() const noexcept {
+        return const_iterator(*this, values_capacity);
+    }
+
+    const_iterator cend() const noexcept {
+        return end();
+    }
+
+    reverse_iterator rbegin() noexcept {
+        return reverse_iterator(end());
+    }
+
+    const_reverse_iterator rbegin() const noexcept {
+        return const_reverse_iterator(end());
+    }
+
+    const_reverse_iterator crbegin() const noexcept {
+        return rbegin();
+    }
+
+    reverse_iterator rend() noexcept {
+        return reverse_iterator(begin());
+    }
+
+    const_reverse_iterator rend() const noexcept {
+        return const_reverse_iterator(begin());
+    }
+
+    const_reverse_iterator crend() const noexcept {
+        return rend();
+    }
+
 private:
     struct NonTrivialDummy {
         NonTrivialDummy() noexcept {}
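Note (illustration, not part of the patch): the new iterators make SlotVector usable in range-for by skipping unoccupied slots (AdvanceToValid checks the storage bit). A short sketch, assuming the container's existing erase(SlotId):

    Common::SlotVector<int> values;
    const Common::SlotId a = values.insert(1);
    const Common::SlotId b = values.insert(2);
    values.erase(a); // frees a slot in the middle

    for (int& v : values) { // visits live slots only; the freed one is skipped
        v *= 2;             // doubles the remaining element
    }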
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 9ebb842cc..f2e6279f4 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -154,6 +154,7 @@ void Traverse(EmitContext& ctx, const IR::Program& program) {
             for (IR::Inst& inst : node.data.block->Instructions()) {
                 EmitInst(ctx, &inst);
             }
+            ctx.first_to_last_label_map[label.value] = ctx.last_label;
             break;
         }
         case IR::AbstractSyntaxNode::Type::If: {
@@ -298,6 +299,10 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
     if (stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) {
         ctx.AddCapability(spv::Capability::Tessellation);
     }
+    if (info.dma_types != IR::Type::Void) {
+        ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
+        ctx.AddExtension("SPV_KHR_physical_storage_buffer");
+    }
 }
 
 void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) {
@@ -387,7 +392,7 @@ void SetupFloatMode(EmitContext& ctx, const Profile& profile, const RuntimeInfo&
 void PatchPhiNodes(const IR::Program& program, EmitContext& ctx) {
     auto inst{program.blocks.front()->begin()};
     size_t block_index{0};
-    ctx.PatchDeferredPhi([&](size_t phi_arg) {
+    ctx.PatchDeferredPhi([&](u32 phi_arg, Id first_parent) {
         if (phi_arg == 0) {
             ++inst;
             if (inst == program.blocks[block_index]->end() ||
@@ -398,7 +403,9 @@ void PatchPhiNodes(const IR::Program& program, EmitContext& ctx) {
                 } while (inst->GetOpcode() != IR::Opcode::Phi);
             }
         }
-        return ctx.Def(inst->Arg(phi_arg));
+        const Id arg = ctx.Def(inst->Arg(phi_arg));
+        const Id parent = ctx.first_to_last_label_map[first_parent.value];
+        return std::make_pair(arg, parent);
     });
 }
 } // Anonymous namespace
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index c3799fb4b..d7c73ca8f 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -60,7 +60,7 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id
         address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
     }
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
-    const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
+    const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
     const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
     const auto [scope, semantics]{AtomicArgs(ctx)};
     return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] {
@@ -257,7 +257,7 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co
 
 Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
     const auto& buffer = ctx.buffers[binding];
-    const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
+    const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
     const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
     const auto [scope, semantics]{AtomicArgs(ctx)};
     return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
@@ -265,7 +265,7 @@ Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
 
 Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
     const auto& buffer = ctx.buffers[binding];
-    const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
+    const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
     const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
     const auto [scope, semantics]{AtomicArgs(ctx)};
     return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index eff562955..658d4759f 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -161,26 +161,25 @@ void EmitGetGotoVariable(EmitContext&) {
     UNREACHABLE_MSG("Unreachable instruction");
 }
 
-using BufferAlias = EmitContext::BufferAlias;
+using PointerType = EmitContext::PointerType;
 
-Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) {
+Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
     const u32 flatbuf_off_dw = inst->Flags<u32>();
-    const auto& srt_flatbuf = ctx.buffers.back();
-    ASSERT(srt_flatbuf.binding >= 0 && flatbuf_off_dw > 0 &&
-           srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
-    LOG_DEBUG(Render_Recompiler, "ReadConst from flatbuf dword {}", flatbuf_off_dw);
-    const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
-    const Id ptr{
-        ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(flatbuf_off_dw))};
-    return ctx.OpLoad(ctx.U32[1], ptr);
+    // We can only provide a fallback for immediate offsets.
+    if (flatbuf_off_dw == 0) {
+        return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
+    } else {
+        return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
+                                  ctx.ConstU32(flatbuf_off_dw));
+    }
 }
 
-template <BufferAlias alias>
+template <PointerType type>
 Id ReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
     const auto& buffer = ctx.buffers[handle];
     index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords);
-    const auto [id, pointer_type] = buffer[alias];
-    const auto value_type = alias == BufferAlias::U32 ? ctx.U32[1] : ctx.F32[1];
+    const auto [id, pointer_type] = buffer[type];
+    const auto value_type = type == PointerType::U32 ? ctx.U32[1] : ctx.F32[1];
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
     const Id result{ctx.OpLoad(value_type, ptr)};
 
@@ -192,7 +191,7 @@ Id ReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
 }
 
 Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
-    return ReadConstBuffer<BufferAlias::U32>(ctx, handle, index);
+    return ReadConstBuffer<PointerType::U32>(ctx, handle, index);
 }
 
 Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
@@ -251,7 +250,7 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
                 ctx.OpUDiv(ctx.U32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id), step_rate),
                 ctx.ConstU32(param.num_components)),
             ctx.ConstU32(comp));
-        return ReadConstBuffer<BufferAlias::F32>(ctx, param.buffer_handle, offset);
+        return ReadConstBuffer<PointerType::F32>(ctx, param.buffer_handle, offset);
     }
 
     Id result;
@@ -437,7 +436,7 @@ static Id EmitLoadBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size,
     return result;
 }
 
-template <u32 N, BufferAlias alias>
+template <u32 N, PointerType alias>
 static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     const auto flags = inst->Flags<IR::BufferInstInfo>();
     const auto& spv_buffer = ctx.buffers[handle];
@@ -445,7 +444,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
         address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
     }
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
-    const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
+    const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32;
     const auto [id, pointer_type] = spv_buffer[alias];
 
     boost::container::static_vector<Id, N> ids;
@@ -456,7 +455,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
         if (!flags.typed) {
             // Untyped loads have bounds checking per-component.
             ids.push_back(EmitLoadBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords,
-                                                       result_i, alias == BufferAlias::F32));
+                                                       result_i, alias == PointerType::F32));
         } else {
             ids.push_back(result_i);
         }
@@ -466,7 +465,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
     if (flags.typed) {
         // Typed loads have single bounds check for the whole load.
         return EmitLoadBufferBoundsCheck<N>(ctx, index, spv_buffer.size_dwords, result,
-                                            alias == BufferAlias::F32);
+                                            alias == PointerType::F32);
     }
     return result;
 }
@@ -476,7 +475,7 @@ Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     if (Sirit::ValidId(spv_buffer.offset)) {
         address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
    }
-    const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
+    const auto [id, pointer_type] = spv_buffer[PointerType::U8];
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
     const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr))};
     return EmitLoadBufferBoundsCheck<1>(ctx, address, spv_buffer.size, result, false);
@@ -487,7 +486,7 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     if (Sirit::ValidId(spv_buffer.offset)) {
         address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
     }
-    const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
+    const auto [id, pointer_type] = spv_buffer[PointerType::U16];
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
     const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr))};
@@ -495,35 +494,35 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
 }
 
 Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<1, BufferAlias::U32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<1, PointerType::U32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<2, BufferAlias::U32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<2, PointerType::U32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<3, BufferAlias::U32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<3, PointerType::U32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<4, BufferAlias::U32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<1, BufferAlias::F32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<2, BufferAlias::F32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<2, PointerType::F32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<3, BufferAlias::F32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<3, PointerType::F32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return EmitLoadBufferB32xN<4, BufferAlias::F32>(ctx, inst, handle, address);
+    return EmitLoadBufferB32xN<4, PointerType::F32>(ctx, inst, handle, address);
 }
 
 Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
@@ -553,7 +552,7 @@ void EmitStoreBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto
     emit_func();
 }
 
-template <u32 N, BufferAlias alias>
+template <u32 N, PointerType alias>
 static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
                                  Id value) {
     const auto flags = inst->Flags<IR::BufferInstInfo>();
     const auto& spv_buffer = ctx.buffers[handle];
@@ -562,7 +561,7 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I
         address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
     }
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
-    const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
+    const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32;
     const auto [id, pointer_type] = spv_buffer[alias];
 
     auto store = [&] {
@@ -593,7 +592,7 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v
     if (Sirit::ValidId(spv_buffer.offset)) {
         address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
     }
-    const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
+    const auto [id, pointer_type] = spv_buffer[PointerType::U8];
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
     const Id result{ctx.OpUConvert(ctx.U8, value)};
     EmitStoreBufferBoundsCheck<1>(ctx, address, spv_buffer.size, [&] { ctx.OpStore(ptr, result); });
@@ -604,7 +603,7 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id
     if (Sirit::ValidId(spv_buffer.offset)) {
         address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
     }
-    const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
+    const auto [id, pointer_type] = spv_buffer[PointerType::U16];
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
     const Id result{ctx.OpUConvert(ctx.U16, value)};
@@ -613,35 +612,35 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id
 }
 
 void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<1, BufferAlias::U32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<1, PointerType::U32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<2, BufferAlias::U32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<2, PointerType::U32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<3, BufferAlias::U32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<3, PointerType::U32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<4, BufferAlias::U32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<1, BufferAlias::F32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<2, BufferAlias::F32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<2, PointerType::F32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<3, BufferAlias::F32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<3, PointerType::F32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferB32xN<4, BufferAlias::F32>(ctx, inst, handle, address, value);
+    EmitStoreBufferB32xN<4, PointerType::F32>(ctx, inst, handle, address, value);
 }
 
 void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 269f372d5..09f9732bf 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -61,7 +61,7 @@ void EmitSetVectorRegister(EmitContext& ctx);
 void EmitSetGotoVariable(EmitContext& ctx);
 void EmitGetGotoVariable(EmitContext& ctx);
 void EmitSetScc(EmitContext& ctx);
-Id EmitReadConst(EmitContext& ctx, IR::Inst* inst);
+Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset);
 Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
 Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
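Note (illustration, not part of the patch): a host-side sketch of the control flow the two emitted SPIR-V helpers implement. read_const first tries the guest address through the BDA page table and only falls back to the CPU-written flatbuf copy when the page is not resident; that fallback needs the dword offset into the flatbuf, which is only known for immediate offsets, hence the separate read_const_dynamic that returns zero after recording the fault. All names below are stand-ins, not real APIs:

    #include <cstdint>
    #include <optional>

    using u32 = std::uint32_t;
    using u64 = std::uint64_t;

    // Stand-ins for the GPU-side reads (assumptions for the sketch):
    std::optional<u32> TryReadViaBda(u64 addr);  // empty when the page faults
    u32 ReadFlatbufDword(u32 flatbuf_off_dw);    // CPU-written flatbuf snapshot

    u32 read_const(u64 base, u32 offset_dw, u32 flatbuf_off_dw) {
        const u64 addr = base + u64{offset_dw} * 4;
        if (const auto value = TryReadViaBda(addr)) {
            return *value;                       // page resident: direct DMA read
        }
        return ReadFlatbufDword(flatbuf_off_dw); // immediate offset: flatbuf fallback
    }

    u32 read_const_dynamic(u64 base, u32 offset_dw) {
        const u64 addr = base + u64{offset_dw} * 4;
        return TryReadViaBda(addr).value_or(0);  // no fallback; the fault was recorded
    }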
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 2640030df..68bfcc0d0 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -7,6 +7,7 @@
 #include "shader_recompiler/frontend/fetch_shader.h"
 #include "shader_recompiler/runtime_info.h"
 #include "video_core/amdgpu/types.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 
 #include <boost/container/static_vector.hpp>
 #include <fmt/format.h>
@@ -70,6 +71,12 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
                          Bindings& binding_)
     : Sirit::Module(profile_.supported_spirv), info{info_}, runtime_info{runtime_info_},
       profile{profile_}, stage{info.stage}, l_stage{info.l_stage}, binding{binding_} {
+    if (info.dma_types != IR::Type::Void) {
+        SetMemoryModel(spv::AddressingModel::PhysicalStorageBuffer64, spv::MemoryModel::GLSL450);
+    } else {
+        SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450);
+    }
+
     AddCapability(spv::Capability::Shader);
     DefineArithmeticTypes();
     DefineInterfaces();
@@ -137,9 +144,13 @@ void EmitContext::DefineArithmeticTypes() {
     true_value = ConstantTrue(U1[1]);
     false_value = ConstantFalse(U1[1]);
+    u8_one_value = Constant(U8, 1U);
+    u8_zero_value = Constant(U8, 0U);
     u32_one_value = ConstU32(1U);
     u32_zero_value = ConstU32(0U);
     f32_zero_value = ConstF32(0.0f);
+    u64_one_value = Constant(U64, 1ULL);
+    u64_zero_value = Constant(U64, 0ULL);
 
     pi_x2 = ConstF32(2.0f * float{std::numbers::pi});
 
@@ -157,6 +168,35 @@ void EmitContext::DefineArithmeticTypes() {
     if (info.uses_fp64) {
         frexp_result_f64 = Name(TypeStruct(F64[1], S32[1]), "frexp_result_f64");
     }
+
+    if (True(info.dma_types & IR::Type::F64)) {
+        physical_pointer_types[PointerType::F64] =
+            TypePointer(spv::StorageClass::PhysicalStorageBuffer, F64[1]);
+    }
+    if (True(info.dma_types & IR::Type::U64)) {
+        physical_pointer_types[PointerType::U64] =
+            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U64);
+    }
+    if (True(info.dma_types & IR::Type::F32)) {
+        physical_pointer_types[PointerType::F32] =
+            TypePointer(spv::StorageClass::PhysicalStorageBuffer, F32[1]);
+    }
+    if (True(info.dma_types & IR::Type::U32)) {
+        physical_pointer_types[PointerType::U32] =
+            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]);
+    }
+    if (True(info.dma_types & IR::Type::F16)) {
+        physical_pointer_types[PointerType::F16] =
+            TypePointer(spv::StorageClass::PhysicalStorageBuffer, F16[1]);
+    }
+    if (True(info.dma_types & IR::Type::U16)) {
+        physical_pointer_types[PointerType::U16] =
+            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U16);
+    }
+    if (True(info.dma_types & IR::Type::U8)) {
+        physical_pointer_types[PointerType::U8] =
+            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U8);
+    }
 }
 
 void EmitContext::DefineInterfaces() {
@@ -195,9 +235,10 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f
 }
 
 Id EmitContext::GetBufferSize(const u32 sharp_idx) {
-    const auto& srt_flatbuf = buffers.back();
-    ASSERT(srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
-    const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
+    // Can this be done with memory access? Like we do now with ReadConst
+    const auto& srt_flatbuf = buffers[flatbuf_index];
+    ASSERT(srt_flatbuf.buffer_type == BufferType::Flatbuf);
+    const auto [id, pointer_type] = srt_flatbuf[PointerType::U32];
     const auto rsrc1{
         OpLoad(U32[1], OpAccessChain(pointer_type, id, u32_zero_value, ConstU32(sharp_idx + 1)))};
 
@@ -690,8 +731,14 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
     case Shader::BufferType::GdsBuffer:
         Name(id, "gds_buffer");
         break;
-    case Shader::BufferType::ReadConstUbo:
-        Name(id, "srt_flatbuf_ubo");
+    case Shader::BufferType::Flatbuf:
+        Name(id, "srt_flatbuf");
+        break;
+    case Shader::BufferType::BdaPagetable:
+        Name(id, "bda_pagetable");
+        break;
+    case Shader::BufferType::FaultBuffer:
+        Name(id, "fault_buffer");
         break;
     case Shader::BufferType::SharedMemory:
         Name(id, "ssbo_shmem");
@@ -705,35 +752,53 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
 };
 
 void EmitContext::DefineBuffers() {
-    if (!profile.supports_robust_buffer_access && !info.has_readconst) {
-        // In case ReadConstUbo has not already been bound by IR and is needed
+    if (!profile.supports_robust_buffer_access &&
+        info.readconst_types == Info::ReadConstType::None) {
+        // In case Flatbuf has not already been bound by IR and is needed
         // to query buffer sizes, bind it now.
         info.buffers.push_back({
             .used_types = IR::Type::U32,
-            .inline_cbuf = AmdGpu::Buffer::Null(),
-            .buffer_type = BufferType::ReadConstUbo,
+            // We can't guarantee that flatbuf will not grow past UBO
+            // limit if there are a lot of ReadConsts. (We could specialize)
+            .inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
+            .buffer_type = BufferType::Flatbuf,
         });
+        // In the future we may want to read buffer sizes from GPU memory if available.
+        // info.readconst_types |= Info::ReadConstType::Immediate;
    }
     for (const auto& desc : info.buffers) {
         const auto buf_sharp = desc.GetSharp(info);
         const bool is_storage = desc.IsStorage(buf_sharp, profile);
 
+        // Set indexes for special buffers.
+        if (desc.buffer_type == BufferType::Flatbuf) {
+            flatbuf_index = buffers.size();
+        } else if (desc.buffer_type == BufferType::BdaPagetable) {
+            bda_pagetable_index = buffers.size();
+        } else if (desc.buffer_type == BufferType::FaultBuffer) {
+            fault_buffer_index = buffers.size();
+        }
+
         // Define aliases depending on the shader usage.
         auto& spv_buffer = buffers.emplace_back(binding.buffer++, desc.buffer_type);
+        if (True(desc.used_types & IR::Type::U64)) {
+            spv_buffer[PointerType::U64] =
+                DefineBuffer(is_storage, desc.is_written, 3, desc.buffer_type, U64);
+        }
         if (True(desc.used_types & IR::Type::U32)) {
-            spv_buffer[BufferAlias::U32] =
+            spv_buffer[PointerType::U32] =
                 DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, U32[1]);
         }
         if (True(desc.used_types & IR::Type::F32)) {
-            spv_buffer[BufferAlias::F32] =
+            spv_buffer[PointerType::F32] =
                 DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, F32[1]);
         }
         if (True(desc.used_types & IR::Type::U16)) {
-            spv_buffer[BufferAlias::U16] =
+            spv_buffer[PointerType::U16] =
                 DefineBuffer(is_storage, desc.is_written, 1, desc.buffer_type, U16);
         }
         if (True(desc.used_types & IR::Type::U8)) {
-            spv_buffer[BufferAlias::U8] =
+            spv_buffer[PointerType::U8] =
                 DefineBuffer(is_storage, desc.is_written, 0, desc.buffer_type, U8);
         }
         ++binding.unified;
@@ -1003,6 +1068,101 @@ Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_vie
     return func;
 }
 
+Id EmitContext::DefineGetBdaPointer() {
+    const auto caching_pagebits{
+        Constant(U64, static_cast<u64>(VideoCore::BufferCache::CACHING_PAGEBITS))};
+    const auto caching_pagemask{Constant(U64, VideoCore::BufferCache::CACHING_PAGESIZE - 1)};
+
+    const auto func_type{TypeFunction(U64, U64)};
+    const auto func{OpFunction(U64, spv::FunctionControlMask::MaskNone, func_type)};
+    const auto address{OpFunctionParameter(U64)};
+    Name(func, "get_bda_pointer");
+    AddLabel();
+
+    const auto fault_label{OpLabel()};
+    const auto available_label{OpLabel()};
+    const auto merge_label{OpLabel()};
+
+    // Get page BDA
+    const auto page{OpShiftRightLogical(U64, address, caching_pagebits)};
+    const auto page32{OpUConvert(U32[1], page)};
+    const auto& bda_buffer{buffers[bda_pagetable_index]};
+    const auto [bda_buffer_id, bda_pointer_type] = bda_buffer[PointerType::U64];
+    const auto bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)};
+    const auto bda{OpLoad(U64, bda_ptr)};
+
+    // Check if page is GPU cached
+    const auto is_fault{OpIEqual(U1[1], bda, u64_zero_value)};
+    OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
+    OpBranchConditional(is_fault, fault_label, available_label);
+
+    // First-time access, mark as fault
+    AddLabel(fault_label);
+    const auto& fault_buffer{buffers[fault_buffer_index]};
+    const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
+    const auto page_div8{OpShiftRightLogical(U32[1], page32, ConstU32(3U))};
+    const auto page_mod8{OpBitwiseAnd(U32[1], page32, ConstU32(7U))};
+    const auto page_mask{OpShiftLeftLogical(U8, u8_one_value, page_mod8)};
+    const auto fault_ptr{
+        OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div8)};
+    const auto fault_value{OpLoad(U8, fault_ptr)};
+    const auto fault_value_masked{OpBitwiseOr(U8, fault_value, page_mask)};
+    OpStore(fault_ptr, fault_value_masked);
+
+    // Return null pointer
+    const auto fallback_result{u64_zero_value};
+    OpBranch(merge_label);
+
+    // Value is available, compute address
+    AddLabel(available_label);
+    const auto offset_in_bda{OpBitwiseAnd(U64, address, caching_pagemask)};
+    const auto addr{OpIAdd(U64, bda, offset_in_bda)};
+    OpBranch(merge_label);
+
+    // Merge
+    AddLabel(merge_label);
+    const auto result{OpPhi(U64, addr, available_label, fallback_result, fault_label)};
+    OpReturnValue(result);
+    OpFunctionEnd();
+    return func;
+}
+
+Id EmitContext::DefineReadConst(bool dynamic) {
+    const auto func_type{!dynamic ? TypeFunction(U32[1], U32[2], U32[1], U32[1])
+                                  : TypeFunction(U32[1], U32[2], U32[1])};
+    const auto func{OpFunction(U32[1], spv::FunctionControlMask::MaskNone, func_type)};
+    const auto base{OpFunctionParameter(U32[2])};
+    const auto offset{OpFunctionParameter(U32[1])};
+    const auto flatbuf_offset{!dynamic ? OpFunctionParameter(U32[1]) : Id{}};
+    Name(func, dynamic ? "read_const_dynamic" : "read_const");
+    AddLabel();
+
+    const auto base_lo{OpUConvert(U64, OpCompositeExtract(U32[1], base, 0))};
+    const auto base_hi{OpUConvert(U64, OpCompositeExtract(U32[1], base, 1))};
+    const auto base_shift{OpShiftLeftLogical(U64, base_hi, ConstU32(32U))};
+    const auto base_addr{OpBitwiseOr(U64, base_lo, base_shift)};
+    const auto offset_bytes{OpShiftLeftLogical(U32[1], offset, ConstU32(2U))};
+    const auto addr{OpIAdd(U64, base_addr, OpUConvert(U64, offset_bytes))};
+
+    const auto result = EmitMemoryRead(U32[1], addr, [&]() {
+        if (dynamic) {
+            return u32_zero_value;
+        } else {
+            const auto& flatbuf_buffer{buffers[flatbuf_index]};
+            ASSERT(flatbuf_buffer.binding >= 0 &&
+                   flatbuf_buffer.buffer_type == BufferType::Flatbuf);
+            const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
+            const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value,
+                                         flatbuf_offset)};
+            return OpLoad(U32[1], ptr);
+        }
+    });
+
+    OpReturnValue(result);
+    OpFunctionEnd();
+    return func;
+}
+
 void EmitContext::DefineFunctions() {
     if (info.uses_pack_10_11_11) {
         f32_to_uf11 = DefineFloat32ToUfloatM5(6, "f32_to_uf11");
@@ -1012,6 +1172,18 @@ void EmitContext::DefineFunctions() {
         uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32");
         uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32");
     }
+    if (info.dma_types != IR::Type::Void) {
+        get_bda_pointer = DefineGetBdaPointer();
+    }
+
+    if (True(info.readconst_types & Info::ReadConstType::Immediate)) {
+        LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses immediate ReadConst", info.pgm_hash);
+        read_const = DefineReadConst(false);
+    }
+    if (True(info.readconst_types & Info::ReadConstType::Dynamic)) {
+        LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses dynamic ReadConst", info.pgm_hash);
+        read_const_dynamic = DefineReadConst(true);
+    }
 }
 
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 38d55e0e4..a2e0d2f47 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <array>
+#include <unordered_map>
 
 #include "shader_recompiler/backend/bindings.h"
@@ -41,6 +42,17 @@ public:
                 Bindings& binding);
     ~EmitContext();
 
+    enum class PointerType : u32 {
+        U8,
+        U16,
+        F16,
+        U32,
+        F32,
+        U64,
+        F64,
+        NumAlias,
+    };
+
     Id Def(const IR::Value& value);
 
     void DefineBufferProperties();
@@ -133,12 +145,72 @@ public:
         return ConstantComposite(type, constituents);
     }
 
+    inline Id AddLabel() {
+        last_label = Module::AddLabel();
+        return last_label;
+    }
+
+    inline Id AddLabel(Id label) {
+        last_label = Module::AddLabel(label);
+        return last_label;
+    }
+
+    PointerType PointerTypeFromType(Id type) {
+        if (type.value == U8.value)
+            return PointerType::U8;
+        if (type.value == U16.value)
+            return PointerType::U16;
+        if (type.value == F16[1].value)
+            return PointerType::F16;
+        if (type.value == U32[1].value)
+            return PointerType::U32;
+        if (type.value == F32[1].value)
+            return PointerType::F32;
+        if (type.value == U64.value)
+            return PointerType::U64;
+        if (type.value == F64[1].value)
+            return PointerType::F64;
+        UNREACHABLE_MSG("Unknown type for pointer");
+    }
+
+    Id EmitMemoryRead(Id type, Id address, auto&& fallback) {
+        const Id available_label = OpLabel();
+        const Id fallback_label = OpLabel();
+        const Id merge_label = OpLabel();
+
+        const Id addr = OpFunctionCall(U64, get_bda_pointer, address);
+        const Id is_available = OpINotEqual(U1[1], addr, u64_zero_value);
+        OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
+        OpBranchConditional(is_available, available_label, fallback_label);
+
+        // Available
+        AddLabel(available_label);
+        const auto pointer_type = PointerTypeFromType(type);
+        const Id pointer_type_id = physical_pointer_types[pointer_type];
+        const Id addr_ptr = OpConvertUToPtr(pointer_type_id, addr);
+        const Id result = OpLoad(type, addr_ptr, spv::MemoryAccessMask::Aligned, 4u);
+        OpBranch(merge_label);
+
+        // Fallback
+        AddLabel(fallback_label);
+        const Id fallback_result = fallback();
+        OpBranch(merge_label);
+
+        // Merge
+        AddLabel(merge_label);
+        const Id final_result =
+            OpPhi(type, fallback_result, fallback_label, result, available_label);
+        return final_result;
+    }
+
     Info& info;
     const RuntimeInfo& runtime_info;
     const Profile& profile;
     Stage stage;
     LogicalStage l_stage{};
 
+    Id last_label{};
+
     Id void_id{};
     Id U8{};
     Id S8{};
@@ -161,9 +233,13 @@ public:
     Id true_value{};
     Id false_value{};
+    Id u8_one_value{};
+    Id u8_zero_value{};
     Id u32_one_value{};
     Id u32_zero_value{};
     Id f32_zero_value{};
+    Id u64_one_value{};
+    Id u64_zero_value{};
 
     Id shared_u8{};
     Id shared_u16{};
@@ -231,14 +307,6 @@ public:
         bool is_storage = false;
     };
 
-    enum class BufferAlias : u32 {
-        U8,
-        U16,
-        U32,
-        F32,
-        NumAlias,
-    };
-
     struct BufferSpv {
         Id id;
         Id pointer_type;
@@ -252,22 +320,40 @@ public:
         Id size;
         Id size_shorts;
         Id size_dwords;
-        std::array<BufferSpv, u32(BufferAlias::NumAlias)> aliases;
+        std::array<BufferSpv, u32(PointerType::NumAlias)> aliases;
 
-        const BufferSpv& operator[](BufferAlias alias) const {
+        const BufferSpv& operator[](PointerType alias) const {
             return aliases[u32(alias)];
         }
 
-        BufferSpv& operator[](BufferAlias alias) {
+        BufferSpv& operator[](PointerType alias) {
             return aliases[u32(alias)];
         }
     };
 
+    struct PhysicalPointerTypes {
+        std::array<Id, u32(PointerType::NumAlias)> types;
+
+        const Id& operator[](PointerType type) const {
+            return types[u32(type)];
+        }
+
+        Id& operator[](PointerType type) {
+            return types[u32(type)];
+        }
+    };
+
     Bindings& binding;
     boost::container::small_vector<Id, 16> buf_type_ids;
     boost::container::small_vector<BufferDefinition, 16> buffers;
     boost::container::small_vector<TextureDefinition, 8> images;
     boost::container::small_vector<Id, 4> samplers;
+    PhysicalPointerTypes physical_pointer_types;
+    std::unordered_map<u32, Id> first_to_last_label_map;
+
+    size_t flatbuf_index{};
+    size_t bda_pagetable_index{};
+    size_t fault_buffer_index{};
 
     Id sampler_type{};
     Id sampler_pointer_type{};
@@ -292,6 +378,11 @@ public:
     Id uf10_to_f32{};
     Id f32_to_uf10{};
 
+    Id get_bda_pointer{};
+
+    Id read_const{};
+    Id read_const_dynamic{};
+
 private:
     void DefineArithmeticTypes();
     void DefineInterfaces();
@@ -312,6 +403,10 @@ private:
     Id DefineFloat32ToUfloatM5(u32 mantissa_bits, std::string_view name);
     Id DefineUfloatM5ToFloat32(u32 mantissa_bits, std::string_view name);
 
+    Id DefineGetBdaPointer();
+
+    Id DefineReadConst(bool dynamic);
+
     Id GetBufferSize(u32 sharp_idx);
 };
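Note (illustration, not part of the patch): the page-table walk that get_bda_pointer and EmitMemoryRead perform, written out on the CPU for clarity. A zero page-table entry means the page has no device address yet; the shader then sets that page's bit in the fault buffer so the fault-processing pass can map it before the next submit. Constants are assumptions for the sketch (the ObtainBuffer comment later in this patch mentions 16 KB pages, i.e. 14 page bits):

    #include <cstdint>
    #include <vector>

    using u8 = std::uint8_t;
    using u64 = std::uint64_t;

    constexpr u64 CACHING_PAGEBITS = 14;                         // assumed 16 KB pages
    constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;

    // One u64 device address per guest page; 0 marks a non-resident page.
    u64 GetBdaPointer(const std::vector<u64>& bda_pagetable, std::vector<u8>& fault_buffer,
                      u64 guest_addr) {
        const u64 page = guest_addr >> CACHING_PAGEBITS;
        const u64 bda = bda_pagetable[page];
        if (bda == 0) {                                     // first touch: record the fault
            fault_buffer[page / 8] |= u8(1u << (page % 8)); // one bit per page
            return 0;                                       // caller takes the fallback path
        }
        return bda + (guest_addr & (CACHING_PAGESIZE - 1)); // add offset within the page
    }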
diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp
index 376cc304e..3c6fd3968 100644
--- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp
@@ -39,21 +39,22 @@ void Translator::EmitScalarMemory(const GcnInst& inst) {
 
 void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
     const auto& smrd = inst.control.smrd;
-    const u32 dword_offset = [&] -> u32 {
+    const IR::ScalarReg sbase{inst.src[0].code * 2};
+    const IR::U32 dword_offset = [&] -> IR::U32 {
         if (smrd.imm) {
-            return smrd.offset;
+            return ir.Imm32(smrd.offset);
         }
         if (smrd.offset == SQ_SRC_LITERAL) {
-            return inst.src[1].code;
+            return ir.Imm32(inst.src[1].code);
         }
-        UNREACHABLE();
+        return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2));
     }();
-    const IR::ScalarReg sbase{inst.src[0].code * 2};
     const IR::Value base =
         ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1));
     IR::ScalarReg dst_reg{inst.dst[0].code};
     for (u32 i = 0; i < num_dwords; i++) {
-        ir.SetScalarReg(dst_reg + i, ir.ReadConst(base, ir.Imm32(dword_offset + i)));
+        IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));
+        ir.SetScalarReg(dst_reg + i, ir.ReadConst(base, index));
     }
 }
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index ba28d7e43..d349d7827 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -41,7 +41,9 @@ constexpr u32 NUM_TEXTURE_TYPES = 7;
 
 enum class BufferType : u32 {
     Guest,
-    ReadConstUbo,
+    Flatbuf,
+    BdaPagetable,
+    FaultBuffer,
     GdsBuffer,
     SharedMemory,
 };
@@ -215,11 +217,18 @@ struct Info {
     bool stores_tess_level_outer{};
     bool stores_tess_level_inner{};
     bool translation_failed{};
-    bool has_readconst{};
     u8 mrt_mask{0u};
     bool has_fetch_shader{false};
     u32 fetch_shader_sgpr_base{0u};
 
+    enum class ReadConstType {
+        None = 0,
+        Immediate = 1 << 0,
+        Dynamic = 1 << 1,
+    };
+    ReadConstType readconst_types{};
+    IR::Type dma_types{IR::Type::Void};
+
     explicit Info(Stage stage_, LogicalStage l_stage_, ShaderParams params)
         : stage{stage_}, l_stage{l_stage_}, pgm_hash{params.hash}, pgm_base{params.Base()},
           user_data{params.user_data} {}
@@ -277,6 +286,7 @@ struct Info {
                     sizeof(tess_constants));
     }
 };
+DECLARE_ENUM_FLAG_OPERATORS(Info::ReadConstType);
 
 constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept {
     return inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
diff --git a/src/shader_recompiler/ir/abstract_syntax_list.cpp b/src/shader_recompiler/ir/abstract_syntax_list.cpp
new file mode 100644
index 000000000..0d967ac11
--- /dev/null
+++ b/src/shader_recompiler/ir/abstract_syntax_list.cpp
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "abstract_syntax_list.h"
+
+namespace Shader::IR {
+
+std::string DumpASLNode(const AbstractSyntaxNode& node,
+                        const std::map<const Block*, size_t>& block_to_index,
+                        const std::map<const Inst*, size_t>& inst_to_index) {
+    switch (node.type) {
+    case AbstractSyntaxNode::Type::Block:
+        return fmt::format("Block: ${}", block_to_index.at(node.data.block));
+    case AbstractSyntaxNode::Type::If:
+        return fmt::format("If: cond = %{}, body = ${}, merge = ${}",
+                           inst_to_index.at(node.data.if_node.cond.Inst()),
+                           block_to_index.at(node.data.if_node.body),
+                           block_to_index.at(node.data.if_node.merge));
+    case AbstractSyntaxNode::Type::EndIf:
+        return fmt::format("EndIf: merge = ${}", block_to_index.at(node.data.end_if.merge));
+    case AbstractSyntaxNode::Type::Loop:
+        return fmt::format("Loop: body = ${}, continue = ${}, merge = ${}",
+                           block_to_index.at(node.data.loop.body),
+                           block_to_index.at(node.data.loop.continue_block),
+                           block_to_index.at(node.data.loop.merge));
+    case AbstractSyntaxNode::Type::Repeat:
+        return fmt::format("Repeat: cond = %{}, header = ${}, merge = ${}",
+                           inst_to_index.at(node.data.repeat.cond.Inst()),
+                           block_to_index.at(node.data.repeat.loop_header),
+                           block_to_index.at(node.data.repeat.merge));
+    case AbstractSyntaxNode::Type::Break:
+        return fmt::format("Break: cond = %{}, merge = ${}, skip = ${}",
+                           inst_to_index.at(node.data.break_node.cond.Inst()),
+                           block_to_index.at(node.data.break_node.merge),
+                           block_to_index.at(node.data.break_node.skip));
+    case AbstractSyntaxNode::Type::Return:
+        return "Return";
+    case AbstractSyntaxNode::Type::Unreachable:
+        return "Unreachable";
+    };
+    UNREACHABLE();
+}
+
+} // namespace Shader::IR
\ No newline at end of file
diff --git a/src/shader_recompiler/ir/abstract_syntax_list.h b/src/shader_recompiler/ir/abstract_syntax_list.h
index 313a23abc..a620baccb 100644
--- a/src/shader_recompiler/ir/abstract_syntax_list.h
+++ b/src/shader_recompiler/ir/abstract_syntax_list.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <map>
 #include <vector>
 
 #include "shader_recompiler/ir/value.h"
@@ -53,4 +54,8 @@ struct AbstractSyntaxNode {
 };
 using AbstractSyntaxList = std::vector<AbstractSyntaxNode>;
 
+std::string DumpASLNode(const AbstractSyntaxNode& node,
+                        const std::map<const Block*, size_t>& block_to_index,
+                        const std::map<const Inst*, size_t>& inst_to_index);
+
 } // namespace Shader::IR
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index f53a0f4d4..d4759b32e 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include "shader_recompiler/ir/program.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 
 namespace Shader::Optimization {
 
@@ -79,14 +80,21 @@ void Visit(Info& info, const IR::Inst& inst) {
         info.uses_lane_id = true;
         break;
     case IR::Opcode::ReadConst:
-        if (!info.has_readconst) {
+        if (info.readconst_types == Info::ReadConstType::None) {
             info.buffers.push_back({
                 .used_types = IR::Type::U32,
-                .inline_cbuf = AmdGpu::Buffer::Null(),
-                .buffer_type = BufferType::ReadConstUbo,
+                // We can't guarantee that flatbuf will not grow past UBO
+                // limit if there are a lot of ReadConsts. (We could specialize)
+                .inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
+                .buffer_type = BufferType::Flatbuf,
             });
-            info.has_readconst = true;
         }
+        if (inst.Flags<u32>() != 0) {
+            info.readconst_types |= Info::ReadConstType::Immediate;
+        } else {
+            info.readconst_types |= Info::ReadConstType::Dynamic;
+        }
+        info.dma_types |= IR::Type::U32;
         break;
     case IR::Opcode::PackUfloat10_11_11:
         info.uses_pack_10_11_11 = true;
@@ -105,6 +113,21 @@ void CollectShaderInfoPass(IR::Program& program) {
             Visit(program.info, inst);
         }
     }
+
+    if (program.info.dma_types != IR::Type::Void) {
+        program.info.buffers.push_back({
+            .used_types = IR::Type::U64,
+            .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
+            .buffer_type = BufferType::BdaPagetable,
+            .is_written = true,
+        });
+        program.info.buffers.push_back({
+            .used_types = IR::Type::U8,
+            .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE),
+            .buffer_type = BufferType::FaultBuffer,
+            .is_written = true,
+        });
+    }
 }
 
 } // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir/program.cpp b/src/shader_recompiler/ir/program.cpp
index 7728a3ccb..f2f6e34fa 100644
--- a/src/shader_recompiler/ir/program.cpp
+++ b/src/shader_recompiler/ir/program.cpp
@@ -6,13 +6,30 @@
 
 #include <map>
 
+#include "common/config.h"
+#include "common/io_file.h"
+#include "common/path_util.h"
 #include "shader_recompiler/ir/basic_block.h"
 #include "shader_recompiler/ir/program.h"
 #include "shader_recompiler/ir/value.h"
 
 namespace Shader::IR {
 
-std::string DumpProgram(const Program& program) {
+void DumpProgram(const Program& program, const Info& info, const std::string& type) {
+    using namespace Common::FS;
+
+    if (!Config::dumpShaders()) {
+        return;
+    }
+
+    const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
+    if (!std::filesystem::exists(dump_dir)) {
+        std::filesystem::create_directories(dump_dir);
+    }
+    const auto ir_filename =
+        fmt::format("{}_{:#018x}.{}irprogram.txt", info.stage, info.pgm_hash, type);
+    const auto ir_file = IOFile{dump_dir / ir_filename, FileAccessMode::Write, FileType::TextFile};
+
     size_t index{0};
     std::map<const IR::Inst*, size_t> inst_to_index;
     std::map<const IR::Block*, size_t> block_to_index;
@@ -21,11 +38,20 @@ void DumpProgram(const Program& program, const Info& info, const std::string& type)
         block_to_index.emplace(block, index);
         ++index;
     }
-    std::string ret;
+
     for (const auto& block : program.blocks) {
-        ret += IR::DumpBlock(*block, block_to_index, inst_to_index, index) + '\n';
+        std::string s = IR::DumpBlock(*block, block_to_index, inst_to_index, index) + '\n';
+        ir_file.WriteString(s);
+    }
+
+    const auto asl_filename = fmt::format("{}_{:#018x}.{}asl.txt", info.stage, info.pgm_hash, type);
+    const auto asl_file =
+        IOFile{dump_dir / asl_filename, FileAccessMode::Write, FileType::TextFile};
+
+    for (const auto& node : program.syntax_list) {
+        std::string s = IR::DumpASLNode(node, block_to_index, inst_to_index) + '\n';
+        asl_file.WriteString(s);
+    }
-    return ret;
 }
 
 } // namespace Shader::IR
diff --git a/src/shader_recompiler/ir/program.h b/src/shader_recompiler/ir/program.h
index 84a1a2d40..3ffd4dc96 100644
--- a/src/shader_recompiler/ir/program.h
+++ b/src/shader_recompiler/ir/program.h
@@ -21,6 +21,6 @@ struct Program {
     Info& info;
 };
 
-[[nodiscard]] std::string DumpProgram(const Program& program);
+void DumpProgram(const Program& program, const Info& info, const std::string& type = "");
 
 } // namespace Shader::IR
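Note (illustration, not part of the patch): readconst_types is a bit-flag enum thanks to DECLARE_ENUM_FLAG_OPERATORS, so a single shader can be tagged with both forms and the backend then emits both helper functions. A sketch of the classification done by the collection pass above, using the project's True() flag helper:

    #include "shader_recompiler/info.h" // Shader::Info, DECLARE_ENUM_FLAG_OPERATORS

    using ReadConstType = Shader::Info::ReadConstType;

    ReadConstType types = ReadConstType::None;
    types |= ReadConstType::Immediate; // a ReadConst whose flatbuf offset is known
    types |= ReadConstType::Dynamic;   // a ReadConst with a runtime-only offset

    const bool emit_read_const = True(types & ReadConstType::Immediate);
    const bool emit_read_const_dynamic = True(types & ReadConstType::Dynamic);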
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index 3e0bd98d2..9f92857d6 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -85,6 +85,8 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
     Shader::Optimization::CollectShaderInfoPass(program);
 
+    Shader::IR::DumpProgram(program, info);
+
     return program;
 }
diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index d1cd98634..706e94c2b 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -133,6 +133,7 @@ void Liverpool::Process(std::stop_token stoken) {
                 VideoCore::EndCapture();
 
                 if (rasterizer) {
+                    rasterizer->ProcessFaults();
                     rasterizer->Flush();
                 }
                 submit_done = false;
diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h
index 9060074fb..89ac04f9a 100644
--- a/src/video_core/amdgpu/resource.h
+++ b/src/video_core/amdgpu/resource.h
@@ -37,6 +37,13 @@ struct Buffer {
         return buffer;
     }
 
+    static constexpr Buffer Placeholder(u32 size) {
+        Buffer buffer{};
+        buffer.base_address = 1;
+        buffer.num_records = size;
+        return buffer;
+    }
+
     bool Valid() const {
         return type == 0u;
     }
diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp
index 15ef746cd..15bf0d81e 100644
--- a/src/video_core/buffer_cache/buffer.cpp
+++ b/src/video_core/buffer_cache/buffer.cpp
@@ -70,8 +70,11 @@ UniqueBuffer::~UniqueBuffer() {
 
 void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
                           VmaAllocationInfo* out_alloc_info) {
+    const bool with_bda = bool(buffer_ci.usage & vk::BufferUsageFlagBits::eShaderDeviceAddress);
+    const VmaAllocationCreateFlags bda_flag =
+        with_bda ? VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT : 0;
     const VmaAllocationCreateInfo alloc_ci = {
-        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
+        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | bda_flag | MemoryUsageVmaFlags(usage),
         .usage = MemoryUsageVma(usage),
         .requiredFlags = 0,
         .preferredFlags = MemoryUsagePreferredVmaFlags(usage),
@@ -86,6 +89,15 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa
     ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
                vk::to_string(vk::Result{result}));
     buffer = vk::Buffer{unsafe_buffer};
+
+    if (with_bda) {
+        vk::BufferDeviceAddressInfo bda_info{
+            .buffer = buffer,
+        };
+        auto bda_result = device.getBufferAddress(bda_info);
+        ASSERT_MSG(bda_result != 0, "Failed to get buffer device address");
+        bda_addr = bda_result;
+    }
 }
 
 Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,
diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h
index 188b4b2ca..530968787 100644
--- a/src/video_core/buffer_cache/buffer.h
+++ b/src/video_core/buffer_cache/buffer.h
@@ -68,6 +68,7 @@ struct UniqueBuffer {
     VmaAllocator allocator;
     VmaAllocation allocation;
     vk::Buffer buffer{};
+    vk::DeviceAddress bda_addr = 0;
 };
 
 class Buffer {
@@ -115,6 +116,11 @@ public:
         return buffer;
     }
 
+    vk::DeviceAddress BufferDeviceAddress() const noexcept {
+        ASSERT_MSG(buffer.bda_addr != 0, "Can't get BDA from a non-BDA buffer");
+        return buffer.bda_addr;
+    }
+
     std::optional<vk::BufferMemoryBarrier2> GetBarrier(
         vk::Flags<vk::AccessFlagBits2> dst_access_mask, vk::PipelineStageFlagBits2 dst_stage,
         u32 offset = 0) {
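Note (illustration, not part of the patch): how the new accessor is meant to be used. A buffer created with vk::BufferUsageFlagBits::eShaderDeviceAddress gets its device address resolved and cached in UniqueBuffer::Create; querying any other buffer trips the assert. A sketch of publishing a mapped range into the BDA page table (illustrative helper; the real update lives in the buffer cache):

    #include "video_core/buffer_cache/buffer.h"

    // Writes the device address of each page of 'buffer' into the page table.
    void PublishPages(const VideoCore::Buffer& buffer, u64* bda_pagetable, u64 first_page,
                      u64 num_pages, u64 page_size) {
        const vk::DeviceAddress base = buffer.BufferDeviceAddress(); // asserts on non-BDA buffers
        for (u64 i = 0; i < num_pages; ++i) {
            bda_pagetable[first_page + i] = base + i * page_size;
        }
    }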
MemoryUsage::Download, DownloadBufferSize), gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize}, - memory_tracker{&tracker} { + bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, + 0, AllFlags, BDA_PAGETABLE_SIZE}, + fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE), + memory_tracker{tracker} { Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer"); + Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(), + "BDA Page Table Buffer"); + Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer"); // Ensure the first slot is used for the null buffer const auto null_id = @@ -35,15 +48,93 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s ASSERT(null_id.index == 0); const vk::Buffer& null_buffer = slot_buffers[null_id].buffer; Vulkan::SetObjectName(instance.GetDevice(), null_buffer, "Null Buffer"); + + // Prepare the fault buffer parsing pipeline + boost::container::static_vector bindings{ + { + .binding = 0, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + { + .binding = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + }; + + const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { + .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, + .bindingCount = static_cast(bindings.size()), + .pBindings = bindings.data(), + }; + auto [desc_layout_result, desc_layout] = + instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); + ASSERT_MSG(desc_layout_result == vk::Result::eSuccess, + "Failed to create descriptor set layout: {}", vk::to_string(desc_layout_result)); + fault_process_desc_layout = std::move(desc_layout); + + const auto& module = Vulkan::Compile(HostShaders::FAULT_BUFFER_PROCESS_COMP, + vk::ShaderStageFlagBits::eCompute, instance.GetDevice()); + Vulkan::SetObjectName(instance.GetDevice(), module, "Fault Buffer Parser"); + + const vk::SpecializationMapEntry specialization_map_entry = { + .constantID = 0, + .offset = 0, + .size = sizeof(u32), + }; + + const vk::SpecializationInfo specialization_info = { + .mapEntryCount = 1, + .pMapEntries = &specialization_map_entry, + .dataSize = sizeof(u32), + .pData = &CACHING_PAGEBITS, + }; + + const vk::PipelineShaderStageCreateInfo shader_ci = { + .stage = vk::ShaderStageFlagBits::eCompute, + .module = module, + .pName = "main", + .pSpecializationInfo = &specialization_info, + }; + + const vk::PipelineLayoutCreateInfo layout_info = { + .setLayoutCount = 1U, + .pSetLayouts = &(*fault_process_desc_layout), + }; + auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info); + ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}", + vk::to_string(layout_result)); + fault_process_pipeline_layout = std::move(layout); + + const vk::ComputePipelineCreateInfo pipeline_info = { + .stage = shader_ci, + .layout = *fault_process_pipeline_layout, + }; + auto [pipeline_result, pipeline] = + instance.GetDevice().createComputePipelineUnique({}, pipeline_info); + ASSERT_MSG(pipeline_result == vk::Result::eSuccess, "Failed to create compute pipeline: {}", + vk::to_string(pipeline_result)); + fault_process_pipeline = std::move(pipeline); + Vulkan::SetObjectName(instance.GetDevice(), *fault_process_pipeline, 
+ "Fault Buffer Parser Pipeline"); + + instance.GetDevice().destroyShaderModule(module); } BufferCache::~BufferCache() = default; -void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { +void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) { const bool is_tracked = IsRegionRegistered(device_addr, size); if (is_tracked) { // Mark the page as CPU modified to stop tracking writes. memory_tracker.MarkRegionAsCpuModified(device_addr, size); + + if (unmap) { + return; + } } } @@ -69,20 +160,20 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si if (total_size_bytes == 0) { return; } - const auto [staging, offset] = staging_buffer.Map(total_size_bytes); + const auto [download, offset] = download_buffer.Map(total_size_bytes); for (auto& copy : copies) { // Modify copies to have the staging offset in mind copy.dstOffset += offset; } - staging_buffer.Commit(); + download_buffer.Commit(); scheduler.EndRendering(); const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies); + cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies); scheduler.Finish(); for (const auto& copy : copies) { const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset; const u64 dst_offset = copy.dstOffset - offset; - std::memcpy(std::bit_cast(copy_device_addr), staging + dst_offset, copy.size); + std::memcpy(std::bit_cast(copy_device_addr), download + dst_offset, copy.size); } } @@ -206,58 +297,37 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo memcpy(std::bit_cast(address), value, num_bytes); return; } - scheduler.EndRendering(); - const Buffer* buffer = [&] { + Buffer* buffer = [&] { if (is_gds) { return &gds_buffer; } const BufferId buffer_id = FindBuffer(address, num_bytes); return &slot_buffers[buffer_id]; }(); - const auto cmdbuf = scheduler.CommandBuffer(); - const vk::BufferMemoryBarrier2 pre_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, - .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, - .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, - .buffer = buffer->Handle(), - .offset = buffer->Offset(address), - .size = num_bytes, - }; - const vk::BufferMemoryBarrier2 post_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, - .buffer = buffer->Handle(), - .offset = buffer->Offset(address), - .size = num_bytes, - }; - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &pre_barrier, - }); - // vkCmdUpdateBuffer can only copy up to 65536 bytes at a time. 
- static constexpr u32 UpdateBufferMaxSize = 65536; - const auto dst_offset = buffer->Offset(address); - for (u32 offset = 0; offset < num_bytes; offset += UpdateBufferMaxSize) { - const auto* update_src = static_cast(value) + offset; - const auto update_dst = dst_offset + offset; - const auto update_size = std::min(num_bytes - offset, UpdateBufferMaxSize); - cmdbuf.updateBuffer(buffer->Handle(), update_dst, update_size, update_src); + InlineDataBuffer(*buffer, address, value, num_bytes); +} + +void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { + ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); + if (!is_gds && !IsRegionRegistered(address, num_bytes)) { + memcpy(std::bit_cast(address), value, num_bytes); + return; } - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &post_barrier, - }); + Buffer* buffer = [&] { + if (is_gds) { + return &gds_buffer; + } + const BufferId buffer_id = FindBuffer(address, num_bytes); + return &slot_buffers[buffer_id]; + }(); + WriteDataBuffer(*buffer, address, value, num_bytes); } std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer, BufferId buffer_id) { // For small uniform buffers that have not been modified by gpu // use device local stream buffer to reduce renderpass breaks. + // Maybe we want to modify the threshold now that the page size is 16KB? static constexpr u64 StreamThreshold = CACHING_PAGESIZE; const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size); if (!is_written && size <= StreamThreshold && !is_gpu_dirty) { @@ -280,7 +350,7 @@ std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b std::pair BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) { // Check if any buffer contains the full requested range. 
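+    // With CACHING_PAGEBITS now 14, each page table entry covers 16 KiB; for
+    // example, gpu_addr 0x100004000 falls in page 0x100004000 >> 14 = 0x40001.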
const u64 page = gpu_addr >> CACHING_PAGEBITS; - const BufferId buffer_id = page_table[page]; + const BufferId buffer_id = page_table[page].buffer_id; if (buffer_id) { Buffer& buffer = slot_buffers[buffer_id]; if (buffer.IsInBounds(gpu_addr, size)) { @@ -300,24 +370,8 @@ std::pair BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, } bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) { - const VAddr end_addr = addr + size; - const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); - for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) { - const BufferId buffer_id = page_table[page]; - if (!buffer_id) { - ++page; - continue; - } - std::shared_lock lk{mutex}; - Buffer& buffer = slot_buffers[buffer_id]; - const VAddr buf_start_addr = buffer.CpuAddr(); - const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes(); - if (buf_start_addr < end_addr && addr < buf_end_addr) { - return true; - } - page = Common::DivCeil(buf_end_addr, CACHING_PAGESIZE); - } - return false; + // Check if we are missing some edge case here + return buffer_ranges.Intersects(addr, size); } bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) { @@ -333,7 +387,7 @@ BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) { return NULL_BUFFER_ID; } const u64 page = device_addr >> CACHING_PAGEBITS; - const BufferId buffer_id = page_table[page]; + const BufferId buffer_id = page_table[page].buffer_id; if (!buffer_id) { return CreateBuffer(device_addr, size); } @@ -379,7 +433,7 @@ BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 w } for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE); device_addr += CACHING_PAGESIZE) { - const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS]; + const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS].buffer_id; if (!overlap_id) { continue; } @@ -480,11 +534,21 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) { const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size); const u32 size = static_cast(overlap.end - overlap.begin); const BufferId new_buffer_id = [&] { - std::scoped_lock lk{mutex}; + std::scoped_lock lk{slot_buffers_mutex}; return slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin, - AllFlags, size); + AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size); }(); auto& new_buffer = slot_buffers[new_buffer_id]; + boost::container::small_vector bda_addrs; + const u64 start_page = overlap.begin >> CACHING_PAGEBITS; + const u64 size_pages = size >> CACHING_PAGEBITS; + bda_addrs.reserve(size_pages); + for (u64 i = 0; i < size_pages; ++i) { + vk::DeviceAddress addr = new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS); + bda_addrs.push_back(addr); + } + WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(), + bda_addrs.size() * sizeof(vk::DeviceAddress)); const size_t size_bytes = new_buffer.SizeBytes(); const auto cmdbuf = scheduler.CommandBuffer(); scheduler.EndRendering(); @@ -496,6 +560,129 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) { return new_buffer_id; } +void BufferCache::ProcessFaultBuffer() { + // Run fault processing shader + const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64)); + vk::BufferMemoryBarrier2 fault_buffer_barrier{ + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eShaderWrite, + .dstStageMask = 
vk::PipelineStageFlagBits2::eComputeShader,
+        .dstAccessMask = vk::AccessFlagBits2::eShaderRead,
+        .buffer = fault_buffer.Handle(),
+        .offset = 0,
+        .size = FAULT_BUFFER_SIZE,
+    };
+    vk::BufferMemoryBarrier2 download_barrier{
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
+        .dstAccessMask = vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eShaderWrite,
+        .buffer = download_buffer.Handle(),
+        .offset = offset,
+        .size = MaxPageFaults * sizeof(u64),
+    };
+    std::array barriers{fault_buffer_barrier, download_barrier};
+    vk::DescriptorBufferInfo fault_buffer_info{
+        .buffer = fault_buffer.Handle(),
+        .offset = 0,
+        .range = FAULT_BUFFER_SIZE,
+    };
+    vk::DescriptorBufferInfo download_info{
+        .buffer = download_buffer.Handle(),
+        .offset = offset,
+        .range = MaxPageFaults * sizeof(u64),
+    };
+    boost::container::small_vector<vk::WriteDescriptorSet, 2> writes{
+        {
+            .dstSet = VK_NULL_HANDLE,
+            .dstBinding = 0,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = vk::DescriptorType::eStorageBuffer,
+            .pBufferInfo = &fault_buffer_info,
+        },
+        {
+            .dstSet = VK_NULL_HANDLE,
+            .dstBinding = 1,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = vk::DescriptorType::eStorageBuffer,
+            .pBufferInfo = &download_info,
+        },
+    };
+    download_buffer.Commit();
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0);
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = barriers.data(),
+    });
+    cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline);
+    cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0,
+                                writes);
+    constexpr u32 num_threads = CACHING_NUMPAGES / 32; // 1 bit per page, 32 pages per thread
+    constexpr u32 num_workgroups = Common::DivCeil(num_threads, 64u);
+    cmdbuf.dispatch(num_workgroups, 1, 1);
+
+    // Reset fault buffer
+    const vk::BufferMemoryBarrier2 reset_pre_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
+        .srcAccessMask = vk::AccessFlagBits2::eShaderRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .buffer = fault_buffer.Handle(),
+        .offset = 0,
+        .size = FAULT_BUFFER_SIZE,
+    };
+    const vk::BufferMemoryBarrier2 reset_post_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
+        .buffer = fault_buffer.Handle(),
+        .offset = 0,
+        .size = FAULT_BUFFER_SIZE,
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &reset_pre_barrier,
+    });
+    cmdbuf.fillBuffer(fault_buffer.buffer, 0, FAULT_BUFFER_SIZE, 0);
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &reset_post_barrier,
+    });
+
+    // Defer creating buffers
+    scheduler.DeferOperation([this, mapped]() {
+        // Create the fault buffers batched
+        boost::icl::interval_set<VAddr> fault_ranges;
+        const u64* fault_ptr = std::bit_cast<const u64*>(mapped);
+        const u32 fault_count = static_cast<u32>(*(fault_ptr++));
+        for (u32 i = 0; i < fault_count; ++i) {
+            const VAddr fault = *(fault_ptr++);
+            const VAddr fault_end = fault + CACHING_PAGESIZE; // One page per fault for now; the granularity can be adjusted
+            fault_ranges +=
+                boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
+            LOG_INFO(Render_Vulkan, "Accessed non-GPU mapped memory at {:#x}", fault);
+        }
+        for (const auto& range : fault_ranges) {
+            const VAddr start = range.lower();
+            const VAddr end = range.upper();
+            const u64 page_start = start >> CACHING_PAGEBITS;
+            const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE);
+            // Buffer sizes are limited to 32 bits
+            ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits<u32>::max(),
+                       "Buffer size is too large");
+            CreateBuffer(start, static_cast<u32>(end - start));
+        }
+    });
+}
+
 void BufferCache::Register(BufferId buffer_id) {
     ChangeRegister(buffer_id);
 }
@@ -514,11 +701,16 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
     const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
     for (u64 page = page_begin; page != page_end; ++page) {
         if constexpr (insert) {
-            page_table[page] = buffer_id;
+            page_table[page].buffer_id = buffer_id;
         } else {
-            page_table[page] = BufferId{};
+            page_table[page].buffer_id = BufferId{};
         }
     }
+    if constexpr (insert) {
+        buffer_ranges.Add(buffer.CpuAddr(), buffer.SizeBytes(), buffer_id);
+    } else {
+        buffer_ranges.Subtract(buffer.CpuAddr(), buffer.SizeBytes());
+    }
 }
 
 void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
@@ -697,6 +889,138 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
     return true;
 }
 
+void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
+    if (device_addr == 0) {
+        return;
+    }
+    VAddr device_addr_end = device_addr + size;
+    ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
+        RENDERER_TRACE;
+        VAddr start = std::max(buffer.CpuAddr(), device_addr);
+        VAddr end = std::min(buffer.CpuAddr() + buffer.SizeBytes(), device_addr_end);
+        u32 size = static_cast<u32>(end - start);
+        SynchronizeBuffer(buffer, start, size, false);
+    });
+}
+
+void BufferCache::MemoryBarrier() {
+    // Vulkan cannot know which buffers a shader accesses when BufferDeviceAddress
+    // is used, so a full memory barrier is needed.
+    // For now, we only read memory through BDA. If we start writing to it,
+    // this might need to change.
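+    // Note: the ranged barriers in InlineDataBuffer/WriteDataBuffer only cover
+    // the destination buffer, while a BDA load may touch any cached buffer, so
+    // a global memory barrier is recorded here instead.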
+ scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + vk::MemoryBarrier2 barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }); +} + +void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, + u32 num_bytes) { + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + const vk::BufferMemoryBarrier2 pre_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer.Handle(), + .offset = buffer.Offset(address), + .size = num_bytes, + }; + const vk::BufferMemoryBarrier2 post_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, + .buffer = buffer.Handle(), + .offset = buffer.Offset(address), + .size = num_bytes, + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_barrier, + }); + // vkCmdUpdateBuffer can only copy up to 65536 bytes at a time. + static constexpr u32 UpdateBufferMaxSize = 65536; + const auto dst_offset = buffer.Offset(address); + for (u32 offset = 0; offset < num_bytes; offset += UpdateBufferMaxSize) { + const auto* update_src = static_cast(value) + offset; + const auto update_dst = dst_offset + offset; + const auto update_size = std::min(num_bytes - offset, UpdateBufferMaxSize); + cmdbuf.updateBuffer(buffer.Handle(), update_dst, update_size, update_src); + } + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_barrier, + }); +} + +void BufferCache::WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes) { + vk::BufferCopy copy = { + .srcOffset = 0, + .dstOffset = buffer.Offset(address), + .size = num_bytes, + }; + vk::Buffer src_buffer = staging_buffer.Handle(); + if (num_bytes < StagingBufferSize) { + const auto [staging, offset] = staging_buffer.Map(num_bytes); + std::memcpy(staging, value, num_bytes); + copy.srcOffset = offset; + staging_buffer.Commit(); + } else { + // For large one time transfers use a temporary host buffer. + // RenderDoc can lag quite a bit if the stream buffer is too large. 
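+        // The temporary buffer must outlive the GPU copy; DeferOperation below
+        // takes ownership by move, so the buffer is destroyed only once the
+        // scheduler reaches this submission's tick.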
+        Buffer temp_buffer{
+            instance, scheduler, MemoryUsage::Upload, 0, vk::BufferUsageFlagBits::eTransferSrc,
+            num_bytes};
+        src_buffer = temp_buffer.Handle();
+        u8* const staging = temp_buffer.mapped_data.data();
+        std::memcpy(staging, value, num_bytes);
+        scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
+    }
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    const vk::BufferMemoryBarrier2 pre_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .buffer = buffer.Handle(),
+        .offset = buffer.Offset(address),
+        .size = num_bytes,
+    };
+    const vk::BufferMemoryBarrier2 post_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
+        .buffer = buffer.Handle(),
+        .offset = buffer.Offset(address),
+        .size = num_bytes,
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &pre_barrier,
+    });
+    cmdbuf.copyBuffer(src_buffer, buffer.Handle(), copy);
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &post_barrier,
+    });
+}
+
 void BufferCache::DeleteBuffer(BufferId buffer_id) {
     Buffer& buffer = slot_buffers[buffer_id];
     Unregister(buffer_id);
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 71a6bed2a..2d6551a7f 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -38,14 +38,22 @@ class TextureCache;
 
 class BufferCache {
 public:
-    static constexpr u32 CACHING_PAGEBITS = 12;
+    static constexpr u32 CACHING_PAGEBITS = 14;
     static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
-    static constexpr u64 DEVICE_PAGESIZE = 4_KB;
+    static constexpr u64 DEVICE_PAGESIZE = 16_KB;
+    static constexpr u64 CACHING_NUMPAGES = u64{1} << (40 - CACHING_PAGEBITS);
+
+    static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
+    static constexpr u64 FAULT_BUFFER_SIZE = CACHING_NUMPAGES / 8; // One bit per page
+
+    struct PageData {
+        BufferId buffer_id{};
+    };
 
     struct Traits {
-        using Entry = BufferId;
+        using Entry = PageData;
         static constexpr size_t AddressSpaceBits = 40;
-        static constexpr size_t FirstLevelBits = 14;
+        static constexpr size_t FirstLevelBits = 16;
         static constexpr size_t PageBits = CACHING_PAGEBITS;
     };
     using PageTable = MultiLevelPageTable<Traits>;
@@ -59,8 +67,8 @@ public:
 
 public:
     explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
-                         AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
-                         PageManager& tracker);
+                         Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool,
+                         TextureCache& texture_cache, PageManager& tracker);
     ~BufferCache();
 
     /// Returns a pointer to GDS device local buffer.
@@ -73,13 +81,23 @@ public:
         return stream_buffer;
    }
 
+    /// Retrieves the device local BDA page table buffer.
+    [[nodiscard]] Buffer* GetBdaPageTableBuffer() noexcept {
+        return &bda_pagetable_buffer;
+    }
+
+    /// Retrieves the fault buffer.
+    [[nodiscard]] Buffer* GetFaultBuffer() noexcept {
+        return &fault_buffer;
+    }
+
     /// Retrieves the buffer with the specified id.
     [[nodiscard]] Buffer& GetBuffer(BufferId id) {
         return slot_buffers[id];
     }
 
     /// Invalidates any buffer in the logical page range.
-    void InvalidateMemory(VAddr device_addr, u64 size);
+    void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
 
     /// Binds host vertex buffers for the current draw.
     void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@@ -87,9 +105,12 @@ public:
     /// Bind host index buffer for the current draw.
     void BindIndexBuffer(u32 index_offset);
 
-    /// Writes a value to GPU buffer.
+    /// Writes a value to a GPU buffer (stages the data in the command buffer).
     void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
 
+    /// Writes a value to a GPU buffer (stages the data in the staging buffer).
+    void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
+
     /// Obtains a buffer for the specified region.
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
                                                        bool is_texel_buffer = false,
@@ -108,24 +129,29 @@ public:
     /// Return true when a CPU region is modified from the GPU
     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
 
-    [[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
+    /// Returns the buffer id for the specified region.
+    BufferId FindBuffer(VAddr device_addr, u32 size);
+
+    /// Processes the fault buffer.
+    void ProcessFaultBuffer();
+
+    /// Synchronizes all buffers in the specified range.
+    void SynchronizeBuffersInRange(VAddr device_addr, u64 size);
+
+    /// Synchronizes all buffers needed for DMA.
+    void SynchronizeDmaBuffers();
+
+    /// Records a memory barrier. Used for buffers when accessed via BDA.
+ void MemoryBarrier(); private: template void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) { - const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE); - for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) { - const BufferId buffer_id = page_table[page]; - if (!buffer_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[buffer_id]; - func(buffer_id, buffer); - - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, CACHING_PAGESIZE); - } + buffer_ranges.ForEachInRange(device_addr, size, + [&](u64 page_start, u64 page_end, BufferId id) { + Buffer& buffer = slot_buffers[id]; + func(id, buffer); + }); } void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size); @@ -134,7 +160,7 @@ private: void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); - [[nodiscard]] BufferId CreateBuffer(VAddr device_addr, u32 wanted_size); + BufferId CreateBuffer(VAddr device_addr, u32 wanted_size); void Register(BufferId buffer_id); @@ -147,21 +173,33 @@ private: bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size); + void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes); + + void WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes); + void DeleteBuffer(BufferId buffer_id); const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; + Vulkan::Rasterizer& rasterizer; AmdGpu::Liverpool* liverpool; TextureCache& texture_cache; PageManager& tracker; StreamBuffer staging_buffer; StreamBuffer stream_buffer; + StreamBuffer download_buffer; Buffer gds_buffer; - std::shared_mutex mutex; + Buffer bda_pagetable_buffer; + Buffer fault_buffer; + std::shared_mutex slot_buffers_mutex; Common::SlotVector slot_buffers; RangeSet gpu_modified_ranges; + SplitRangeMap buffer_ranges; MemoryTracker memory_tracker; PageTable page_table; + vk::UniqueDescriptorSetLayout fault_process_desc_layout; + vk::UniquePipeline fault_process_pipeline; + vk::UniquePipelineLayout fault_process_pipeline_layout; }; } // namespace VideoCore diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h index d9166b11c..c60aa9c80 100644 --- a/src/video_core/buffer_cache/memory_tracker_base.h +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -7,6 +7,7 @@ #include #include #include +#include "common/debug.h" #include "common/types.h" #include "video_core/buffer_cache/word_manager.h" @@ -19,11 +20,11 @@ public: static constexpr size_t MANAGER_POOL_SIZE = 32; public: - explicit MemoryTracker(PageManager* tracker_) : tracker{tracker_} {} + explicit MemoryTracker(PageManager& tracker_) : tracker{&tracker_} {} ~MemoryTracker() = default; /// Returns true if a region has been modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { return IteratePages( query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) { return manager->template IsRegionModified(offset, size); @@ -31,7 +32,7 @@ public: } /// Returns true if a region has been modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { return IteratePages( query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t 
size) { return manager->template IsRegionModified(offset, size); @@ -57,8 +58,7 @@ public: } /// Call 'func' for each CPU modified range and unmark those pages as CPU modified - template - void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) { IteratePages(query_cpu_range, query_size, [&func](RegionManager* manager, u64 offset, size_t size) { manager->template ForEachModifiedRange( @@ -67,17 +67,12 @@ public: } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified - template - void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + template + void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) { IteratePages(query_cpu_range, query_size, [&func](RegionManager* manager, u64 offset, size_t size) { - if constexpr (clear) { - manager->template ForEachModifiedRange( - manager->GetCpuAddr() + offset, size, func); - } else { - manager->template ForEachModifiedRange( - manager->GetCpuAddr() + offset, size, func); - } + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); }); } @@ -91,6 +86,7 @@ private: */ template bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { + RENDERER_TRACE; using FuncReturn = typename std::invoke_result::type; static constexpr bool BOOL_BREAK = std::is_same_v; std::size_t remaining_size{size}; diff --git a/src/video_core/buffer_cache/range_set.h b/src/video_core/buffer_cache/range_set.h index 2abf6e524..5c8e78c7c 100644 --- a/src/video_core/buffer_cache/range_set.h +++ b/src/video_core/buffer_cache/range_set.h @@ -3,7 +3,10 @@ #pragma once +#include #include +#include +#include #include #include #include @@ -38,6 +41,22 @@ struct RangeSet { m_ranges_set.subtract(interval); } + void Clear() { + m_ranges_set.clear(); + } + + bool Contains(VAddr base_address, size_t size) const { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + return boost::icl::contains(m_ranges_set, interval); + } + + bool Intersects(VAddr base_address, size_t size) const { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + return boost::icl::intersects(m_ranges_set, interval); + } + template void ForEach(Func&& func) const { if (m_ranges_set.empty()) { @@ -77,14 +96,29 @@ struct RangeSet { } } + template + void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const { + const VAddr end_addr = base_addr + size; + ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end) { + if (size_t gap_size = range_addr - base_addr; gap_size != 0) { + func(base_addr, gap_size); + } + base_addr = range_end; + }); + if (base_addr != end_addr) { + func(base_addr, end_addr - base_addr); + } + } + IntervalSet m_ranges_set; }; +template class RangeMap { public: using IntervalMap = - boost::icl::interval_map; using IntervalType = typename IntervalMap::interval_type; @@ -99,7 +133,7 @@ public: RangeMap(RangeMap&& other); RangeMap& operator=(RangeMap&& other); - void Add(VAddr base_address, size_t size, u64 value) { + void Add(VAddr base_address, size_t size, const T& value) { const VAddr end_address = base_address + size; IntervalType interval{base_address, end_address}; m_ranges_map.add({interval, value}); @@ -111,6 +145,35 @@ public: m_ranges_map -= interval; } + void Clear() { + m_ranges_map.clear(); + } + + bool Contains(VAddr base_address, size_t size) const { + const 
VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + return boost::icl::contains(m_ranges_map, interval); + } + + bool Intersects(VAddr base_address, size_t size) const { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + return boost::icl::intersects(m_ranges_map, interval); + } + + template + void ForEach(Func&& func) const { + if (m_ranges_map.empty()) { + return; + } + + for (const auto& [interval, value] : m_ranges_map) { + const VAddr inter_addr_end = interval.upper(); + const VAddr inter_addr = interval.lower(); + func(inter_addr, inter_addr_end, value); + } + } + template void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const { if (m_ranges_map.empty()) { @@ -140,7 +203,111 @@ public: template void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const { const VAddr end_addr = base_addr + size; - ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, u64) { + ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) { + if (size_t gap_size = range_addr - base_addr; gap_size != 0) { + func(base_addr, gap_size); + } + base_addr = range_end; + }); + if (base_addr != end_addr) { + func(base_addr, end_addr - base_addr); + } + } + +private: + IntervalMap m_ranges_map; +}; + +template +class SplitRangeMap { +public: + using IntervalMap = boost::icl::split_interval_map< + VAddr, T, boost::icl::total_absorber, std::less, boost::icl::inplace_identity, + boost::icl::inter_section, ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less), + RangeSetsAllocator>; + using IntervalType = typename IntervalMap::interval_type; + +public: + SplitRangeMap() = default; + ~SplitRangeMap() = default; + + SplitRangeMap(SplitRangeMap const&) = delete; + SplitRangeMap& operator=(SplitRangeMap const&) = delete; + + SplitRangeMap(SplitRangeMap&& other); + SplitRangeMap& operator=(SplitRangeMap&& other); + + void Add(VAddr base_address, size_t size, const T& value) { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + m_ranges_map.add({interval, value}); + } + + void Subtract(VAddr base_address, size_t size) { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + m_ranges_map -= interval; + } + + void Clear() { + m_ranges_map.clear(); + } + + bool Contains(VAddr base_address, size_t size) const { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + return boost::icl::contains(m_ranges_map, interval); + } + + bool Intersects(VAddr base_address, size_t size) const { + const VAddr end_address = base_address + size; + IntervalType interval{base_address, end_address}; + return boost::icl::intersects(m_ranges_map, interval); + } + + template + void ForEach(Func&& func) const { + if (m_ranges_map.empty()) { + return; + } + + for (const auto& [interval, value] : m_ranges_map) { + const VAddr inter_addr_end = interval.upper(); + const VAddr inter_addr = interval.lower(); + func(inter_addr, inter_addr_end, value); + } + } + + template + void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const { + if (m_ranges_map.empty()) { + return; + } + const VAddr start_address = base_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = m_ranges_map.lower_bound(search_interval); + if (it == m_ranges_map.end()) { + return; + } + auto 
end_it = m_ranges_map.upper_bound(search_interval); + for (; it != end_it; it++) { + VAddr inter_addr_end = it->first.upper(); + VAddr inter_addr = it->first.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end, it->second); + } + } + + template + void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const { + const VAddr end_addr = base_addr + size; + ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) { if (size_t gap_size = range_addr - base_addr; gap_size != 0) { func(base_addr, gap_size); } diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h index 5ad724f96..51a912c62 100644 --- a/src/video_core/buffer_cache/word_manager.h +++ b/src/video_core/buffer_cache/word_manager.h @@ -10,8 +10,10 @@ #ifdef __linux__ #include "common/adaptive_mutex.h" -#endif +#else #include "common/spin_lock.h" +#endif +#include "common/debug.h" #include "common/types.h" #include "video_core/page_manager.h" @@ -56,7 +58,7 @@ public: return cpu_addr; } - static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { + static constexpr u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { constexpr size_t number_bits = sizeof(u64) * 8; const size_t limit_page_end = number_bits - std::min(page_end, number_bits); u64 bits = (word >> page_start) << page_start; @@ -64,7 +66,7 @@ public: return bits; } - static std::pair GetWordPage(VAddr address) { + static constexpr std::pair GetWordPage(VAddr address) { const size_t converted_address = static_cast(address); const size_t word_number = converted_address / BYTES_PER_WORD; const size_t amount_pages = converted_address % BYTES_PER_WORD; @@ -73,6 +75,7 @@ public: template void IterateWords(size_t offset, size_t size, Func&& func) const { + RENDERER_TRACE; using FuncReturn = std::invoke_result_t; static constexpr bool BOOL_BREAK = std::is_same_v; const size_t start = static_cast(std::max(static_cast(offset), 0LL)); @@ -104,13 +107,13 @@ public: } } - template - void IteratePages(u64 mask, Func&& func) const { + void IteratePages(u64 mask, auto&& func) const { + RENDERER_TRACE; size_t offset = 0; while (mask != 0) { const size_t empty_bits = std::countr_zero(mask); offset += empty_bits; - mask = mask >> empty_bits; + mask >>= empty_bits; const size_t continuous_bits = std::countr_one(mask); func(offset, continuous_bits); @@ -155,8 +158,9 @@ public: * @param size Size in bytes of the CPU range to loop over * @param func Function to call for each turned off region */ - template - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, auto&& func) { + RENDERER_TRACE; std::scoped_lock lk{lock}; static_assert(type != Type::Untracked); @@ -170,6 +174,7 @@ public: (pending_pointer - pending_offset) * BYTES_PER_PAGE); }; IterateWords(offset, size, [&](size_t index, u64 mask) { + RENDERER_TRACE; if constexpr (type == Type::GPU) { mask &= ~untracked[index]; } @@ -177,14 +182,13 @@ public: if constexpr (clear) { if constexpr (type == Type::CPU) { UpdateProtection(index, untracked[index], mask); - } - state_words[index] &= ~mask; - if constexpr (type == Type::CPU) { untracked[index] &= ~mask; } + state_words[index] &= ~mask; } const size_t base_offset = index * PAGES_PER_WORD; IteratePages(word, [&](size_t pages_offset, size_t pages_size) { + RENDERER_TRACE; 
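+            // Runs of modified pages are accumulated into the pending range and
+            // flushed through release_pending(), so 'func' sees one call per
+            // contiguous region rather than one per word.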
const auto reset = [&]() { pending_offset = base_offset + pages_offset; pending_pointer = base_offset + pages_offset + pages_size; @@ -245,11 +249,13 @@ private: */ template void UpdateProtection(u64 word_index, u64 current_bits, u64 new_bits) const { + RENDERER_TRACE; + constexpr s32 delta = add_to_tracker ? 1 : -1; u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits; VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; IteratePages(changed_bits, [&](size_t offset, size_t size) { - tracker->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE, - add_to_tracker ? 1 : -1); + tracker->UpdatePageWatchers(addr + offset * BYTES_PER_PAGE, + size * BYTES_PER_PAGE); }); } diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 3001bf773..d52afe738 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -11,6 +11,7 @@ set(SHADER_FILES detilers/micro_32bpp.comp detilers/micro_64bpp.comp detilers/micro_8bpp.comp + fault_buffer_process.comp fs_tri.vert fsr.comp post_process.frag diff --git a/src/video_core/host_shaders/fault_buffer_process.comp b/src/video_core/host_shaders/fault_buffer_process.comp new file mode 100644 index 000000000..a712cf441 --- /dev/null +++ b/src/video_core/host_shaders/fault_buffer_process.comp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 +#extension GL_ARB_gpu_shader_int64 : enable + +layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +layout(std430, binding = 0) buffer input_buf { + uint fault_buffer[]; +}; + +layout(std430, binding = 1) buffer output_buf { + uint64_t download_buffer[]; +}; + +// Overlap for 32 bit atomics +layout(std430, binding = 1) buffer output_buf32 { + uint download_buffer32[]; +}; + +layout(constant_id = 0) const uint CACHING_PAGEBITS = 0; + +void main() { + uint id = gl_GlobalInvocationID.x; + uint word = fault_buffer[id]; + if (word == 0u) { + return; + } + // 1 page per bit + uint base_bit = id * 32u; + while (word != 0u) { + uint bit = findLSB(word); + word &= word - 1; + uint page = base_bit + bit; + uint store_index = atomicAdd(download_buffer32[0], 1u) + 1u; + // It is very unlikely, but should we check for overflow? 
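+        // If more than 1024 pages fault, the counter keeps incrementing but the
+        // extra entries are dropped below; the readback in ProcessFaultBuffer
+        // should clamp the count it reads to the same limit.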
+ if (store_index < 1024u) { // only support 1024 page faults + download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS; + } + } +} diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp index 47ed9e543..36145d0c5 100644 --- a/src/video_core/page_manager.cpp +++ b/src/video_core/page_manager.cpp @@ -1,11 +1,9 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include -#include "common/alignment.h" +#include #include "common/assert.h" -#include "common/error.h" +#include "common/debug.h" #include "common/signal_context.h" #include "core/memory.h" #include "core/signals.h" @@ -15,23 +13,60 @@ #ifndef _WIN64 #include #ifdef ENABLE_USERFAULTFD +#include #include #include #include #include +#include "common/error.h" #endif #else #include #endif +#ifdef __linux__ +#include "common/adaptive_mutex.h" +#else +#include "common/spin_lock.h" +#endif + namespace VideoCore { -constexpr size_t PAGESIZE = 4_KB; -constexpr size_t PAGEBITS = 12; +constexpr size_t PAGE_SIZE = 4_KB; +constexpr size_t PAGE_BITS = 12; -#ifdef ENABLE_USERFAULTFD struct PageManager::Impl { - Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} { + struct PageState { + u8 num_watchers{}; + + Core::MemoryPermission Perm() const noexcept { + return num_watchers == 0 ? Core::MemoryPermission::ReadWrite + : Core::MemoryPermission::Read; + } + + template + u8 AddDelta() { + if constexpr (delta == 1) { + return ++num_watchers; + } else { + ASSERT_MSG(num_watchers > 0, "Not enough watchers"); + return --num_watchers; + } + } + }; + + struct UpdateProtectRange { + VAddr addr; + u64 size; + Core::MemoryPermission perms; + }; + + static constexpr size_t ADDRESS_BITS = 40; + static constexpr size_t NUM_ADDRESS_PAGES = 1ULL << (40 - PAGE_BITS); + inline static Vulkan::Rasterizer* rasterizer; +#ifdef ENABLE_USERFAULTFD + Impl(Vulkan::Rasterizer* rasterizer_) { + rasterizer = rasterizer_; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); ASSERT_MSG(uffd != -1, "{}", Common::GetLastErrorMsg()); @@ -63,7 +98,8 @@ struct PageManager::Impl { ASSERT_MSG(ret != -1, "Uffdio unregister failed"); } - void Protect(VAddr address, size_t size, bool allow_write) { + void Protect(VAddr address, size_t size, Core::MemoryPermission perms) { + bool allow_write = True(perms & Core::MemoryPermission::Write); uffdio_writeprotect wp; wp.range.start = address; wp.range.len = size; @@ -118,12 +154,9 @@ struct PageManager::Impl { } } - Vulkan::Rasterizer* rasterizer; std::jthread ufd_thread; int uffd; -}; #else -struct PageManager::Impl { Impl(Vulkan::Rasterizer* rasterizer_) { rasterizer = rasterizer_; @@ -141,12 +174,11 @@ struct PageManager::Impl { // No-op } - void Protect(VAddr address, size_t size, bool allow_write) { + void Protect(VAddr address, size_t size, Core::MemoryPermission perms) { + RENDERER_TRACE; auto* memory = Core::Memory::Instance(); auto& impl = memory->GetAddressSpace(); - impl.Protect(address, size, - allow_write ? 
Core::MemoryPermission::ReadWrite - : Core::MemoryPermission::Read); + impl.Protect(address, size, perms); } static bool GuestFaultSignalHandler(void* context, void* fault_address) { @@ -157,23 +189,76 @@ struct PageManager::Impl { return false; } - inline static Vulkan::Rasterizer* rasterizer; -}; #endif + template + void UpdatePageWatchers(VAddr addr, u64 size) { + RENDERER_TRACE; + boost::container::small_vector update_ranges; + { + std::scoped_lock lk(lock); + + size_t page = addr >> PAGE_BITS; + auto perms = cached_pages[page].Perm(); + u64 range_begin = 0; + u64 range_bytes = 0; + + const auto release_pending = [&] { + if (range_bytes > 0) { + RENDERER_TRACE; + // Add pending (un)protect action + update_ranges.push_back({range_begin << PAGE_BITS, range_bytes, perms}); + range_bytes = 0; + } + }; + + // Iterate requested pages + const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); + for (; page != page_end; ++page) { + PageState& state = cached_pages[page]; + + // Apply the change to the page state + const u8 new_count = state.AddDelta(); + + // If the protection changed add pending (un)protect action + if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] { + release_pending(); + perms = new_perms; + } + + // If the page must be (un)protected, add it to the pending range + if ((new_count == 0 && delta < 0) || (new_count == 1 && delta > 0)) { + if (range_bytes == 0) { + range_begin = page; + } + range_bytes += PAGE_SIZE; + } else { + release_pending(); + } + } + + // Add pending (un)protect action + release_pending(); + } + + // Flush deferred protects + for (const auto& range : update_ranges) { + Protect(range.addr, range.size, range.perms); + } + } + + std::array cached_pages{}; +#ifdef __linux__ + Common::AdaptiveMutex lock; +#else + Common::SpinLock lock; +#endif +}; PageManager::PageManager(Vulkan::Rasterizer* rasterizer_) - : impl{std::make_unique(rasterizer_)}, rasterizer{rasterizer_} {} + : impl{std::make_unique(rasterizer_)} {} PageManager::~PageManager() = default; -VAddr PageManager::GetPageAddr(VAddr addr) { - return Common::AlignDown(addr, PAGESIZE); -} - -VAddr PageManager::GetNextPageAddr(VAddr addr) { - return Common::AlignUp(addr + 1, PAGESIZE); -} - void PageManager::OnGpuMap(VAddr address, size_t size) { impl->OnMap(address, size); } @@ -182,41 +267,12 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) { impl->OnUnmap(address, size); } -void PageManager::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) { - static constexpr u64 PageShift = 12; - - std::scoped_lock lk{lock}; - const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1; - const u64 page_start = addr >> PageShift; - const u64 page_end = page_start + num_pages; - - const auto pages_interval = - decltype(cached_pages)::interval_type::right_open(page_start, page_end); - if (delta > 0) { - cached_pages.add({pages_interval, delta}); - } - - const auto& range = cached_pages.equal_range(pages_interval); - for (const auto& [range, count] : boost::make_iterator_range(range)) { - const auto interval = range & pages_interval; - const VAddr interval_start_addr = boost::icl::first(interval) << PageShift; - const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift; - const u32 interval_size = interval_end_addr - interval_start_addr; - ASSERT_MSG(rasterizer->IsMapped(interval_start_addr, interval_size), - "Attempted to track non-GPU memory at address {:#x}, size {:#x}.", - interval_start_addr, interval_size); - if (delta > 0 && count == 
delta) { - impl->Protect(interval_start_addr, interval_size, false); - } else if (delta < 0 && count == -delta) { - impl->Protect(interval_start_addr, interval_size, true); - } else { - ASSERT(count >= 0); - } - } - - if (delta < 0) { - cached_pages.add({pages_interval, delta}); - } +template +void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const { + impl->UpdatePageWatchers(addr, size); } +template void PageManager::UpdatePageWatchers<1>(VAddr addr, u64 size) const; +template void PageManager::UpdatePageWatchers<-1>(VAddr addr, u64 size) const; + } // namespace VideoCore diff --git a/src/video_core/page_manager.h b/src/video_core/page_manager.h index f6bae9641..98dd099af 100644 --- a/src/video_core/page_manager.h +++ b/src/video_core/page_manager.h @@ -4,11 +4,7 @@ #pragma once #include -#include -#ifdef __linux__ -#include "common/adaptive_mutex.h" -#endif -#include "common/spin_lock.h" +#include "common/alignment.h" #include "common/types.h" namespace Vulkan { @@ -18,6 +14,9 @@ class Rasterizer; namespace VideoCore { class PageManager { + static constexpr size_t PAGE_BITS = 12; + static constexpr size_t PAGE_SIZE = 1ULL << PAGE_BITS; + public: explicit PageManager(Vulkan::Rasterizer* rasterizer); ~PageManager(); @@ -28,22 +27,23 @@ public: /// Unregister a range of gpu memory that was unmapped. void OnGpuUnmap(VAddr address, size_t size); - /// Increase/decrease the number of surface in pages touching the specified region - void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta); + /// Updates watches in the pages touching the specified region. + template + void UpdatePageWatchers(VAddr addr, u64 size) const; - static VAddr GetPageAddr(VAddr addr); - static VAddr GetNextPageAddr(VAddr addr); + /// Returns page aligned address. + static constexpr VAddr GetPageAddr(VAddr addr) { + return Common::AlignDown(addr, PAGE_SIZE); + } + + /// Returns address of the next page. 
+ static constexpr VAddr GetNextPageAddr(VAddr addr) { + return Common::AlignUp(addr + 1, PAGE_SIZE); + } private: struct Impl; std::unique_ptr impl; - Vulkan::Rasterizer* rasterizer; - boost::icl::interval_map cached_pages; -#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP - Common::AdaptiveMutex lock; -#else - Common::SpinLock lock; -#endif }; } // namespace VideoCore diff --git a/src/video_core/renderdoc.cpp b/src/video_core/renderdoc.cpp index b082fd1ca..4cf2ddd53 100644 --- a/src/video_core/renderdoc.cpp +++ b/src/video_core/renderdoc.cpp @@ -121,6 +121,7 @@ void SetOutputDir(const std::filesystem::path& path, const std::string& prefix) if (!rdoc_api) { return; } + LOG_WARNING(Common, "RenderDoc capture path: {}", (path / prefix).string()); rdoc_api->SetCaptureFilePathTemplate(fmt::UTF((path / prefix).u8string()).data.data()); } diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index e31b95844..9584329f0 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -147,6 +147,7 @@ Instance::Instance(Frontend::WindowSDL& window, s32 physical_device_index, available_extensions = GetSupportedExtensions(physical_device); format_properties = GetFormatProperties(physical_device); properties = physical_device.getProperties(); + memory_properties = physical_device.getMemoryProperties(); CollectDeviceParameters(); ASSERT_MSG(properties.apiVersion >= TargetVulkanApiVersion, "Vulkan {}.{} is required, but only {}.{} is supported by device!", @@ -375,6 +376,7 @@ bool Instance::CreateDevice() { .separateDepthStencilLayouts = vk12_features.separateDepthStencilLayouts, .hostQueryReset = vk12_features.hostQueryReset, .timelineSemaphore = vk12_features.timelineSemaphore, + .bufferDeviceAddress = vk12_features.bufferDeviceAddress, }, vk::PhysicalDeviceVulkan13Features{ .robustImageAccess = vk13_features.robustImageAccess, @@ -505,6 +507,7 @@ void Instance::CreateAllocator() { }; const VmaAllocatorCreateInfo allocator_info = { + .flags = VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT, .physicalDevice = physical_device, .device = *device, .pVulkanFunctions = &functions, diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 573473869..30848e8b7 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -286,6 +286,11 @@ public: return vk12_props; } + /// Returns the memory properties of the physical device. 
+ const vk::PhysicalDeviceMemoryProperties& GetMemoryProperties() const noexcept { + return memory_properties; + } + /// Returns true if shaders can declare the ClipDistance attribute bool IsShaderClipDistanceSupported() const { return features.shaderClipDistance; @@ -335,6 +340,7 @@ private: vk::PhysicalDevice physical_device; vk::UniqueDevice device; vk::PhysicalDeviceProperties properties; + vk::PhysicalDeviceMemoryProperties memory_properties; vk::PhysicalDeviceVulkan11Properties vk11_props; vk::PhysicalDeviceVulkan12Properties vk12_props; vk::PhysicalDevicePushDescriptorPropertiesKHR push_descriptor_props; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index e7b42a34b..4bdb08bf2 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) { Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, AmdGpu::Liverpool* liverpool_) : instance{instance_}, scheduler{scheduler_}, page_manager{this}, - buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager}, + buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager}, texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} { if (!Config::nullGpu()) { @@ -439,6 +439,13 @@ void Rasterizer::Finish() { scheduler.Finish(); } +void Rasterizer::ProcessFaults() { + if (fault_process_pending) { + fault_process_pending = false; + buffer_cache.ProcessFaultBuffer(); + } +} + bool Rasterizer::BindResources(const Pipeline* pipeline) { if (IsComputeMetaClear(pipeline)) { return false; @@ -449,6 +456,8 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { buffer_infos.clear(); image_infos.clear(); + bool uses_dma = false; + // Bind resource buffers and textures. Shader::Backend::Bindings binding{}; Shader::PushData push_data = MakeUserData(liverpool->regs); @@ -459,9 +468,28 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { stage->PushUd(binding, push_data); BindBuffers(*stage, binding, push_data); BindTextures(*stage, binding); + + uses_dma |= stage->dma_types != Shader::IR::Type::Void; } pipeline->BindResources(set_writes, buffer_barriers, push_data); + + if (uses_dma && !fault_process_pending) { + // We only use fault buffer for DMA right now. + { + // TODO: GPU might have written to memory (for example with EVENT_WRITE_EOP) + // we need to account for that and synchronize. 
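+            // A stage with dma_types set can read any mapped range through the
+            // BDA page table, so all of them must be resident and synchronized
+            // before the dispatch.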
+ Common::RecursiveSharedLock lock{mapped_ranges_mutex}; + for (auto& range : mapped_ranges) { + buffer_cache.SynchronizeBuffersInRange(range.lower(), + range.upper() - range.lower()); + } + } + buffer_cache.MemoryBarrier(); + } + + fault_process_pending |= uses_dma; + return true; } @@ -520,12 +548,18 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding if (desc.buffer_type == Shader::BufferType::GdsBuffer) { const auto* gds_buf = buffer_cache.GetGdsBuffer(); buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes()); - } else if (desc.buffer_type == Shader::BufferType::ReadConstUbo) { + } else if (desc.buffer_type == Shader::BufferType::Flatbuf) { auto& vk_buffer = buffer_cache.GetStreamBuffer(); const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32); const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size, instance.UniformMinAlignment()); buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size); + } else if (desc.buffer_type == Shader::BufferType::BdaPagetable) { + const auto* bda_buffer = buffer_cache.GetBdaPageTableBuffer(); + buffer_infos.emplace_back(bda_buffer->Handle(), 0, bda_buffer->SizeBytes()); + } else if (desc.buffer_type == Shader::BufferType::FaultBuffer) { + const auto* fault_buffer = buffer_cache.GetFaultBuffer(); + buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes()); } else if (desc.buffer_type == Shader::BufferType::SharedMemory) { auto& lds_buffer = buffer_cache.GetStreamBuffer(); const auto& cs_program = liverpool->GetCsRegs(); @@ -925,7 +959,7 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) { // Not GPU mapped memory, can skip invalidation logic entirely. return false; } - buffer_cache.InvalidateMemory(addr, size); + buffer_cache.InvalidateMemory(addr, size, false); texture_cache.InvalidateMemory(addr, size); return true; } @@ -937,24 +971,24 @@ bool Rasterizer::IsMapped(VAddr addr, u64 size) { } const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); - std::shared_lock lock{mapped_ranges_mutex}; + Common::RecursiveSharedLock lock{mapped_ranges_mutex}; return boost::icl::contains(mapped_ranges, range); } void Rasterizer::MapMemory(VAddr addr, u64 size) { { - std::unique_lock lock{mapped_ranges_mutex}; + std::scoped_lock lock{mapped_ranges_mutex}; mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); } page_manager.OnGpuMap(addr, size); } void Rasterizer::UnmapMemory(VAddr addr, u64 size) { - buffer_cache.InvalidateMemory(addr, size); + buffer_cache.InvalidateMemory(addr, size, true); texture_cache.UnmapMemory(addr, size); page_manager.OnGpuUnmap(addr, size); { - std::unique_lock lock{mapped_ranges_mutex}; + std::scoped_lock lock{mapped_ranges_mutex}; mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size); } } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 4e0ed0996..fb9ca4bbe 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -4,7 +4,7 @@ #pragma once #include - +#include "common/recursive_lock.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" @@ -65,11 +65,21 @@ public: void CpSync(); u64 Flush(); void Finish(); + void ProcessFaults(); PipelineCache& GetPipelineCache() { return pipeline_cache; } + template + void 
ForEachMappedRangeInRange(VAddr addr, u64 size, Func&& func) {
+        const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
+        Common::RecursiveSharedLock lock{mapped_ranges_mutex};
+        for (const auto& mapped_range : (mapped_ranges & range)) {
+            func(mapped_range);
+        }
+    }
+
 private:
     RenderState PrepareRenderState(u32 mrt_mask);
     void BeginRendering(const GraphicsPipeline& pipeline, RenderState& state);
@@ -100,6 +110,8 @@ private:
     bool IsComputeMetaClear(const Pipeline* pipeline);
 
 private:
+    friend class VideoCore::BufferCache;
+
     const Instance& instance;
     Scheduler& scheduler;
     VideoCore::PageManager page_manager;
@@ -126,6 +138,7 @@ private:
     boost::container::static_vector buffer_bindings;
     using ImageBindingInfo = std::pair;
     boost::container::static_vector image_bindings;
+    bool fault_process_pending{false};
 };
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 8d4188a22..e75a69924 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -70,6 +70,11 @@ void Scheduler::Flush(SubmitInfo& info) {
     SubmitExecution(info);
 }
 
+void Scheduler::Flush() {
+    SubmitInfo info{};
+    Flush(info);
+}
+
 void Scheduler::Finish() {
     // When finishing, we need to wait for the submission to have executed on the device.
     const u64 presubmit_tick = CurrentTick();
@@ -85,6 +90,15 @@ void Scheduler::Wait(u64 tick) {
         Flush(info);
     }
     master_semaphore.Wait(tick);
+
+    // CAUTION: This can introduce unexpected variation in the wait time.
+    // We don't currently sync the GPU, and some games are very sensitive to this.
+    // If this becomes a problem, it can be commented out.
+    // Ideally we would implement proper GPU sync.
+    while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) {
+        pending_ops.front().callback();
+        pending_ops.pop();
+    }
 }
 
 void Scheduler::AllocateWorkerCommandBuffers() {
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 7709e1d41..c30fc6e0e 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -307,6 +307,10 @@ public:
     /// and increments the scheduler timeline semaphore.
     void Flush(SubmitInfo& info);
 
+    /// Sends the current execution context to the GPU
+    /// and increments the scheduler timeline semaphore.
+    void Flush();
+
     /// Sends the current execution context to the GPU and waits for it to complete.
void Finish(); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 82f4d6413..409085511 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -672,7 +672,7 @@ void TextureCache::TrackImage(ImageId image_id) { // Re-track the whole image image.track_addr = image_begin; image.track_addr_end = image_end; - tracker.UpdatePagesCachedCount(image_begin, image.info.guest_size, 1); + tracker.UpdatePageWatchers<1>(image_begin, image.info.guest_size); } else { if (image_begin < image.track_addr) { TrackImageHead(image_id); @@ -695,7 +695,7 @@ void TextureCache::TrackImageHead(ImageId image_id) { ASSERT(image.track_addr != 0 && image_begin < image.track_addr); const auto size = image.track_addr - image_begin; image.track_addr = image_begin; - tracker.UpdatePagesCachedCount(image_begin, size, 1); + tracker.UpdatePageWatchers<1>(image_begin, size); } void TextureCache::TrackImageTail(ImageId image_id) { @@ -711,7 +711,7 @@ void TextureCache::TrackImageTail(ImageId image_id) { const auto addr = image.track_addr_end; const auto size = image_end - image.track_addr_end; image.track_addr_end = image_end; - tracker.UpdatePagesCachedCount(addr, size, 1); + tracker.UpdatePageWatchers<1>(addr, size); } void TextureCache::UntrackImage(ImageId image_id) { @@ -724,7 +724,7 @@ void TextureCache::UntrackImage(ImageId image_id) { image.track_addr = 0; image.track_addr_end = 0; if (size != 0) { - tracker.UpdatePagesCachedCount(addr, size, -1); + tracker.UpdatePageWatchers<-1>(addr, size); } } @@ -743,7 +743,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) { // Cehck its hash later. MarkAsMaybeDirty(image_id, image); } - tracker.UpdatePagesCachedCount(image_begin, size, -1); + tracker.UpdatePageWatchers<-1>(image_begin, size); } void TextureCache::UntrackImageTail(ImageId image_id) { @@ -762,7 +762,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) { // Cehck its hash later. MarkAsMaybeDirty(image_id, image); } - tracker.UpdatePagesCachedCount(addr, size, -1); + tracker.UpdatePageWatchers<-1>(addr, size); } void TextureCache::DeleteImage(ImageId image_id) {
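
For reference, the address translation that the BDA page table enables works as follows: CreateBuffer writes one vk::DeviceAddress per 16 KiB page, so a guest virtual address resolves with a single table load plus the offset within the page. Below is a minimal sketch of that lookup, assuming a zeroed entry marks a non-resident page; the real GPU-side lookup is emitted by the shader recompiler, which is outside this diff, and TranslateVAddr is an illustrative name, not a function added by this change.

    // Sketch only: mirrors the per-page entries written in CreateBuffer above.
    u64 TranslateVAddr(const u64* bda_pagetable, u64 vaddr) {
        const u64 page = vaddr >> 14;         // CACHING_PAGEBITS
        const u64 base = bda_pagetable[page]; // device address of the page start
        if (base == 0) {
            // Assumed fault path: the shader sets bit 'page' in the fault
            // buffer and ProcessFaultBuffer later creates a backing buffer.
            return 0;
        }
        return base + (vaddr & ((u64{1} << 14) - 1)); // offset within the page
    }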