Mirror of https://github.com/shadps4-emu/shadPS4.git, synced 2025-06-26 04:16:18 +00:00
shader_recompiler: Remove special case buffers and add support for aliasing (#2428)
* shader_recompiler: Move shared mem lowering into emitter
  * IR can be quite verbose during the first stages of translation, before the SSA and constant-propagation passes have run and drastically simplified it. This lowering can also be done during emission, so do it there to save some compilation time.
* runtime_info: Pack PsColorBuffer into 8 bytes
  * Drops the size of the total structure by half, from 396 to 204 bytes. It should also make comparing the array a bit faster, since that comparison is a hot path done on every draw.
* emit_spirv_context: Add infrastructure for buffer aliases
  * Splits out the buffer creation function so it can be reused when defining multiple type aliases.
* shader_recompiler: Merge srt_flatbuf into the buffers list
  * It is no longer a special case.
* shader_recompiler: Complete buffer aliasing support
  * Adds more types to buffers, such as F32 for float reads/writes and 8/16-bit integer types for formatted buffers.
* shader_recompiler: Remove existing shared memory emulation
  * The current implementation relies on backend-side support and hooks into every shared memory access. It also does not handle atomics. It will be replaced by an IR pass that solves these issues.
* shader_recompiler: Reintroduce shared memory on SSBO emulation
  * It is now performed with an IR pass and, combined with the cleanup in the previous commit, is fully transparent to the backend, other than requiring workgroup_index to be provided as an attribute (computing it on every shared memory access would be too verbose). See the address-mapping sketch below.
* clang format
* buffer_cache: Reduce buffer sizes
* vk_rasterizer: Clean up resource binding code
  * Reduce noise in the functions; also remove some arguments that are class members.
* Fix gcc
This commit is contained in: parent 290e127a4f, commit 82cacec8eb
36 changed files with 675 additions and 625 deletions
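The SSBO emulation described above reduces to an address rewrite, performed by the new IR pass further down in this diff. A minimal standalone sketch of that mapping follows; the helper name and parameters are illustrative only and are not part of the codebase.

// Illustrative sketch, not shadPS4 code: each workgroup owns one shared_memory_size-byte
// slice of the fallback storage buffer, selected by the new WorkgroupIndex attribute.
#include <cstdint>

constexpr std::uint32_t SsboSharedAddress(std::uint32_t workgroup_index,
                                          std::uint32_t shared_memory_size,
                                          std::uint32_t lds_byte_offset) {
    // Mirrors the pass below: address = WorkgroupIndex * shared_memory_size + original offset.
    return workgroup_index * shared_memory_size + lds_byte_offset;
}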
@@ -69,16 +69,17 @@ enum class Attribute : u64 {
     SampleIndex = 72,
     GlobalInvocationId = 73,
     WorkgroupId = 74,
-    LocalInvocationId = 75,
-    LocalInvocationIndex = 76,
-    FragCoord = 77,
-    InstanceId0 = 78, // step rate 0
-    InstanceId1 = 79, // step rate 1
-    InvocationId = 80, // TCS id in output patch and instanced geometry shader id
-    PatchVertices = 81,
-    TessellationEvaluationPointU = 82,
-    TessellationEvaluationPointV = 83,
-    PackedHullInvocationInfo = 84, // contains patch id within the VGT and invocation ID
+    WorkgroupIndex = 75,
+    LocalInvocationId = 76,
+    LocalInvocationIndex = 77,
+    FragCoord = 78,
+    InstanceId0 = 79, // step rate 0
+    InstanceId1 = 80, // step rate 1
+    InvocationId = 81, // TCS id in output patch and instanced geometry shader id
+    PatchVertices = 82,
+    TessellationEvaluationPointU = 83,
+    TessellationEvaluationPointV = 84,
+    PackedHullInvocationInfo = 85, // contains patch id within the VGT and invocation ID
     Max,
 };
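The hunk above only declares the new WorkgroupIndex attribute; the diff does not show how the backend computes it. One plausible derivation, assuming the usual row-major flattening of the 3D workgroup ID over the dispatch dimensions (a sketch, not necessarily what the emitter emits):

// Hypothetical flattening of the 3D workgroup ID into a single WorkgroupIndex,
// assuming row-major order; names and layout are assumptions for illustration.
#include <array>
#include <cstdint>

constexpr std::uint32_t FlattenWorkgroupId(const std::array<std::uint32_t, 3>& group_id,
                                           const std::array<std::uint32_t, 3>& num_groups) {
    return group_id[0] + num_groups[0] * (group_id[1] + num_groups[1] * group_id[2]);
}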
@@ -20,12 +20,14 @@ void FlattenExtendedUserdataPass(IR::Program& program);
 void ResourceTrackingPass(IR::Program& program);
 void CollectShaderInfoPass(IR::Program& program);
 void LowerBufferFormatToRaw(IR::Program& program);
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info);
 void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
                            Stage stage);
 void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);
 void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
-void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile);
+void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
+                             const Profile& profile);
+void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
+                               const Profile& profile);
 
 } // namespace Shader::Optimization
@@ -1,81 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <unordered_map>
-
-#include "shader_recompiler/ir/ir_emitter.h"
-#include "shader_recompiler/ir/program.h"
-
-namespace Shader::Optimization {
-
-static bool IsSharedMemoryInst(const IR::Inst& inst) {
-    const auto opcode = inst.GetOpcode();
-    return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 ||
-           opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64;
-}
-
-static u32 GetSharedMemImmOffset(const IR::Inst& inst) {
-    const auto* address = inst.Arg(0).InstRecursive();
-    ASSERT(address->GetOpcode() == IR::Opcode::IAdd32);
-    const auto ir_offset = address->Arg(1);
-    ASSERT_MSG(ir_offset.IsImmediate());
-    const auto offset = ir_offset.U32();
-    // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between
-    // each register to account for 4 bytes per register times 64 threads per group. Ensure that
-    // this assumption holds, as if it does not this approach may need to be revised.
-    ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
-    return offset;
-}
-
-static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) {
-    switch (inst.GetOpcode()) {
-    case IR::Opcode::LoadSharedU32:
-        inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr));
-        break;
-    case IR::Opcode::LoadSharedU64:
-        inst.ReplaceUsesWithAndRemove(
-            ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1)));
-        break;
-    case IR::Opcode::WriteSharedU32:
-        ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)});
-        inst.Invalidate();
-        break;
-    case IR::Opcode::WriteSharedU64: {
-        const auto value = inst.Arg(1);
-        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)});
-        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 1)});
-        inst.Invalidate();
-        break;
-    }
-    default:
-        UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode());
-    }
-}
-
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) {
-    u32 next_vgpr_num = runtime_info.num_allocated_vgprs;
-    std::unordered_map<u32, IR::VectorReg> vgpr_map;
-    const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) {
-        const auto [it, is_new] = vgpr_map.try_emplace(offset);
-        if (is_new) {
-            ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
-            const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
-            it->second = new_vgpr;
-        }
-        return it->second;
-    };
-
-    for (IR::Block* const block : program.blocks) {
-        for (IR::Inst& inst : block->Instructions()) {
-            if (!IsSharedMemoryInst(inst)) {
-                continue;
-            }
-            const auto offset = GetSharedMemImmOffset(inst);
-            const auto vgpr = get_vgpr(offset);
-            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-            ConvertSharedMemToVgpr(ir, inst, vgpr);
-        }
-    }
-}
-
-} // namespace Shader::Optimization
@@ -78,7 +78,20 @@ bool IsDataRingInstruction(const IR::Inst& inst) {
 }
 
 IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
-    return IR::Type::U32;
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadBufferU8:
+    case IR::Opcode::StoreBufferU8:
+        return IR::Type::U8;
+    case IR::Opcode::LoadBufferU16:
+    case IR::Opcode::StoreBufferU16:
+        return IR::Type::U16;
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::StoreBufferFormatF32:
+        // Formatted buffer loads can use a variety of types.
+        return IR::Type::U32 | IR::Type::F32 | IR::Type::U16 | IR::Type::U8;
+    default:
+        return IR::Type::U32;
+    }
 }
 
 bool IsImageAtomicInstruction(const IR::Inst& inst) {
@@ -121,11 +134,9 @@ public:
 
     u32 Add(const BufferResource& desc) {
         const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
-            // Only one GDS binding can exist.
-            if (desc.is_gds_buffer && existing.is_gds_buffer) {
-                return true;
-            }
-            return desc.sharp_idx == existing.sharp_idx && desc.inline_cbuf == existing.inline_cbuf;
+            return desc.sharp_idx == existing.sharp_idx &&
+                   desc.inline_cbuf == existing.inline_cbuf &&
+                   desc.buffer_type == existing.buffer_type;
         })};
         auto& buffer = buffer_resources[index];
         buffer.used_types |= desc.used_types;
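The merge above is what feeds the aliasing machinery: accesses that resolve to the same sharp are folded into one resource, and its used_types accumulates every access type so the SPIR-V context can later declare one alias per type. A stripped-down sketch of that accumulation, using placeholder types rather than the real shader_recompiler definitions:

// Placeholder types for illustration; the real IR::Type and BufferResource live in shadPS4.
#include <cstdint>

enum class Type : std::uint32_t { U8 = 1u << 0, U16 = 1u << 1, U32 = 1u << 2, F32 = 1u << 3 };
constexpr Type operator|(Type a, Type b) {
    return static_cast<Type>(static_cast<std::uint32_t>(a) | static_cast<std::uint32_t>(b));
}

struct BufferResource {
    std::uint32_t sharp_idx{};
    Type used_types{};
};

// Two accesses to the same sharp (say a raw U32 load and an F32 formatted load) collapse into
// one binding whose used_types carries both bits, later emitted as two aliases of one buffer.
inline void MergeAccess(BufferResource& existing, Type access_type) {
    existing.used_types = existing.used_types | access_type;
}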
@@ -272,6 +283,7 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
         .sharp_idx = std::numeric_limits<u32>::max(),
         .used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
         .inline_cbuf = cbuf,
+        .buffer_type = BufferType::Guest,
     });
 }
 
@@ -286,6 +298,7 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
     binding = descriptors.Add(BufferResource{
         .sharp_idx = sharp,
         .used_types = BufferDataType(inst, buffer.GetNumberFmt()),
+        .buffer_type = BufferType::Guest,
         .is_written = IsBufferStore(inst),
         .is_formatted = inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
                         inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
@@ -402,13 +415,10 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
 }
 
 void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
-    // Insert gds binding in the shader if it doesn't exist already.
-    // The buffer is used for append/consume counters.
-    constexpr static AmdGpu::Buffer GdsSharp{.base_address = 1};
     const u32 binding = descriptors.Add(BufferResource{
         .used_types = IR::Type::U32,
-        .inline_cbuf = GdsSharp,
-        .is_gds_buffer = true,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::GdsBuffer,
         .is_written = true,
     });
 
@@ -420,12 +430,12 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
     };
 
     // Attempt to deduce the GDS address of counter at compile time.
-    const u32 gds_addr = [&] {
-        const IR::Value& gds_offset = inst.Arg(0);
-        if (gds_offset.IsImmediate()) {
-            // Nothing to do, offset is known.
-            return gds_offset.U32() & 0xFFFF;
-        }
+    u32 gds_addr = 0;
+    const IR::Value& gds_offset = inst.Arg(0);
+    if (gds_offset.IsImmediate()) {
+        // Nothing to do, offset is known.
+        gds_addr = gds_offset.U32() & 0xFFFF;
+    } else {
         const auto result = IR::BreadthFirstSearch(&inst, pred);
         ASSERT_MSG(result, "Unable to track M0 source");
 
@@ -436,8 +446,8 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
         if (prod->GetOpcode() == IR::Opcode::IAdd32) {
             m0_val += prod->Arg(1).U32();
         }
-        return m0_val & 0xFFFF;
-    }();
+        gds_addr = m0_val & 0xFFFF;
+    }
 
     // Patch instruction.
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@@ -74,7 +74,14 @@ void Visit(Info& info, const IR::Inst& inst) {
         info.uses_lane_id = true;
         break;
     case IR::Opcode::ReadConst:
-        info.has_readconst = true;
+        if (!info.has_readconst) {
+            info.buffers.push_back({
+                .used_types = IR::Type::U32,
+                .inline_cbuf = AmdGpu::Buffer::Null(),
+                .buffer_type = BufferType::ReadConstUbo,
+            });
+            info.has_readconst = true;
+        }
         break;
     case IR::Opcode::PackUfloat10_11_11:
         info.uses_pack_10_11_11 = true;
@@ -88,10 +95,9 @@ void Visit(Info& info, const IR::Inst& inst) {
 }
 
 void CollectShaderInfoPass(IR::Program& program) {
-    Info& info{program.info};
     for (IR::Block* const block : program.post_order_blocks) {
         for (IR::Inst& inst : block->Instructions()) {
-            Visit(info, inst);
+            Visit(program.info, inst);
         }
     }
 }
@@ -8,37 +8,46 @@
 
 namespace Shader::Optimization {
 
+static bool IsLoadShared(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
+           inst.GetOpcode() == IR::Opcode::LoadSharedU64;
+}
+
+static bool IsWriteShared(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
+           inst.GetOpcode() == IR::Opcode::WriteSharedU64;
+}
+
+// Inserts barriers when a shared memory write and read occur in the same basic block.
 static void EmitBarrierInBlock(IR::Block* block) {
-    // This is inteded to insert a barrier when shared memory write and read
-    // occur in the same basic block. Also checks if branch depth is zero as
-    // we don't want to insert barrier in potentially divergent code.
-    bool emit_barrier_on_write = false;
-    bool emit_barrier_on_read = false;
-    const auto emit_barrier = [block](bool& emit_cond, IR::Inst& inst) {
-        if (emit_cond) {
-            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-            ir.Barrier();
-            emit_cond = false;
-        }
+    enum class BarrierAction : u32 {
+        None,
+        BarrierOnWrite,
+        BarrierOnRead,
     };
+    BarrierAction action{};
     for (IR::Inst& inst : block->Instructions()) {
-        if (inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
-            inst.GetOpcode() == IR::Opcode::LoadSharedU64) {
-            emit_barrier(emit_barrier_on_read, inst);
-            emit_barrier_on_write = true;
+        if (IsLoadShared(inst)) {
+            if (action == BarrierAction::BarrierOnRead) {
+                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+                ir.Barrier();
+            }
+            action = BarrierAction::BarrierOnWrite;
             continue;
         }
-        if (inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
-            inst.GetOpcode() == IR::Opcode::WriteSharedU64) {
-            emit_barrier(emit_barrier_on_write, inst);
-            emit_barrier_on_read = true;
+        if (IsWriteShared(inst)) {
+            if (action == BarrierAction::BarrierOnWrite) {
+                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+                ir.Barrier();
+            }
+            action = BarrierAction::BarrierOnRead;
         }
     }
 }
 
+// Inserts a barrier after divergent conditional blocks to avoid undefined
+// behavior when some threads write and others read from shared memory.
 static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
-    // Insert a barrier after divergent conditional blocks.
-    // This avoids potential softlocks and crashes when some threads
-    // initialize shared memory and others read from it.
     const IR::U1 cond = data.if_node.cond;
     const auto insert_barrier =
         IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
@@ -56,8 +65,21 @@ static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
     }
 }
 
-void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
-    if (!program.info.uses_shared || !profile.needs_lds_barriers) {
+static constexpr u32 GcnSubgroupSize = 64;
+
+void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
+                             const Profile& profile) {
+    if (program.info.stage != Stage::Compute) {
+        return;
+    }
+    const auto& cs_info = runtime_info.cs_info;
+    const u32 shared_memory_size = cs_info.shared_memory_size;
+    const u32 threadgroup_size =
+        cs_info.workgroup_size[0] * cs_info.workgroup_size[1] * cs_info.workgroup_size[2];
+    // The compiler can only omit barriers when the local workgroup size is the same as the HW
+    // subgroup.
+    if (shared_memory_size == 0 || threadgroup_size != GcnSubgroupSize ||
+        !profile.needs_lds_barriers) {
         return;
     }
     using Type = IR::AbstractSyntaxNode::Type;
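As a concrete reading of the early-out added in this hunk: an 8x8x1 threadgroup has exactly 64 invocations and matches the 64-wide GCN wavefront, so the guest compiler may have omitted barriers and the pass keeps running in order to insert them around divergent blocks; a 16x16x1 threadgroup spans multiple wavefronts, meaning the guest shader already had to carry explicit barriers, and the pass returns early.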
@@ -67,6 +89,8 @@ void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
             --branch_depth;
             continue;
         }
+        // Check if branch depth is zero, we don't want to insert barrier in potentially divergent
+        // code.
         if (node.type == Type::If && branch_depth++ == 0) {
             EmitBarrierInMergeBlock(node.data);
             continue;
@@ -0,0 +1,117 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/profile.h"
+
+namespace Shader::Optimization {
+
+static bool IsSharedAccess(const IR::Inst& inst) {
+    const auto opcode = inst.GetOpcode();
+    switch (opcode) {
+    case IR::Opcode::LoadSharedU32:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::WriteSharedU64:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicXor32:
+        return true;
+    default:
+        return false;
+    }
+}
+
+void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
+                               const Profile& profile) {
+    if (program.info.stage != Stage::Compute) {
+        return;
+    }
+    // Only perform the transform if the host shared memory is insufficient.
+    const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
+    if (shared_memory_size <= profile.max_shared_memory_size) {
+        return;
+    }
+    // Add buffer binding for shared memory storage buffer.
+    const u32 binding = static_cast<u32>(program.info.buffers.size());
+    program.info.buffers.push_back({
+        .used_types = IR::Type::U32,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            const IR::U32 handle = ir.Imm32(binding);
+            // Replace shared atomics first
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::SharedAtomicAnd32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicIAdd32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicOr32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicOr(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMax(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMin(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicXor32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicXor(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            default:
+                break;
+            }
+            // Replace shared operations.
+            const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
+                                           ir.Imm32(shared_memory_size));
+            const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU32:
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
+                break;
+            case IR::Opcode::LoadSharedU64:
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {}));
+                break;
+            case IR::Opcode::WriteSharedU32:
+                ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
+                inst.Invalidate();
+                break;
+            case IR::Opcode::WriteSharedU64:
+                ir.StoreBufferU32(2, handle, address, inst.Arg(1), {});
+                inst.Invalidate();
+                break;
+            default:
+                break;
+            }
+        }
+    }
+}
+
+} // namespace Shader::Optimization
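Taken together, the new pass gives each workgroup its own shared_memory_size-byte slice of the SharedMemory storage buffer, addressed through the WorkgroupIndex attribute added earlier in this diff; 64-bit shared loads and stores become two-dword buffer accesses (the count argument of 2 to LoadBufferU32/StoreBufferU32), and each shared atomic maps directly onto the corresponding buffer atomic.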