shader_recompiler: Implement AMD buffer bounds checking behavior. (#2448)

* shader_recompiler: Implement AMD buffer bounds checking behavior.

* shader_recompiler: Use SRT flatbuf for bounds check size.

* shader_recompiler: Fix buffer atomic bounds check.

* buffer_cache: Prevent false image-to-buffer sync.

Lowering vertex fetch to formatted buffer loads surfaced an issue where a CPU-modified range could be overwritten with stale GPU-modified image data; see the buffer-cache sketch below.

* Address review comments.
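
For context on the first three bullets: GCN hardware bounds-checks every buffer access against the V# resource's num_records, returning zero for out-of-bounds loads and discarding out-of-bounds stores and atomics; this change emits equivalent checks when the host cannot provide robust buffer access. As I read the second bullet, the size used in the check is fetched at shader runtime through the SRT flat buffer rather than baked into the compiled module. Below is a rough CPU-side model of the load case only; BufferResource and BufferLoadDword are illustrative names, not the recompiler's types, and swizzling and stride edge cases are omitted.

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Illustrative stand-in for a GCN V# buffer resource (not the real struct).
    struct BufferResource {
        uint32_t stride;      // bytes per record; 0 means raw byte addressing
        uint32_t num_records; // record count (stride != 0) or byte size (stride == 0)
    };

    // CPU-side model of the check the recompiler emits: out-of-bounds loads
    // return zero, matching AMD hardware behavior.
    uint32_t BufferLoadDword(const BufferResource& rsrc, const std::vector<uint8_t>& memory,
                             uint32_t index, uint32_t offset) {
        if (rsrc.stride != 0 && index >= rsrc.num_records) {
            return 0; // record index past the end: hardware returns zero
        }
        const uint64_t addr = uint64_t{index} * rsrc.stride + offset;
        const uint64_t size = rsrc.stride != 0 ? uint64_t{rsrc.num_records} * rsrc.stride
                                               : rsrc.num_records;
        if (addr + sizeof(uint32_t) > size || addr + sizeof(uint32_t) > memory.size()) {
            return 0; // raw byte offset out of bounds
        }
        uint32_t value;
        std::memcpy(&value, memory.data() + addr, sizeof(value));
        return value;
    }

Stores and atomics gate on the same predicate; stores are skipped entirely, and atomics additionally must not return a stale value, which is presumably what the third bullet's fix addresses.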
Committed by squidbus on 2025-02-17 06:13:39 -08:00 (via GitHub)
parent b06790dfe5
commit fd3d3c4158
19 changed files with 376 additions and 158 deletions
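
On the fourth bullet: when a bound buffer overlaps an image the GPU wrote earlier, the cache syncs the image contents back over the buffer range. The hazard is ordering: if the CPU modified that range after the GPU write, a blind image-to-buffer sync clobbers the newer CPU data with stale texels. A minimal sketch of the guard, assuming hypothetical names (ImageEntry, IsCpuModified) that do not match the real cache:

    #include <cstdint>

    // Hypothetical types/helpers for illustration; the real shadPS4 cache differs.
    struct ImageEntry {
        uint64_t cpu_addr;
        uint64_t size;
        bool gpu_dirty; // GPU wrote this image since the last readback
    };

    bool IsCpuModified(uint64_t addr, uint64_t size) {
        return false; // stub: a real tracker consults CPU write-watch state
    }

    void CopyImageToBuffer(const ImageEntry& image, uint64_t addr, uint64_t size) {
        // stub: a real implementation records a GPU copy command
    }

    // Sync a GPU-dirty image overlapping [addr, addr + size) back to the buffer,
    // unless the CPU touched that range more recently -- in that case the image
    // data is stale and must not overwrite the fresh CPU write.
    void SyncImageOverlap(const ImageEntry& image, uint64_t addr, uint64_t size) {
        if (!image.gpu_dirty || IsCpuModified(addr, size)) {
            return;
        }
        CopyImageToBuffer(image, addr, size);
    }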

View file

@@ -9,6 +9,12 @@
namespace Shader::Gcn {
const u32* GetFetchShaderCode(const Info& info, u32 sgpr_base) {
const u32* code;
std::memcpy(&code, &info.user_data[sgpr_base], sizeof(code));
return code;
}
/**
* s_load_dwordx4 s[8:11], s[2:3], 0x00
* s_load_dwordx4 s[12:15], s[2:3], 0x04
@@ -38,9 +44,8 @@ std::optional<FetchShaderData> ParseFetchShader(const Shader::Info& info) {
if (!info.has_fetch_shader) {
return std::nullopt;
}
const u32* code;
std::memcpy(&code, &info.user_data[info.fetch_shader_sgpr_base], sizeof(code));
const auto* code = GetFetchShaderCode(info, info.fetch_shader_sgpr_base);
FetchShaderData data{.code = code};
GcnCodeSlice code_slice(code, code + std::numeric_limits<u32>::max());
GcnDecodeContext decoder;
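
An aside on the memcpy idiom above (and in the new GetFetchShaderCode helper): the fetch shader's address arrives in two consecutive 32-bit user-data SGPRs, and memcpy is the well-defined way to reassemble those bits into a host pointer without strict-aliasing UB. A standalone illustration of the same pattern:

    #include <cstdint>
    #include <cstring>

    // Reassemble a 64-bit host pointer from two consecutive 32-bit user-data
    // registers; memcpy sidesteps the aliasing rules a reinterpret_cast would bend.
    const uint32_t* ReadCodePointer(const uint32_t* user_data, uint32_t sgpr_base) {
        const uint32_t* code;
        static_assert(sizeof(code) == 2 * sizeof(uint32_t), "assumes a 64-bit host");
        std::memcpy(&code, &user_data[sgpr_base], sizeof(code));
        return code;
    }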

View file

@@ -64,6 +64,8 @@ struct FetchShaderData {
}
};
const u32* GetFetchShaderCode(const Info& info, u32 sgpr_base);
std::optional<FetchShaderData> ParseFetchShader(const Shader::Info& info);
} // namespace Shader::Gcn

View file

@@ -4,6 +4,7 @@
#include "common/config.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/frontend/decode.h"
#include "shader_recompiler/frontend/fetch_shader.h"
#include "shader_recompiler/frontend/translate/translate.h"
#include "shader_recompiler/info.h"
@@ -470,8 +471,29 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra
void Translator::EmitFetch(const GcnInst& inst) {
// Read the pointer to the fetch shader assembly.
const auto code_sgpr_base = inst.src[0].code;
if (!profile.supports_robust_buffer_access) {
// The fetch shader must be inlined to access as regular buffers, so that
// bounds checks can be emitted to emulate robust buffer access.
const auto* code = GetFetchShaderCode(info, code_sgpr_base);
GcnCodeSlice slice(code, code + std::numeric_limits<u32>::max());
GcnDecodeContext decoder;
// Decode and translate instructions until control returns to the main shader
u32 sub_pc = 0;
while (!slice.atEnd()) {
const auto sub_inst = decoder.decodeInstruction(slice);
if (sub_inst.opcode == Opcode::S_SETPC_B64) {
// Assume we're swapping back to the main shader.
break;
}
TranslateInstruction(sub_inst, sub_pc++);
}
return;
}
info.has_fetch_shader = true;
info.fetch_shader_sgpr_base = inst.src[0].code;
info.fetch_shader_sgpr_base = code_sgpr_base;
const auto fetch_data = ParseFetchShader(info);
ASSERT(fetch_data.has_value());
@@ -520,6 +542,40 @@ void Translator::LogMissingOpcode(const GcnInst& inst) {
info.translation_failed = true;
}
void Translator::TranslateInstruction(const GcnInst& inst, const u32 pc) {
// Emit instructions for each category.
switch (inst.category) {
case InstCategory::DataShare:
EmitDataShare(inst);
break;
case InstCategory::VectorInterpolation:
EmitVectorInterpolation(inst);
break;
case InstCategory::ScalarMemory:
EmitScalarMemory(inst);
break;
case InstCategory::VectorMemory:
EmitVectorMemory(inst);
break;
case InstCategory::Export:
EmitExport(inst);
break;
case InstCategory::FlowControl:
EmitFlowControl(pc, inst);
break;
case InstCategory::ScalarALU:
EmitScalarAlu(inst);
break;
case InstCategory::VectorALU:
EmitVectorAlu(inst);
break;
case InstCategory::DebugProfile:
break;
default:
UNREACHABLE();
}
}
void Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list, Info& info,
const RuntimeInfo& runtime_info, const Profile& profile) {
if (inst_list.empty()) {
@@ -537,37 +593,7 @@ void Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list, Inf
continue;
}
// Emit instructions for each category.
switch (inst.category) {
case InstCategory::DataShare:
translator.EmitDataShare(inst);
break;
case InstCategory::VectorInterpolation:
translator.EmitVectorInterpolation(inst);
break;
case InstCategory::ScalarMemory:
translator.EmitScalarMemory(inst);
break;
case InstCategory::VectorMemory:
translator.EmitVectorMemory(inst);
break;
case InstCategory::Export:
translator.EmitExport(inst);
break;
case InstCategory::FlowControl:
translator.EmitFlowControl(pc, inst);
break;
case InstCategory::ScalarALU:
translator.EmitScalarAlu(inst);
break;
case InstCategory::VectorALU:
translator.EmitVectorAlu(inst);
break;
case InstCategory::DebugProfile:
break;
default:
UNREACHABLE();
}
translator.TranslateInstruction(inst, pc);
}
}
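
Context for the supports_robust_buffer_access branch in EmitFetch above: when the host driver already defines out-of-bounds buffer behavior, the fetch shader can keep being parsed out-of-line; only hosts without that guarantee need it inlined so explicit checks can be emitted. On Vulkan the baseline capability is the core robustBufferAccess feature (out-of-bounds reads yield zero or another defined value); whether the emulator keys off that or VK_EXT_robustness2 is not visible in this hunk. A hedged sketch of querying and enabling it at device creation, error handling elided:

    #include <vulkan/vulkan.h>

    // Sketch only: query the core robustBufferAccess feature and request it when
    // creating the device. Real code would fold this into existing device setup.
    bool QueryRobustBufferAccess(VkPhysicalDevice physical_device) {
        VkPhysicalDeviceFeatures features{};
        vkGetPhysicalDeviceFeatures(physical_device, &features);
        return features.robustBufferAccess == VK_TRUE;
    }

    VkDevice CreateDeviceWithRobustness(VkPhysicalDevice physical_device,
                                        const VkDeviceQueueCreateInfo* queue_info) {
        VkPhysicalDeviceFeatures enabled{};
        enabled.robustBufferAccess = VK_TRUE; // bounded OOB behavior for buffer access

        VkDeviceCreateInfo info{};
        info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
        info.queueCreateInfoCount = 1;
        info.pQueueCreateInfos = queue_info;
        info.pEnabledFeatures = &enabled;

        VkDevice device = VK_NULL_HANDLE;
        vkCreateDevice(physical_device, &info, nullptr, &device);
        return device;
    }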

View file

@@ -58,6 +58,8 @@ public:
explicit Translator(IR::Block* block_, Info& info, const RuntimeInfo& runtime_info,
const Profile& profile);
void TranslateInstruction(const GcnInst& inst, u32 pc);
// Instruction categories
void EmitPrologue();
void EmitFetch(const GcnInst& inst);

View file

@@ -195,6 +195,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(is_typed);
if (is_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
@@ -241,6 +242,7 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(true);
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
@@ -283,6 +285,7 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(is_typed);
if (is_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
@@ -339,6 +342,7 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(true);
const IR::VectorReg src_reg{inst.src[1].code};
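
Finally, the repeated buffer_info.typed.Assign(...) hunks above record whether an access is formatted (MTBUF) or raw (MUBUF), which the later bounds-check pass needs: typed/structured accesses are checked against the record index, raw ones against the byte size. A hedged sketch of that distinction; the real check is emitted as IR, not host C++, and the raw-path rule here is my simplification:

    #include <cstdint>

    // Illustrative predicate mirroring the typed/raw split; not the recompiler's IR.
    bool IsOutOfBounds(bool typed, uint32_t index, uint32_t offset_bytes,
                       uint32_t stride_bytes, uint32_t num_records) {
        if (typed && stride_bytes != 0) {
            // Structured/formatted access: hardware compares the record index.
            return index >= num_records;
        }
        // Raw access with stride 0: num_records holds the size in bytes.
        return offset_bytes >= num_records;
    }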