shader_recompiler: Implement AMD buffer bounds checking behavior. (#2448)

* shader_recompiler: Implement AMD buffer bounds checking behavior.

* shader_recompiler: Use SRT flatbuf for bounds check size.

* shader_recompiler: Fix buffer atomic bounds check.

* buffer_cache: Prevent false image-to-buffer sync.

Lowering vertex fetch to formatted buffer loads surfaced an issue where a CPU-modified range could be overwritten with stale GPU-modified image data; see the buffer-cache sketch below.

* Address review comments.
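
For context on the first three bullets: GCN hardware bounds-checks every buffer access against the V# resource's num_records, returning zero for out-of-bounds loads and discarding out-of-bounds stores and atomics; this change emits equivalent checks when the host cannot provide robust buffer access. As I read the second bullet, the size used in the check is fetched at shader runtime through the SRT flat buffer rather than baked into the compiled module. Below is a rough CPU-side model of the load case only; BufferResource and BufferLoadDword are illustrative names, not the recompiler's types, and swizzling and stride edge cases are omitted.

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Illustrative stand-in for a GCN V# buffer resource (not the real struct).
    struct BufferResource {
        uint32_t stride;      // bytes per record; 0 means raw byte addressing
        uint32_t num_records; // record count (stride != 0) or byte size (stride == 0)
    };

    // CPU-side model of the check the recompiler emits: out-of-bounds loads
    // return zero, matching AMD hardware behavior.
    uint32_t BufferLoadDword(const BufferResource& rsrc, const std::vector<uint8_t>& memory,
                             uint32_t index, uint32_t offset) {
        if (rsrc.stride != 0 && index >= rsrc.num_records) {
            return 0; // record index past the end: hardware returns zero
        }
        const uint64_t addr = uint64_t{index} * rsrc.stride + offset;
        const uint64_t size = rsrc.stride != 0 ? uint64_t{rsrc.num_records} * rsrc.stride
                                               : rsrc.num_records;
        if (addr + sizeof(uint32_t) > size || addr + sizeof(uint32_t) > memory.size()) {
            return 0; // raw byte offset out of bounds
        }
        uint32_t value;
        std::memcpy(&value, memory.data() + addr, sizeof(value));
        return value;
    }

Stores and atomics gate on the same predicate; stores are skipped entirely, and atomics additionally must not return a stale value, which is presumably what the third bullet's fix addresses.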
Committed by squidbus on 2025-02-17 06:13:39 -08:00 (via GitHub)
parent b06790dfe5
commit fd3d3c4158
19 changed files with 376 additions and 158 deletions
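
On the fourth bullet: when a bound buffer overlaps an image the GPU wrote earlier, the cache syncs the image contents back over the buffer range. The hazard is ordering: if the CPU modified that range after the GPU write, a blind image-to-buffer sync clobbers the newer CPU data with stale texels. A minimal sketch of the guard, assuming hypothetical names (ImageEntry, IsCpuModified) that do not match the real cache:

    #include <cstdint>

    // Hypothetical types/helpers for illustration; the real shadPS4 cache differs.
    struct ImageEntry {
        uint64_t cpu_addr;
        uint64_t size;
        bool gpu_dirty; // GPU wrote this image since the last readback
    };

    bool IsCpuModified(uint64_t addr, uint64_t size) {
        return false; // stub: a real tracker consults CPU write-watch state
    }

    void CopyImageToBuffer(const ImageEntry& image, uint64_t addr, uint64_t size) {
        // stub: a real implementation records a GPU copy command
    }

    // Sync a GPU-dirty image overlapping [addr, addr + size) back to the buffer,
    // unless the CPU touched that range more recently -- in that case the image
    // data is stale and must not overwrite the fresh CPU write.
    void SyncImageOverlap(const ImageEntry& image, uint64_t addr, uint64_t size) {
        if (!image.gpu_dirty || IsCpuModified(addr, size)) {
            return;
        }
        CopyImageToBuffer(image, addr, size);
    }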

View file

@@ -9,6 +9,12 @@
namespace Shader::Gcn {
const u32* GetFetchShaderCode(const Info& info, u32 sgpr_base) {
const u32* code;
std::memcpy(&code, &info.user_data[sgpr_base], sizeof(code));
return code;
}
/**
* s_load_dwordx4 s[8:11], s[2:3], 0x00
* s_load_dwordx4 s[12:15], s[2:3], 0x04
@@ -38,9 +44,8 @@ std::optional<FetchShaderData> ParseFetchShader(const Shader::Info& info) {
if (!info.has_fetch_shader) {
return std::nullopt;
}
const u32* code;
std::memcpy(&code, &info.user_data[info.fetch_shader_sgpr_base], sizeof(code));
const auto* code = GetFetchShaderCode(info, info.fetch_shader_sgpr_base);
FetchShaderData data{.code = code};
GcnCodeSlice code_slice(code, code + std::numeric_limits<u32>::max());
GcnDecodeContext decoder;
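
An aside on the memcpy idiom above (and in the new GetFetchShaderCode helper): the fetch shader's address arrives in two consecutive 32-bit user-data SGPRs, and memcpy is the well-defined way to reassemble those bits into a host pointer without strict-aliasing UB. A standalone illustration of the same pattern:

    #include <cstdint>
    #include <cstring>

    // Reassemble a 64-bit host pointer from two consecutive 32-bit user-data
    // registers; memcpy sidesteps the aliasing rules a reinterpret_cast would bend.
    const uint32_t* ReadCodePointer(const uint32_t* user_data, uint32_t sgpr_base) {
        const uint32_t* code;
        static_assert(sizeof(code) == 2 * sizeof(uint32_t), "assumes a 64-bit host");
        std::memcpy(&code, &user_data[sgpr_base], sizeof(code));
        return code;
    }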

View file

@@ -64,6 +64,8 @@ struct FetchShaderData {
}
};
const u32* GetFetchShaderCode(const Info& info, u32 sgpr_base);
std::optional<FetchShaderData> ParseFetchShader(const Shader::Info& info);
} // namespace Shader::Gcn

View file

@@ -4,6 +4,7 @@
#include "common/config.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/frontend/decode.h"
#include "shader_recompiler/frontend/fetch_shader.h"
#include "shader_recompiler/frontend/translate/translate.h"
#include "shader_recompiler/info.h"
@@ -470,8 +471,29 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra
void Translator::EmitFetch(const GcnInst& inst) {
// Read the pointer to the fetch shader assembly.
const auto code_sgpr_base = inst.src[0].code;
if (!profile.supports_robust_buffer_access) {
// The fetch shader must be inlined to access as regular buffers, so that
// bounds checks can be emitted to emulate robust buffer access.
const auto* code = GetFetchShaderCode(info, code_sgpr_base);
GcnCodeSlice slice(code, code + std::numeric_limits<u32>::max());
GcnDecodeContext decoder;
// Decode and translate instructions until control returns to the main shader
u32 sub_pc = 0;
while (!slice.atEnd()) {
const auto sub_inst = decoder.decodeInstruction(slice);
if (sub_inst.opcode == Opcode::S_SETPC_B64) {
// Assume we're swapping back to the main shader.
break;
}
TranslateInstruction(sub_inst, sub_pc++);
}
return;
}
info.has_fetch_shader = true;
info.fetch_shader_sgpr_base = inst.src[0].code;
info.fetch_shader_sgpr_base = code_sgpr_base;
const auto fetch_data = ParseFetchShader(info);
ASSERT(fetch_data.has_value());
@@ -520,6 +542,40 @@ void Translator::LogMissingOpcode(const GcnInst& inst) {
info.translation_failed = true;
}
void Translator::TranslateInstruction(const GcnInst& inst, const u32 pc) {
// Emit instructions for each category.
switch (inst.category) {
case InstCategory::DataShare:
EmitDataShare(inst);
break;
case InstCategory::VectorInterpolation:
EmitVectorInterpolation(inst);
break;
case InstCategory::ScalarMemory:
EmitScalarMemory(inst);
break;
case InstCategory::VectorMemory:
EmitVectorMemory(inst);
break;
case InstCategory::Export:
EmitExport(inst);
break;
case InstCategory::FlowControl:
EmitFlowControl(pc, inst);
break;
case InstCategory::ScalarALU:
EmitScalarAlu(inst);
break;
case InstCategory::VectorALU:
EmitVectorAlu(inst);
break;
case InstCategory::DebugProfile:
break;
default:
UNREACHABLE();
}
}
void Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list, Info& info,
const RuntimeInfo& runtime_info, const Profile& profile) {
if (inst_list.empty()) {
@@ -537,37 +593,7 @@ void Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list, Inf
continue;
}
// Emit instructions for each category.
switch (inst.category) {
case InstCategory::DataShare:
translator.EmitDataShare(inst);
break;
case InstCategory::VectorInterpolation:
translator.EmitVectorInterpolation(inst);
break;
case InstCategory::ScalarMemory:
translator.EmitScalarMemory(inst);
break;
case InstCategory::VectorMemory:
translator.EmitVectorMemory(inst);
break;
case InstCategory::Export:
translator.EmitExport(inst);
break;
case InstCategory::FlowControl:
translator.EmitFlowControl(pc, inst);
break;
case InstCategory::ScalarALU:
translator.EmitScalarAlu(inst);
break;
case InstCategory::VectorALU:
translator.EmitVectorAlu(inst);
break;
case InstCategory::DebugProfile:
break;
default:
UNREACHABLE();
}
translator.TranslateInstruction(inst, pc);
}
}
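
Context for the supports_robust_buffer_access branch in EmitFetch above: when the host driver already defines out-of-bounds buffer behavior, the fetch shader can keep being parsed out-of-line; only hosts without that guarantee need it inlined so explicit checks can be emitted. On Vulkan the baseline capability is the core robustBufferAccess feature (out-of-bounds reads yield zero or another defined value); whether the emulator keys off that or VK_EXT_robustness2 is not visible in this hunk. A hedged sketch of querying and enabling it at device creation, error handling elided:

    #include <vulkan/vulkan.h>

    // Sketch only: query the core robustBufferAccess feature and request it when
    // creating the device. Real code would fold this into existing device setup.
    bool QueryRobustBufferAccess(VkPhysicalDevice physical_device) {
        VkPhysicalDeviceFeatures features{};
        vkGetPhysicalDeviceFeatures(physical_device, &features);
        return features.robustBufferAccess == VK_TRUE;
    }

    VkDevice CreateDeviceWithRobustness(VkPhysicalDevice physical_device,
                                        const VkDeviceQueueCreateInfo* queue_info) {
        VkPhysicalDeviceFeatures enabled{};
        enabled.robustBufferAccess = VK_TRUE; // bounded OOB behavior for buffer access

        VkDeviceCreateInfo info{};
        info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
        info.queueCreateInfoCount = 1;
        info.pQueueCreateInfos = queue_info;
        info.pEnabledFeatures = &enabled;

        VkDevice device = VK_NULL_HANDLE;
        vkCreateDevice(physical_device, &info, nullptr, &device);
        return device;
    }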

View file

@@ -58,6 +58,8 @@ public:
explicit Translator(IR::Block* block_, Info& info, const RuntimeInfo& runtime_info,
const Profile& profile);
void TranslateInstruction(const GcnInst& inst, u32 pc);
// Instruction categories
void EmitPrologue();
void EmitFetch(const GcnInst& inst);

View file

@@ -195,6 +195,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(is_typed);
if (is_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
@@ -241,6 +242,7 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(true);
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
@@ -283,6 +285,7 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(is_typed);
if (is_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
@@ -339,6 +342,7 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(true);
const IR::VectorReg src_reg{inst.src[1].code};
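
Finally, the repeated buffer_info.typed.Assign(...) hunks above record whether an access is formatted (MTBUF) or raw (MUBUF), which the later bounds-check pass needs: typed/structured accesses are checked against the record index, raw ones against the byte size. A hedged sketch of that distinction; the real check is emitted as IR, not host C++, and the raw-path rule here is my simplification:

    #include <cstdint>

    // Illustrative predicate mirroring the typed/raw split; not the recompiler's IR.
    bool IsOutOfBounds(bool typed, uint32_t index, uint32_t offset_bytes,
                       uint32_t stride_bytes, uint32_t num_records) {
        if (typed && stride_bytes != 0) {
            // Structured/formatted access: hardware compares the record index.
            return index >= num_records;
        }
        // Raw access with stride 0: num_records holds the size in bytes.
        return offset_bytes >= num_records;
    }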