From 27cbd6647f76531807f578d35bffe3bc21d70145 Mon Sep 17 00:00:00 2001
From: TheTurtle
Date: Thu, 10 Jul 2025 13:38:50 +0300
Subject: [PATCH] shader_recompiler: Reorganize data share operations and
 implement GDS bit (#3222)

* shader_recompiler: Reorganize data share operations and implement GDS bit

* Review comments
---
 .../backend/spirv/emit_spirv_atomic.cpp       |  60 +++-
 .../backend/spirv/emit_spirv_instructions.h   |  10 +
 .../backend/spirv/spirv_emit_context.cpp      |   1 +
 .../frontend/translate/data_share.cpp         | 317 +++++++-----------
 .../frontend/translate/translate.h            |  14 +-
 .../frontend/translate/vector_alu.cpp         |   6 +-
 src/shader_recompiler/ir/ir_emitter.cpp       | 115 +++++--
 src/shader_recompiler/ir/ir_emitter.h         |  28 +-
 src/shader_recompiler/ir/microinstruction.cpp |  12 +-
 src/shader_recompiler/ir/opcodes.inc          |  10 +
 .../ir/passes/resource_tracking_pass.cpp      | 179 ++++++++--
 .../ir/passes/shader_info_collection_pass.cpp |  10 +
 .../ir/passes/shared_memory_simplify_pass.cpp |  10 +
 .../passes/shared_memory_to_storage_pass.cpp  |  42 ++-
 src/video_core/buffer_cache/buffer_cache.cpp  |   2 +
 15 files changed, 525 insertions(+), 291 deletions(-)

diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index e37acb2e4..80c8b836b 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -54,17 +54,23 @@ Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value,
     });
 }
 
+Id SharedAtomicU64IncDec(EmitContext& ctx, Id offset,
+                         Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) {
+    const Id shift_id{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+    const Id pointer{ctx.EmitSharedMemoryAccess(ctx.shared_u64, ctx.shared_memory_u64, index)};
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics);
+    });
+}
+
 template <bool is_float = false>
 Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
                    Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const auto& buffer = ctx.buffers[handle];
-    const auto type = [&] {
-        if constexpr (is_float) {
-            return ctx.F32[1];
-        } else {
-            return ctx.U32[1];
-        }
-    }();
+    const Id type = is_float ? ctx.F32[1] : ctx.U32[1];
     if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) {
         address = ctx.OpIAdd(ctx.U32[1], address, offset);
     }
@@ -148,42 +154,82 @@ Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax);
 }
 
+Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMax);
+}
+
 Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMax);
 }
 
+Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMax);
+}
+
 Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMin);
 }
 
+Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMin);
+}
+
 Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMin);
 }
 
+Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMin);
+}
+
 Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicAnd);
 }
 
+Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicAnd);
+}
+
 Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicOr);
 }
 
+Id EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicOr);
+}
+
 Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicXor);
 }
 
+Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicXor);
+}
+
 Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicISub);
 }
 
+Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicISub);
+}
+
 Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset) {
     return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement);
 }
 
+Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset) {
+    return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement);
+}
+
 Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset) {
     return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement);
 }
 
+Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset) {
+    return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement);
+}
+
 Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd);
 }
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 6e146c5f6..8a0c586e9 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -139,15 +139,25 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset);
+Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset);
 Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset);
+Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset);
 Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value);
 
 Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2);
 Id EmitCompositeConstructU32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index fe489f1b6..6a731d05c 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -76,6 +76,7 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
     } else {
         SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450);
     }
+    String(fmt::format("{:#x}", info.pgm_hash));
     AddCapability(spv::Capability::Shader);
     DefineArithmeticTypes();
 
diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index 8ead93f78..634486fc4 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -3,7 +3,6 @@
 
 #include "shader_recompiler/frontend/translate/translate.h"
 #include "shader_recompiler/ir/reg.h"
-#include "shader_recompiler/profile.h"
 #include "shader_recompiler/runtime_info.h"
 
 namespace Shader::Gcn {
@@ -12,29 +11,29 @@ void Translator::EmitDataShare(const GcnInst& inst) {
     switch (inst.opcode) {
         // DS
     case Opcode::DS_ADD_U32:
-        return DS_ADD_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Add, false);
     case Opcode::DS_ADD_U64:
-        return DS_ADD_U64(inst, false);
+        return DS_OP<IR::U64>(inst, AtomicOp::Add, false);
     case Opcode::DS_SUB_U32:
-        return DS_SUB_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Sub, false);
     case Opcode::DS_INC_U32:
-        return DS_INC_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Inc, false);
     case Opcode::DS_DEC_U32:
-        return DS_DEC_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Dec, false);
     case Opcode::DS_MIN_I32:
-        return DS_MIN_U32(inst, true, false);
+        return DS_OP(inst, AtomicOp::Smin, false);
     case Opcode::DS_MAX_I32:
-        return DS_MAX_U32(inst, true, false);
+        return DS_OP(inst, AtomicOp::Smax, false);
     case Opcode::DS_MIN_U32:
-        return DS_MIN_U32(inst, false, false);
+        return DS_OP(inst, AtomicOp::Umin, false);
     case Opcode::DS_MAX_U32:
-        return DS_MAX_U32(inst, false, false);
+        return DS_OP(inst, AtomicOp::Umax, false);
     case Opcode::DS_AND_B32:
-        return DS_AND_B32(inst, false);
+        return DS_OP(inst, AtomicOp::And, false);
     case Opcode::DS_OR_B32:
-        return DS_OR_B32(inst, false);
+        return DS_OP(inst, AtomicOp::Or, false);
     case Opcode::DS_XOR_B32:
-        return DS_XOR_B32(inst, false);
+        return DS_OP(inst, AtomicOp::Xor, false);
     case Opcode::DS_WRITE_B32:
         return DS_WRITE(32, false, false, false, inst);
     case Opcode::DS_WRITE2_B32:
@@ -42,19 +41,19 @@ void Translator::EmitDataShare(const GcnInst& inst) {
     case Opcode::DS_WRITE2ST64_B32:
         return DS_WRITE(32, false, true, true, inst);
     case Opcode::DS_ADD_RTN_U32:
-        return DS_ADD_U32(inst, true);
+        return DS_OP(inst, AtomicOp::Add, true);
     case Opcode::DS_SUB_RTN_U32:
-        return DS_SUB_U32(inst, true);
+        return DS_OP(inst, AtomicOp::Sub, true);
     case Opcode::DS_MIN_RTN_U32:
-        return DS_MIN_U32(inst, false, true);
+        return DS_OP(inst, AtomicOp::Umin, true);
     case Opcode::DS_MAX_RTN_U32:
-        return DS_MAX_U32(inst, false, true);
+        return DS_OP(inst, AtomicOp::Umax, true);
     case Opcode::DS_AND_RTN_B32:
-        return DS_AND_B32(inst, true);
+        return DS_OP(inst, AtomicOp::And, true);
     case Opcode::DS_OR_RTN_B32:
-        return DS_OR_B32(inst, true);
+        return DS_OP(inst, AtomicOp::Or, true);
     case Opcode::DS_XOR_RTN_B32:
-        return DS_XOR_B32(inst, true);
+        return DS_OP(inst, AtomicOp::Xor, true);
     case Opcode::DS_SWIZZLE_B32:
         return DS_SWIZZLE_B32(inst);
     case Opcode::DS_READ_B32:
@@ -117,92 +116,63 @@ void Translator::V_WRITELANE_B32(const GcnInst& inst) {
 
 // DS
 
-void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) {
+template <typename T>
+void Translator::DS_OP(const GcnInst& inst, AtomicOp op, bool rtn) {
+    const bool is_gds = inst.control.ds.gds;
     const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
+    const T data = [&] {
+        if (op == AtomicOp::Inc || op == AtomicOp::Dec) {
+            return T{};
+        }
+        if constexpr (std::is_same_v<T, IR::U32>) {
+            return GetSrc(inst.src[1]);
+        } else {
+            return GetSrc64(inst.src[1]);
+        }
+    }();
     const IR::U32 offset =
         ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
     const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data);
+    const T original_val = [&] -> T {
+        switch (op) {
+        case AtomicOp::Add:
+            return ir.SharedAtomicIAdd(addr_offset, data, is_gds);
+        case AtomicOp::Umin:
+            return ir.SharedAtomicIMin(addr_offset, data, false, is_gds);
+        case AtomicOp::Smin:
+            return ir.SharedAtomicIMin(addr_offset, data, true, is_gds);
+        case AtomicOp::Umax:
+            return ir.SharedAtomicIMax(addr_offset, data, false, is_gds);
+        case AtomicOp::Smax:
+            return ir.SharedAtomicIMax(addr_offset, data, true, is_gds);
+        case AtomicOp::And:
+            return ir.SharedAtomicAnd(addr_offset, data, is_gds);
+        case AtomicOp::Or:
+            return ir.SharedAtomicOr(addr_offset, data, is_gds);
+        case AtomicOp::Xor:
+            return ir.SharedAtomicXor(addr_offset, data, is_gds);
+        case AtomicOp::Sub:
+            return ir.SharedAtomicISub(addr_offset, data, is_gds);
+        case AtomicOp::Inc:
+            return ir.SharedAtomicInc<T>(addr_offset, is_gds);
+        case AtomicOp::Dec:
+            return ir.SharedAtomicDec<T>(addr_offset, is_gds);
+        default:
+            UNREACHABLE();
+        }
+    }();
     if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_ADD_U64(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U64 data{GetSrc64(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data);
-    if (rtn) {
-        SetDst64(inst.dst[0], IR::U64{original_val});
-    }
-}
-
-void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, is_signed);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, is_signed);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_AND_B32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicAnd(addr_offset, data);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_OR_B32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicOr(addr_offset, data);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_XOR_B32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicXor(addr_offset, data);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
+        if constexpr (std::is_same_v<T, IR::U32>) {
+            SetDst(inst.dst[0], original_val);
+        } else {
+            SetDst64(inst.dst[0], original_val);
+        }
     }
 }
 
 void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64,
                           const GcnInst& inst) {
+    const bool is_gds = inst.control.ds.gds;
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     const IR::VectorReg data0{inst.src[1].code};
     const IR::VectorReg data1{inst.src[2].code};
@@ -220,33 +190,85 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
             ir.WriteShared(64,
                            ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0),
                                                                  ir.GetVectorReg(data0 + 1))),
-                           addr0);
+                           addr0, is_gds);
         } else if (bit_size == 32) {
-            ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
+            ir.WriteShared(32, ir.GetVectorReg(data0), addr0, is_gds);
         } else if (bit_size == 16) {
-            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0);
+            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds);
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
         if (bit_size == 64) {
             ir.WriteShared(64,
                            ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1),
                                                                  ir.GetVectorReg(data1 + 1))),
-                           addr1);
+                           addr1, is_gds);
         } else if (bit_size == 32) {
-            ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
+            ir.WriteShared(32, ir.GetVectorReg(data1), addr1, is_gds);
         } else if (bit_size == 16) {
-            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1);
+            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1, is_gds);
         }
     } else {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         if (bit_size == 64) {
             const IR::Value data =
                 ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
-            ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0);
+            ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0, is_gds);
         } else if (bit_size == 32) {
-            ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
+            ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0, is_gds);
         } else if (bit_size == 16) {
-            ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0);
+            ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds);
+        }
+    }
+}
+
+void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64,
+                         const GcnInst& inst) {
+    const bool is_gds = inst.control.ds.gds;
+    const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
+    IR::VectorReg dst_reg{inst.dst[0].code};
+    const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+    if (info.stage == Stage::Fragment) {
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
+                   "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset)));
+        return;
+    }
+    if (is_pair) {
+        // Pair loads are either 32 or 64-bit
+        const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
+        const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0, is_gds);
+        if (bit_size == 64) {
+            const auto vector = ir.UnpackUint2x32(IR::U64{data0});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg++, IR::U32{data0});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})});
+        }
+        const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
+        const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1, is_gds);
+        if (bit_size == 64) {
+            const auto vector = ir.UnpackUint2x32(IR::U64{data1});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg++, IR::U32{data1});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})});
+        }
+    } else {
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
+        const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0, is_gds);
+        if (bit_size == 64) {
+            const auto vector = ir.UnpackUint2x32(IR::U64{data});
+            ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg, IR::U32{data});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})});
         }
     }
 }
@@ -263,91 +285,6 @@ void Translator::DS_SWIZZLE_B32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.QuadShuffle(src, index));
 }
 
-void Translator::DS_INC_U32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicInc(addr_offset);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_DEC_U32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicDec(addr_offset);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_SUB_U32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicISub(addr_offset, data);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64,
-                         const GcnInst& inst) {
-    const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
-    IR::VectorReg dst_reg{inst.dst[0].code};
-    const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
-    if (info.stage == Stage::Fragment) {
-        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
-                   "Unexpected shared memory offset alignment: {}", offset);
-        ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset)));
-        return;
-    }
-    if (is_pair) {
-        // Pair loads are either 32 or 64-bit
-        const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
-        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
-        const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0);
-        if (bit_size == 64) {
-            const auto vector = ir.UnpackUint2x32(IR::U64{data0});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
-        } else if (bit_size == 32) {
-            ir.SetVectorReg(dst_reg++, IR::U32{data0});
-        } else if (bit_size == 16) {
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})});
-        }
-        const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
-        const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1);
-        if (bit_size == 64) {
-            const auto vector = ir.UnpackUint2x32(IR::U64{data1});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
-        } else if (bit_size == 32) {
-            ir.SetVectorReg(dst_reg++, IR::U32{data1});
-        } else if (bit_size == 16) {
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})});
-        }
-    } else {
-        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
-        const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
-        if (bit_size == 64) {
-            const auto vector = ir.UnpackUint2x32(IR::U64{data});
-            ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
-            ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
-        } else if (bit_size == 32) {
-            ir.SetVectorReg(dst_reg, IR::U32{data});
-        } else if (bit_size == 16) {
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})});
-        }
-    }
-}
-
 void Translator::DS_APPEND(const GcnInst& inst) {
     const u32 inst_offset = (u32(inst.control.ds.offset1) << 8u) + inst.control.ds.offset0;
     const IR::U32 gds_offset = ir.IAdd(ir.GetM0(), ir.Imm32(inst_offset));
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index b5bfec344..4b5ff827b 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -270,21 +270,13 @@ public:
 
     // Data share
     // DS
-    void DS_ADD_U32(const GcnInst& inst, bool rtn);
-    void DS_ADD_U64(const GcnInst& inst, bool rtn);
-    void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn);
-    void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn);
+    template <typename T = IR::U32>
+    void DS_OP(const GcnInst& inst, AtomicOp op, bool rtn);
     void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst);
-    void DS_SWIZZLE_B32(const GcnInst& inst);
-    void DS_AND_B32(const GcnInst& inst, bool rtn);
-    void DS_OR_B32(const GcnInst& inst, bool rtn);
-    void DS_XOR_B32(const GcnInst& inst, bool rtn);
     void DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst);
+    void DS_SWIZZLE_B32(const GcnInst& inst);
     void DS_APPEND(const GcnInst& inst);
     void DS_CONSUME(const GcnInst& inst);
-    void DS_SUB_U32(const GcnInst& inst, bool rtn);
-    void DS_INC_U32(const GcnInst& inst, bool rtn);
-    void DS_DEC_U32(const GcnInst& inst, bool rtn);
 
     // Buffer Memory
     // MUBUF / MTBUF
diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp
index 74c7ec601..017c77fb0 100644
--- a/src/shader_recompiler/frontend/translate/vector_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp
@@ -565,7 +565,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
     }
     // v_mbcnt_hi_u32_b32 vX, exec_hi, 0/vZ
     if ((inst.src[0].field == OperandField::ExecHi ||
-         inst.src[0].field == OperandField::VccHi) &&
+         inst.src[0].field == OperandField::VccHi ||
+         inst.src[0].field == OperandField::ScalarGPR) &&
         (inst.src[1].field == OperandField::ConstZero ||
          inst.src[1].field == OperandField::VectorGPR)) {
         return SetDst(inst.dst[0], GetSrc(inst.src[1]));
@@ -579,7 +580,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
     }
     // v_mbcnt_lo_u32_b32 vY, exec_lo, vX
     // used combined with above for append buffer indexing.
-    if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo) {
+    if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo ||
+        inst.src[0].field == OperandField::ScalarGPR) {
         return SetDst(inst.dst[0], GetSrc(inst.src[1]));
     }
     UNREACHABLE();
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 2334777ed..b88e1a17d 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -291,78 +291,137 @@ void IREmitter::SetPatch(Patch patch, const F32& value) {
     Inst(Opcode::SetPatch, patch, value);
 }
 
-Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
+Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset, bool is_gds) {
     switch (bit_size) {
     case 16:
-        return Inst(Opcode::LoadSharedU16, offset);
+        return Inst(Opcode::LoadSharedU16, Flags{is_gds}, offset);
     case 32:
-        return Inst(Opcode::LoadSharedU32, offset);
+        return Inst(Opcode::LoadSharedU32, Flags{is_gds}, offset);
     case 64:
-        return Inst(Opcode::LoadSharedU64, offset);
+        return Inst(Opcode::LoadSharedU64, Flags{is_gds}, offset);
     default:
        UNREACHABLE_MSG("Invalid bit size {}", bit_size);
    }
 }
 
-void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) {
+void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds) {
     switch (bit_size) {
     case 16:
-        Inst(Opcode::WriteSharedU16, offset, value);
+        Inst(Opcode::WriteSharedU16, Flags{is_gds}, offset, value);
         break;
     case 32:
-        Inst(Opcode::WriteSharedU32, offset, value);
+        Inst(Opcode::WriteSharedU32, Flags{is_gds}, offset, value);
         break;
     case 64:
-        Inst(Opcode::WriteSharedU64, offset, value);
+        Inst(Opcode::WriteSharedU64, Flags{is_gds}, offset, value);
         break;
     default:
         UNREACHABLE_MSG("Invalid bit size {}", bit_size);
     }
 }
 
-U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) {
+U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds) {
     switch (data.Type()) {
     case Type::U32:
-        return Inst<U32>(Opcode::SharedAtomicIAdd32, address, data);
+        return Inst<U32>(Opcode::SharedAtomicIAdd32, Flags{is_gds}, address, data);
     case Type::U64:
-        return Inst<U64>(Opcode::SharedAtomicIAdd64, address, data);
+        return Inst<U64>(Opcode::SharedAtomicIAdd64, Flags{is_gds}, address, data);
     default:
         ThrowInvalidType(data.Type());
     }
 }
 
-U32 IREmitter::SharedAtomicIMin(const U32& address, const U32& data, bool is_signed) {
-    return is_signed ? Inst<U32>(Opcode::SharedAtomicSMin32, address, data)
-                     : Inst<U32>(Opcode::SharedAtomicUMin32, address, data);
+U32U64 IREmitter::SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed,
+                                   bool is_gds) {
+    switch (data.Type()) {
+    case Type::U32:
+        return Inst<U32>(is_signed ? Opcode::SharedAtomicSMin32 : Opcode::SharedAtomicUMin32,
+                         Flags{is_gds}, address, data);
+    case Type::U64:
+        return Inst<U64>(is_signed ? Opcode::SharedAtomicSMin64 : Opcode::SharedAtomicUMin64,
+                         Flags{is_gds}, address, data);
+    default:
+        ThrowInvalidType(data.Type());
+    }
 }
 
-U32 IREmitter::SharedAtomicIMax(const U32& address, const U32& data, bool is_signed) {
-    return is_signed ? Inst<U32>(Opcode::SharedAtomicSMax32, address, data)
-                     : Inst<U32>(Opcode::SharedAtomicUMax32, address, data);
+U32U64 IREmitter::SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed,
+                                   bool is_gds) {
+    switch (data.Type()) {
+    case Type::U32:
+        return Inst<U32>(is_signed ? Opcode::SharedAtomicSMax32 : Opcode::SharedAtomicUMax32,
+                         Flags{is_gds}, address, data);
+    case Type::U64:
+        return Inst<U64>(is_signed ? Opcode::SharedAtomicSMax64 : Opcode::SharedAtomicUMax64,
+                         Flags{is_gds}, address, data);
+    default:
+        ThrowInvalidType(data.Type());
+    }
 }
 
-U32 IREmitter::SharedAtomicAnd(const U32& address, const U32& data) {
-    return Inst<U32>(Opcode::SharedAtomicAnd32, address, data);
+U32U64 IREmitter::SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds) {
+    switch (data.Type()) {
+    case Type::U32:
+        return Inst<U32>(Opcode::SharedAtomicAnd32, Flags{is_gds}, address, data);
+    case Type::U64:
+        return Inst<U64>(Opcode::SharedAtomicAnd64, Flags{is_gds}, address, data);
+    default:
+        ThrowInvalidType(data.Type());
+    }
 }
 
-U32 IREmitter::SharedAtomicOr(const U32& address, const U32& data) {
-    return Inst<U32>(Opcode::SharedAtomicOr32, address, data);
+U32U64 IREmitter::SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds) {
+    switch (data.Type()) {
+    case Type::U32:
+        return Inst<U32>(Opcode::SharedAtomicOr32, Flags{is_gds}, address, data);
+    case Type::U64:
+        return Inst<U64>(Opcode::SharedAtomicOr64, Flags{is_gds}, address, data);
+    default:
+        ThrowInvalidType(data.Type());
+    }
 }
 
-U32 IREmitter::SharedAtomicXor(const U32& address, const U32& data) {
-    return Inst<U32>(Opcode::SharedAtomicXor32, address, data);
+U32U64 IREmitter::SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds) {
+    switch (data.Type()) {
+    case Type::U32:
+        return Inst<U32>(Opcode::SharedAtomicXor32, Flags{is_gds}, address, data);
+    case Type::U64:
+        return Inst<U64>(Opcode::SharedAtomicXor64, Flags{is_gds}, address, data);
+    default:
+        ThrowInvalidType(data.Type());
+    }
 }
 
-U32 IREmitter::SharedAtomicInc(const U32& address) {
-    return Inst<U32>(Opcode::SharedAtomicInc32, address);
+U32U64 IREmitter::SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds) {
+    switch (data.Type()) {
+    case Type::U32:
+        return Inst<U32>(Opcode::SharedAtomicISub32, Flags{is_gds}, address, data);
+    case Type::U64:
+        return Inst<U64>(Opcode::SharedAtomicISub64, Flags{is_gds}, address, data);
+    default:
+        ThrowInvalidType(data.Type());
+    }
 }
 
-U32 IREmitter::SharedAtomicDec(const U32& address) {
-    return Inst<U32>(Opcode::SharedAtomicDec32, address);
+template <>
+U32 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) {
+    return Inst<U32>(Opcode::SharedAtomicInc32, Flags{is_gds}, address);
 }
 
-U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) {
-    return Inst<U32>(Opcode::SharedAtomicISub32, address, data);
+template <>
+U64 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) {
+    return Inst<U64>(Opcode::SharedAtomicInc64, Flags{is_gds}, address);
+}
+
+template <>
+U32 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) {
+    return Inst<U32>(Opcode::SharedAtomicDec32, Flags{is_gds}, address);
+}
+
+template <>
+U64 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) {
+    return Inst<U64>(Opcode::SharedAtomicDec64, Flags{is_gds}, address);
 }
 
 U32 IREmitter::ReadConst(const Value& base, const U32& offset) {
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index 1c5a8f545..d9e5aab7a 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -96,18 +96,24 @@ public:
     [[nodiscard]] F32 GetPatch(Patch patch);
     void SetPatch(Patch patch, const F32& value);
 
-    [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset);
-    void WriteShared(int bit_size, const Value& value, const U32& offset);
+    [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset,
+                                   bool is_gds = false);
+    void WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds = false);
 
-    [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data);
-    [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data);
-    [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed);
-    [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed);
-    [[nodiscard]] U32 SharedAtomicInc(const U32& address);
-    [[nodiscard]] U32 SharedAtomicDec(const U32& address);
-    [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data);
-    [[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data);
-    [[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data);
+    [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds);
+    [[nodiscard]] U32U64 SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds);
+    [[nodiscard]] U32U64 SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed,
+                                          bool is_gds);
+    [[nodiscard]] U32U64 SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed,
+                                          bool is_gds);
+    [[nodiscard]] U32U64 SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds);
+    [[nodiscard]] U32U64 SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds);
+    [[nodiscard]] U32U64 SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds);
+
+    template <typename T = U32>
+    [[nodiscard]] T SharedAtomicInc(const U32& address, bool is_gds);
+    template <typename T = U32>
+    [[nodiscard]] T SharedAtomicDec(const U32& address, bool is_gds);
 
     [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
     [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index);
diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp
index 84bdb5739..eaab05cb7 100644
--- a/src/shader_recompiler/ir/microinstruction.cpp
+++ b/src/shader_recompiler/ir/microinstruction.cpp
@@ -92,7 +92,6 @@ bool Inst::MayHaveSideEffects() const noexcept {
     case Opcode::WriteSharedU32:
     case Opcode::WriteSharedU64:
    case Opcode::SharedAtomicIAdd32:
-    case Opcode::SharedAtomicIAdd64:
     case Opcode::SharedAtomicISub32:
     case Opcode::SharedAtomicSMin32:
     case Opcode::SharedAtomicUMin32:
@@ -103,6 +102,17 @@ bool Inst::MayHaveSideEffects() const noexcept {
     case Opcode::SharedAtomicAnd32:
     case Opcode::SharedAtomicOr32:
     case Opcode::SharedAtomicXor32:
+    case Opcode::SharedAtomicIAdd64:
+    case Opcode::SharedAtomicISub64:
+    case Opcode::SharedAtomicSMin64:
+    case Opcode::SharedAtomicUMin64:
+    case Opcode::SharedAtomicSMax64:
+    case Opcode::SharedAtomicUMax64:
+    case Opcode::SharedAtomicInc64:
+    case Opcode::SharedAtomicDec64:
+    case Opcode::SharedAtomicAnd64:
+    case Opcode::SharedAtomicOr64:
+    case Opcode::SharedAtomicXor64:
     case Opcode::ImageWrite:
     case Opcode::ImageAtomicIAdd32:
     case Opcode::ImageAtomicSMin32:
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index 553e63f3e..08dcec458 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -41,15 +41,25 @@ OPCODE(WriteSharedU64,                                      Void,           U32,
 OPCODE(SharedAtomicIAdd32,                                  U32,            U32,            U32,            )
 OPCODE(SharedAtomicIAdd64,                                  U64,            U32,            U64,            )
 OPCODE(SharedAtomicISub32,                                  U32,            U32,            U32,            )
+OPCODE(SharedAtomicISub64,                                  U64,            U32,            U64,            )
 OPCODE(SharedAtomicSMin32,                                  U32,            U32,            U32,            )
+OPCODE(SharedAtomicSMin64,                                  U64,            U32,            U64,            )
 OPCODE(SharedAtomicUMin32,                                  U32,            U32,            U32,            )
+OPCODE(SharedAtomicUMin64,                                  U64,            U32,            U64,            )
 OPCODE(SharedAtomicSMax32,                                  U32,            U32,            U32,            )
+OPCODE(SharedAtomicSMax64,                                  U64,            U32,            U64,            )
 OPCODE(SharedAtomicUMax32,                                  U32,            U32,            U32,            )
+OPCODE(SharedAtomicUMax64,                                  U64,            U32,            U64,            )
 OPCODE(SharedAtomicInc32,                                   U32,            U32,                            )
+OPCODE(SharedAtomicInc64,                                   U64,            U32,                            )
 OPCODE(SharedAtomicDec32,                                   U32,            U32,                            )
+OPCODE(SharedAtomicDec64,                                   U64,            U32,                            )
 OPCODE(SharedAtomicAnd32,                                   U32,            U32,            U32,            )
+OPCODE(SharedAtomicAnd64,                                   U64,            U32,            U64,            )
 OPCODE(SharedAtomicOr32,                                    U32,            U32,            U32,            )
+OPCODE(SharedAtomicOr64,                                    U64,            U32,            U64,            )
 OPCODE(SharedAtomicXor32,                                   U32,            U32,            U32,            )
+OPCODE(SharedAtomicXor64,                                   U64,            U32,            U64,            )
 
 // Context getters/setters
 OPCODE(GetUserData,                                         U32,            ScalarReg,                      )
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index f3972769c..e5a4beb8b 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -84,8 +84,42 @@ bool IsBufferInstruction(const IR::Inst& inst) {
 }
 
 bool IsDataRingInstruction(const IR::Inst& inst) {
-    return inst.GetOpcode() == IR::Opcode::DataAppend ||
-           inst.GetOpcode() == IR::Opcode::DataConsume;
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::DataAppend:
+    case IR::Opcode::DataConsume:
+        return true;
+    case IR::Opcode::LoadSharedU16:
+    case IR::Opcode::LoadSharedU32:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU16:
+    case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::WriteSharedU64:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicIAdd64:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicUMin64:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicSMin64:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicUMax64:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicSMax64:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicAnd64:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicOr64:
+    case IR::Opcode::SharedAtomicXor32:
+    case IR::Opcode::SharedAtomicXor64:
+    case IR::Opcode::SharedAtomicISub32:
+    case IR::Opcode::SharedAtomicISub64:
+    case IR::Opcode::SharedAtomicInc32:
+    case IR::Opcode::SharedAtomicInc64:
+    case IR::Opcode::SharedAtomicDec32:
+    case IR::Opcode::SharedAtomicDec64:
+        return inst.Flags<bool>(); // is_gds
+    default:
+        return false;
+    }
 }
 
 IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
@@ -507,7 +541,8 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
     }
 }
 
-void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
+void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info,
+                                Descriptors& descriptors) {
     const u32 binding = descriptors.Add(BufferResource{
         .used_types = IR::Type::U32,
         .inline_cbuf = AmdGpu::Buffer::Null(),
@@ -515,37 +550,111 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
         .is_written = true,
     });
 
-    const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
-        if (inst->GetOpcode() == IR::Opcode::GetUserData) {
-            return inst;
-        }
-        return std::nullopt;
-    };
-
-    // Attempt to deduce the GDS address of counter at compile time.
-    u32 gds_addr = 0;
-    const IR::Value& gds_offset = inst.Arg(0);
-    if (gds_offset.IsImmediate()) {
-        // Nothing to do, offset is known.
-        gds_addr = gds_offset.U32() & 0xFFFF;
-    } else {
-        const auto result = IR::BreadthFirstSearch(&inst, pred);
-        ASSERT_MSG(result, "Unable to track M0 source");
-
-        // M0 must be set by some user data register.
-        const IR::Inst* prod = gds_offset.InstRecursive();
-        const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg());
-        u32 m0_val = info.user_data[ud_reg] >> 16;
-        if (prod->GetOpcode() == IR::Opcode::IAdd32) {
-            m0_val += prod->Arg(1).U32();
-        }
-        gds_addr = m0_val & 0xFFFF;
-    }
-
-    // Patch instruction.
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
-    inst.SetArg(0, ir.Imm32(gds_addr >> 2));
-    inst.SetArg(1, ir.Imm32(binding));
+
+    // For data append/consume operations attempt to deduce the GDS address.
+    if (inst.GetOpcode() == IR::Opcode::DataAppend || inst.GetOpcode() == IR::Opcode::DataConsume) {
+        const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
+            if (inst->GetOpcode() == IR::Opcode::GetUserData) {
+                return inst;
+            }
+            return std::nullopt;
+        };
+
+        u32 gds_addr = 0;
+        const IR::Value& gds_offset = inst.Arg(0);
+        if (gds_offset.IsImmediate()) {
+            // Nothing to do, offset is known.
+            gds_addr = gds_offset.U32() & 0xFFFF;
+        } else {
+            const auto result = IR::BreadthFirstSearch(&inst, pred);
+            ASSERT_MSG(result, "Unable to track M0 source");
+
+            // M0 must be set by some user data register.
+            const IR::Inst* prod = gds_offset.InstRecursive();
+            const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg());
+            u32 m0_val = info.user_data[ud_reg] >> 16;
+            if (prod->GetOpcode() == IR::Opcode::IAdd32) {
+                m0_val += prod->Arg(1).U32();
+            }
+            gds_addr = m0_val & 0xFFFF;
+        }
+
+        // Patch instruction.
+        inst.SetArg(0, ir.Imm32(gds_addr >> 2));
+        inst.SetArg(1, ir.Imm32(binding));
+    } else {
+        // Convert the shared memory opcode into an equivalent access to the GDS storage buffer.
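+        // Shared memory offsets are byte addresses, while the buffer accessors
+        // below index whole elements, so shift the offset right by 1/2/3 to
+        // address 16/32/64-bit elements respectively.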
+        const IR::U32 offset = IR::U32{inst.Arg(0)};
+        const IR::U32 address_words = ir.ShiftRightLogical(offset, ir.Imm32(1));
+        const IR::U32 address_dwords = ir.ShiftRightLogical(offset, ir.Imm32(2));
+        const IR::U32 address_qwords = ir.ShiftRightLogical(offset, ir.Imm32(3));
+        const IR::U32 handle = ir.Imm32(binding);
+        switch (inst.GetOpcode()) {
+        case IR::Opcode::SharedAtomicIAdd32:
+            inst.ReplaceUsesWith(ir.BufferAtomicIAdd(handle, address_dwords, inst.Arg(1), {}));
+            break;
+        case IR::Opcode::SharedAtomicIAdd64:
+            inst.ReplaceUsesWith(
+                ir.BufferAtomicIAdd(handle, address_qwords, IR::U64{inst.Arg(1)}, {}));
+            break;
+        case IR::Opcode::SharedAtomicISub32:
+            inst.ReplaceUsesWith(ir.BufferAtomicISub(handle, address_dwords, inst.Arg(1), {}));
+            break;
+        case IR::Opcode::SharedAtomicSMin32:
+        case IR::Opcode::SharedAtomicUMin32: {
+            const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
+            inst.ReplaceUsesWith(
+                ir.BufferAtomicIMin(handle, address_dwords, inst.Arg(1), is_signed, {}));
+            break;
+        }
+        case IR::Opcode::SharedAtomicSMax32:
+        case IR::Opcode::SharedAtomicUMax32: {
+            const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
+            inst.ReplaceUsesWith(
+                ir.BufferAtomicIMax(handle, address_dwords, inst.Arg(1), is_signed, {}));
+            break;
+        }
+        case IR::Opcode::SharedAtomicInc32:
+            inst.ReplaceUsesWith(ir.BufferAtomicInc(handle, address_dwords, {}));
+            break;
+        case IR::Opcode::SharedAtomicDec32:
+            inst.ReplaceUsesWith(ir.BufferAtomicDec(handle, address_dwords, {}));
+            break;
+        case IR::Opcode::SharedAtomicAnd32:
+            inst.ReplaceUsesWith(ir.BufferAtomicAnd(handle, address_dwords, inst.Arg(1), {}));
+            break;
+        case IR::Opcode::SharedAtomicOr32:
+            inst.ReplaceUsesWith(ir.BufferAtomicOr(handle, address_dwords, inst.Arg(1), {}));
+            break;
+        case IR::Opcode::SharedAtomicXor32:
+            inst.ReplaceUsesWith(ir.BufferAtomicXor(handle, address_dwords, inst.Arg(1), {}));
+            break;
+        case IR::Opcode::LoadSharedU16:
+            inst.ReplaceUsesWith(ir.LoadBufferU16(handle, address_words, {}));
+            break;
+        case IR::Opcode::LoadSharedU32:
+            inst.ReplaceUsesWith(ir.LoadBufferU32(1, handle, address_dwords, {}));
+            break;
+        case IR::Opcode::LoadSharedU64:
+            inst.ReplaceUsesWith(ir.LoadBufferU64(handle, address_qwords, {}));
+            break;
+        case IR::Opcode::WriteSharedU16:
+            ir.StoreBufferU16(handle, address_words, IR::U16{inst.Arg(1)}, {});
+            inst.Invalidate();
+            break;
+        case IR::Opcode::WriteSharedU32:
+            ir.StoreBufferU32(1, handle, address_dwords, inst.Arg(1), {});
+            inst.Invalidate();
+            break;
+        case IR::Opcode::WriteSharedU64:
+            ir.StoreBufferU64(handle, address_qwords, IR::U64{inst.Arg(1)}, {});
+            inst.Invalidate();
+            break;
+        default:
+            UNREACHABLE();
+        }
+    }
 }
 
 IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
@@ -916,8 +1025,6 @@ void ResourceTrackingPass(IR::Program& program) {
                 PatchBufferSharp(*block, inst, info, descriptors);
             } else if (IsImageInstruction(inst)) {
                 PatchImageSharp(*block, inst, info, descriptors);
-            } else if (IsDataRingInstruction(inst)) {
-                PatchDataRingAccess(*block, inst, info, descriptors);
             }
         }
     }
@@ -929,6 +1036,8 @@ void ResourceTrackingPass(IR::Program& program) {
                 PatchBufferArgs(*block, inst, info);
             } else if (IsImageInstruction(inst)) {
                 PatchImageArgs(*block, inst, info);
+            } else if (IsDataRingInstruction(inst)) {
+                PatchGlobalDataShareAccess(*block, inst, info, descriptors);
             }
         }
     }
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index a87dceb0a..079827866 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -55,6 +55,16 @@ void Visit(Info& info, const IR::Inst& inst) {
         info.shared_types |= IR::Type::U32;
         break;
     case IR::Opcode::SharedAtomicIAdd64:
+    case IR::Opcode::SharedAtomicISub64:
+    case IR::Opcode::SharedAtomicSMin64:
+    case IR::Opcode::SharedAtomicUMin64:
+    case IR::Opcode::SharedAtomicSMax64:
+    case IR::Opcode::SharedAtomicUMax64:
+    case IR::Opcode::SharedAtomicInc64:
+    case IR::Opcode::SharedAtomicDec64:
+    case IR::Opcode::SharedAtomicAnd64:
+    case IR::Opcode::SharedAtomicOr64:
+    case IR::Opcode::SharedAtomicXor64:
         info.uses_shared_int64_atomics = true;
         [[fallthrough]];
     case IR::Opcode::LoadSharedU64:
diff --git a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
index 0f80a3b28..555fd505b 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
@@ -15,6 +15,16 @@ static bool Requires16BitSharedAtomic(const IR::Inst& inst) {
 static bool Requires64BitSharedAtomic(const IR::Inst& inst) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::SharedAtomicIAdd64:
+    case IR::Opcode::SharedAtomicISub64:
+    case IR::Opcode::SharedAtomicSMin64:
+    case IR::Opcode::SharedAtomicUMin64:
+    case IR::Opcode::SharedAtomicSMax64:
+    case IR::Opcode::SharedAtomicUMax64:
+    case IR::Opcode::SharedAtomicInc64:
+    case IR::Opcode::SharedAtomicDec64:
+    case IR::Opcode::SharedAtomicAnd64:
+    case IR::Opcode::SharedAtomicOr64:
+    case IR::Opcode::SharedAtomicXor64:
         return true;
     default:
         return false;
diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
index a6900e180..b84011acc 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
@@ -17,7 +17,6 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     case IR::Opcode::WriteSharedU32:
     case IR::Opcode::WriteSharedU64:
     case IR::Opcode::SharedAtomicIAdd32:
-    case IR::Opcode::SharedAtomicIAdd64:
     case IR::Opcode::SharedAtomicISub32:
     case IR::Opcode::SharedAtomicSMin32:
     case IR::Opcode::SharedAtomicUMin32:
@@ -28,6 +27,17 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     case IR::Opcode::SharedAtomicAnd32:
     case IR::Opcode::SharedAtomicOr32:
     case IR::Opcode::SharedAtomicXor32:
+    case IR::Opcode::SharedAtomicIAdd64:
+    case IR::Opcode::SharedAtomicISub64:
+    case IR::Opcode::SharedAtomicSMin64:
+    case IR::Opcode::SharedAtomicUMin64:
+    case IR::Opcode::SharedAtomicSMax64:
+    case IR::Opcode::SharedAtomicUMax64:
+    case IR::Opcode::SharedAtomicInc64:
+    case IR::Opcode::SharedAtomicDec64:
+    case IR::Opcode::SharedAtomicAnd64:
+    case IR::Opcode::SharedAtomicOr64:
+    case IR::Opcode::SharedAtomicXor64:
         return true;
     default:
         return false;
@@ -64,6 +74,16 @@ IR::Type CalculateSharedMemoryTypes(IR::Program& program) {
         case IR::Opcode::LoadSharedU64:
         case IR::Opcode::WriteSharedU64:
         case IR::Opcode::SharedAtomicIAdd64:
+        case IR::Opcode::SharedAtomicISub64:
+        case IR::Opcode::SharedAtomicSMin64:
+        case IR::Opcode::SharedAtomicUMin64:
+        case IR::Opcode::SharedAtomicSMax64:
+        case IR::Opcode::SharedAtomicUMax64:
+        case IR::Opcode::SharedAtomicInc64:
+        case IR::Opcode::SharedAtomicDec64:
+        case IR::Opcode::SharedAtomicAnd64:
+        case IR::Opcode::SharedAtomicOr64:
+        case IR::Opcode::SharedAtomicXor64:
            used_types |= IR::Type::U64;
            break;
        default:
@@ -119,19 +139,26 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
                continue;
            case IR::Opcode::SharedAtomicISub32:
+            case IR::Opcode::SharedAtomicISub64:
                inst.ReplaceUsesWithAndRemove(
                    ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
                continue;
            case IR::Opcode::SharedAtomicSMin32:
-            case IR::Opcode::SharedAtomicUMin32: {
-                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
+            case IR::Opcode::SharedAtomicUMin32:
+            case IR::Opcode::SharedAtomicSMin64:
+            case IR::Opcode::SharedAtomicUMin64: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32 ||
+                                       inst.GetOpcode() == IR::Opcode::SharedAtomicSMin64;
                inst.ReplaceUsesWithAndRemove(
                    ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
                continue;
            }
            case IR::Opcode::SharedAtomicSMax32:
-            case IR::Opcode::SharedAtomicUMax32: {
-                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
+            case IR::Opcode::SharedAtomicUMax32:
+            case IR::Opcode::SharedAtomicSMax64:
+            case IR::Opcode::SharedAtomicUMax64: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32 ||
+                                       inst.GetOpcode() == IR::Opcode::SharedAtomicSMax64;
                inst.ReplaceUsesWithAndRemove(
                    ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
                continue;
@@ -143,12 +170,15 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
                continue;
            case IR::Opcode::SharedAtomicAnd32:
+            case IR::Opcode::SharedAtomicAnd64:
                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
                continue;
            case IR::Opcode::SharedAtomicOr32:
+            case IR::Opcode::SharedAtomicOr64:
                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
                continue;
            case IR::Opcode::SharedAtomicXor32:
+            case IR::Opcode::SharedAtomicXor64:
                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
                continue;
            case IR::Opcode::LoadSharedU16:
@@ -173,7 +203,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                inst.Invalidate();
                break;
            default:
-                break;
+                UNREACHABLE();
            }
        }
    }
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index c1110e54d..28444ac60 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -48,6 +48,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
     memory_tracker = std::make_unique<MemoryTracker>(tracker);
 
+    std::memset(gds_buffer.mapped_data.data(), 0, DataShareBufferSize);
+
     // Ensure the first slot is used for the null buffer
     const auto null_id =
         slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16);
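
Editor's note (not part of the patch): the GDS routing above rests on one mechanism — the translator packs the DS instruction's `gds` control bit into the IR instruction's 32-bit flags word via `Flags{is_gds}`, and `IsDataRingInstruction()` later reads it back with `inst.Flags<bool>()` to decide whether a "shared" access really targets the global data share. The stand-alone sketch below illustrates that round-trip under simplified assumptions; `InstSketch` and `SetFlags` are hypothetical stand-ins, not the project's real `IR::Inst` API.

#include <cstdint>
#include <cstring>

// Simplified stand-in for IR::Inst's flags storage: a small trivially-copyable
// value is memcpy'd into a single 32-bit word and read back by type.
struct InstSketch {
    uint32_t flags = 0;

    template <typename FlagsType>
    void SetFlags(FlagsType value) { // translator side: Inst(op, Flags{is_gds}, ...)
        static_assert(sizeof(FlagsType) <= sizeof(uint32_t));
        std::memcpy(&flags, &value, sizeof(value));
    }

    template <typename FlagsType>
    FlagsType Flags() const { // pass side: inst.Flags<bool>()
        FlagsType value{};
        std::memcpy(&value, &flags, sizeof(value));
        return value;
    }
};

int main() {
    InstSketch shared_atomic;
    shared_atomic.SetFlags(true); // DS op translated with inst.control.ds.gds set
    // IsDataRingInstruction() would now route this op to the GDS storage buffer
    // instead of workgroup-local shared memory.
    return shared_atomic.Flags<bool>() ? 0 : 1;
}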