shader_recompiler: Implement data share append and consume operations (#814)

* shader_recompiler: Add more format swap modes

* texture_cache: Handle stencil texture reads

* emulator: Support loading font library

* readme: Add thanks section

* shader_recompiler: Constant buffers as integers

* shader_recompiler: Typed buffers as integers

* shader_recompiler: Separate thread bit scalars

* We can assume the guest shader never mixes them with normal SGPRs. This helps avoid errors where SSA could see an SGPR write as dominating a thread-bit read, due to how control flow is structurized, even though that is not possible in the actual control flow.

* shader_recompiler: Implement data append/consume operations

* clang format

* buffer_cache: Simplify invalidation scheme

* video_core: Remove some invalidation remnants

* adjust
TheTurtle 2024-09-07 00:14:51 +03:00 committed by GitHub
parent 649527a235
commit 13743b27fc
34 changed files with 512 additions and 272 deletions

View file

@@ -152,4 +152,20 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co
return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicExchange);
}
Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
auto& buffer = ctx.buffers[binding];
const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value,
ctx.ConstU32(gds_addr));
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
}
Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
auto& buffer = ctx.buffers[binding];
const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value,
ctx.ConstU32(gds_addr));
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);
}
} // namespace Shader::Backend::SPIRV
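For intuition, ds_append and ds_consume behave like pre-increment and pre-decrement atomics on a 32-bit GDS counter, which is exactly what the OpAtomicIIncrement/OpAtomicIDecrement calls above express. A minimal host-side analogue of the semantics (an illustrative sketch, not emulator code):

#include <atomic>
#include <cstdint>

std::atomic<uint32_t> gds_counter{0};

// Both operations return the counter value *before* the update,
// which is what the shader receives in its destination register.
uint32_t DataAppend() {
    return gds_counter.fetch_add(1);
}

uint32_t DataConsume() {
    return gds_counter.fetch_sub(1);
}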

View file

@@ -133,10 +133,6 @@ Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
return ctx.OpLoad(buffer.data_types->Get(1), ptr);
}
Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index) {
return ctx.OpBitcast(ctx.U32[1], EmitReadConstBuffer(ctx, handle, index));
}
Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
return ctx.OpLoad(
ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]),
@@ -222,12 +218,8 @@ void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 elemen
ctx.OpStore(pointer, ctx.OpBitcast(ctx.F32[1], value));
}
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferF32(ctx, inst, handle, address);
}
template <u32 N>
static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) {
static Id EmitLoadBufferU32xN(EmitContext& ctx, u32 handle, Id address) {
auto& buffer = ctx.buffers[handle];
address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
@@ -246,20 +238,20 @@ static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) {
}
}
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferF32xN<1>(ctx, handle, address);
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferU32xN<1>(ctx, handle, address);
}
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferF32xN<2>(ctx, handle, address);
Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferU32xN<2>(ctx, handle, address);
}
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferF32xN<3>(ctx, handle, address);
Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferU32xN<3>(ctx, handle, address);
}
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferF32xN<4>(ctx, handle, address);
Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
return EmitLoadBufferU32xN<4>(ctx, handle, address);
}
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
@@ -275,7 +267,7 @@ Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addr
}
template <u32 N>
static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) {
static void EmitStoreBufferU32xN(EmitContext& ctx, u32 handle, Id address, Id value) {
auto& buffer = ctx.buffers[handle];
address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
@@ -287,29 +279,25 @@ static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id va
const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i));
const Id ptr =
ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index_i);
ctx.OpStore(ptr, ctx.OpCompositeExtract(ctx.F32[1], value, i));
ctx.OpStore(ptr, ctx.OpCompositeExtract(buffer.data_types->Get(1), value, i));
}
}
}
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferF32xN<1>(ctx, handle, address, value);
}
void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferF32xN<2>(ctx, handle, address, value);
}
void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferF32xN<3>(ctx, handle, address, value);
}
void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferF32xN<4>(ctx, handle, address, value);
}
void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferF32xN<1>(ctx, handle, address, value);
EmitStoreBufferU32xN<1>(ctx, handle, address, value);
}
void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferU32xN<2>(ctx, handle, address, value);
}
void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferU32xN<3>(ctx, handle, address, value);
}
void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferU32xN<4>(ctx, handle, address, value);
}
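Both the load and store helpers above index the buffer as an array of 32-bit words: the byte address is offset by the sharp's base offset, then shifted right by two. A sketch of the arithmetic (names are illustrative, not recompiler code):

#include <cstdint>

// Mirrors the OpIAdd/OpShiftRightLogical sequence in EmitLoadBufferU32xN and
// EmitStoreBufferU32xN: byte address -> dword index, plus i for component i.
uint32_t DwordIndex(uint32_t byte_address, uint32_t buffer_offset, uint32_t component) {
    const uint32_t address = byte_address + buffer_offset;
    const uint32_t index = address >> 2; // assumes 4-byte-aligned access
    return index + component;
}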
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {

View file

@@ -64,25 +64,16 @@ void EmitGetGotoVariable(EmitContext& ctx);
void EmitSetScc(EmitContext& ctx);
Id EmitReadConst(EmitContext& ctx);
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index);
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferFormatF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferFormatF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
@@ -406,12 +397,13 @@ Id EmitImageAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords,
Id EmitImageAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value);
Id EmitImageAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value);
Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value);
Id EmitLaneId(EmitContext& ctx);
Id EmitWarpId(EmitContext& ctx);
Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index);
Id EmitReadFirstLane(EmitContext& ctx, Id value);
Id EmitReadLane(EmitContext& ctx, Id value, u32 lane);
Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane);
Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding);
Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding);
} // namespace Shader::Backend::SPIRV

View file

@@ -43,6 +43,10 @@ void Translator::EmitDataShare(const GcnInst& inst) {
return DS_MIN_U32(inst, false, true);
case Opcode::DS_MAX_RTN_U32:
return DS_MAX_U32(inst, false, true);
case Opcode::DS_APPEND:
return DS_APPEND(inst);
case Opcode::DS_CONSUME:
return DS_CONSUME(inst);
default:
LogMissingOpcode(inst);
}
@@ -192,4 +196,18 @@ void Translator::V_WRITELANE_B32(const GcnInst& inst) {
ir.SetVectorReg(dst, ir.WriteLane(old_value, value, lane));
}
void Translator::DS_APPEND(const GcnInst& inst) {
const u32 inst_offset = inst.control.ds.offset0;
const IR::U32 gds_offset = ir.IAdd(ir.GetM0(), ir.Imm32(inst_offset));
const IR::U32 prev = ir.DataAppend(gds_offset);
SetDst(inst.dst[0], prev);
}
void Translator::DS_CONSUME(const GcnInst& inst) {
const u32 inst_offset = inst.control.ds.offset0;
const IR::U32 gds_offset = ir.IAdd(ir.GetM0(), ir.Imm32(inst_offset));
const IR::U32 prev = ir.DataConsume(gds_offset);
SetDst(inst.dst[0], prev);
}
} // namespace Shader::Gcn

View file

@@ -31,6 +31,12 @@ void Translator::EmitExport(const GcnInst& inst) {
case MrtSwizzle::Alt:
static constexpr std::array<u32, 4> AltSwizzle = {2, 1, 0, 3};
return AltSwizzle[comp];
case MrtSwizzle::Reverse:
static constexpr std::array<u32, 4> RevSwizzle = {3, 2, 1, 0};
return RevSwizzle[comp];
case MrtSwizzle::ReverseAlt:
static constexpr std::array<u32, 4> AltRevSwizzle = {3, 0, 1, 2};
return AltRevSwizzle[comp];
default:
UNREACHABLE();
}
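Each swizzle table maps an export component index to the component it is replaced with, so applying one is a plain table lookup. A hedged sketch of the remap (illustrative only):

#include <array>
#include <cstdint>

// Remap a 4-component color through a swizzle table such as RevSwizzle above.
std::array<float, 4> ApplySwizzle(const std::array<float, 4>& color,
                                  const std::array<uint32_t, 4>& swizzle) {
    return {color[swizzle[0]], color[swizzle[1]], color[swizzle[2]], color[swizzle[3]]};
}

With RevSwizzle = {3, 2, 1, 0}, this turns an RGBA color into ABGR.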

View file

@@ -73,9 +73,13 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
case Opcode::S_SUB_I32:
return S_SUB_U32(inst);
case Opcode::S_MIN_U32:
return S_MIN_U32(inst);
return S_MIN_U32(false, inst);
case Opcode::S_MIN_I32:
return S_MIN_U32(true, inst);
case Opcode::S_MAX_U32:
return S_MAX_U32(inst);
return S_MAX_U32(false, inst);
case Opcode::S_MAX_I32:
return S_MAX_U32(true, inst);
case Opcode::S_WQM_B64:
break;
default:
@@ -533,18 +537,18 @@ void Translator::S_ADDC_U32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), carry));
}
void Translator::S_MAX_U32(const GcnInst& inst) {
void Translator::S_MAX_U32(bool is_signed, const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result = ir.UMax(src0, src1);
const IR::U32 result = ir.IMax(src0, src1, is_signed);
SetDst(inst.dst[0], result);
ir.SetScc(ir.IEqual(result, src0));
}
void Translator::S_MIN_U32(const GcnInst& inst) {
void Translator::S_MIN_U32(bool is_signed, const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result = ir.UMin(src0, src1);
const IR::U32 result = ir.IMin(src0, src1, is_signed);
SetDst(inst.dst[0], result);
ir.SetScc(ir.IEqual(result, src0));
}
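The merged handlers differ from the old ones only in choosing a signed or unsigned comparison via is_signed, and SCC records whether the first source operand was the one selected. Scalar-level semantics as a sketch (illustrative, not translator code):

#include <algorithm>
#include <cstdint>

struct MinMaxResult {
    uint32_t value;
    bool scc; // SCC is set when src0 was selected
};

MinMaxResult ScalarMax(uint32_t src0, uint32_t src1, bool is_signed) {
    const uint32_t value = is_signed
        ? static_cast<uint32_t>(std::max(static_cast<int32_t>(src0), static_cast<int32_t>(src1)))
        : std::max(src0, src1);
    return {value, value == src0};
}

ScalarMin is analogous with std::min.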

View file

@@ -101,8 +101,8 @@ public:
void S_ADDC_U32(const GcnInst& inst);
void S_MULK_I32(const GcnInst& inst);
void S_ADDK_I32(const GcnInst& inst);
void S_MAX_U32(const GcnInst& inst);
void S_MIN_U32(const GcnInst& inst);
void S_MAX_U32(bool is_signed, const GcnInst& inst);
void S_MIN_U32(bool is_signed, const GcnInst& inst);
void S_CMPK(ConditionOp cond, bool is_signed, const GcnInst& inst);
// Scalar Memory
@@ -173,7 +173,7 @@ public:
void V_BCNT_U32_B32(const GcnInst& inst);
void V_COS_F32(const GcnInst& inst);
void V_MAX3_F32(const GcnInst& inst);
void V_MAX3_U32(const GcnInst& inst);
void V_MAX3_U32(bool is_signed, const GcnInst& inst);
void V_CVT_I32_F32(const GcnInst& inst);
void V_MIN_I32(const GcnInst& inst);
void V_MUL_LO_U32(const GcnInst& inst);
@@ -217,6 +217,8 @@ public:
void V_READFIRSTLANE_B32(const GcnInst& inst);
void V_READLANE_B32(const GcnInst& inst);
void V_WRITELANE_B32(const GcnInst& inst);
void DS_APPEND(const GcnInst& inst);
void DS_CONSUME(const GcnInst& inst);
void S_BARRIER();
// MIMG

View file

@@ -227,7 +227,9 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
case Opcode::V_MAX3_F32:
return V_MAX3_F32(inst);
case Opcode::V_MAX3_U32:
return V_MAX3_U32(inst);
return V_MAX3_U32(false, inst);
case Opcode::V_MAX3_I32:
return V_MAX3_U32(true, inst);
case Opcode::V_TRUNC_F32:
return V_TRUNC_F32(inst);
case Opcode::V_CEIL_F32:
@@ -831,11 +833,11 @@ void Translator::V_MAX3_F32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2)));
}
void Translator::V_MAX3_U32(const GcnInst& inst) {
void Translator::V_MAX3_U32(bool is_signed, const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 src2{GetSrc(inst.src[2])};
SetDst(inst.dst[0], ir.UMax(src0, ir.UMax(src1, src2)));
SetDst(inst.dst[0], ir.IMax(src0, ir.IMax(src1, src2, is_signed), is_signed));
}
void Translator::V_CVT_I32_F32(const GcnInst& inst) {
@@ -967,14 +969,29 @@ void Translator::V_FFBL_B32(const GcnInst& inst) {
}
void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
if (!is_low) {
ASSERT(src0.IsImmediate() && src0.U32() == ~0U && src1.IsImmediate() && src1.U32() == 0U);
return;
// v_mbcnt_hi_u32_b32 v2, -1, 0
if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193 &&
inst.src[1].field == OperandField::ConstZero) {
return;
}
// v_mbcnt_hi_u32_b32 vX, exec_hi, 0
if (inst.src[0].field == OperandField::ExecHi &&
inst.src[1].field == OperandField::ConstZero) {
return;
}
} else {
// v_mbcnt_lo_u32_b32 v2, -1, vX
// Used in combination with the above to fetch the lane id in non-compute stages.
if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193) {
SetDst(inst.dst[0], ir.LaneId());
}
// v_mbcnt_lo_u32_b32 v20, exec_lo, vX
// Used in combination with the above for append buffer indexing.
if (inst.src[0].field == OperandField::ExecLo) {
SetDst(inst.dst[0], ir.Imm32(0));
}
}
ASSERT(src0.IsImmediate() && src0.U32() == ~0U);
SetDst(inst.dst[0], ir.LaneId());
}
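For reference, v_mbcnt_lo/hi count the set bits of a 64-bit mask belonging to lanes below the current one, so with mask = ~0 the lo/hi pair yields the lane id, and with mask = exec it yields the lane's index among active lanes; that is why the translator pattern-matches these idioms instead of emulating the generic op. Reference semantics (illustrative sketch):

#include <bit>
#include <cstdint>

// Count set bits of `mask` strictly below `lane` (0..63). With mask == ~0ull
// this returns the lane id itself; with mask == exec it returns the lane's
// position among the active lanes, as used for append buffer indexing.
uint32_t Mbcnt(uint64_t mask, uint32_t lane) {
    const uint64_t below = (uint64_t{1} << lane) - 1;
    return static_cast<uint32_t>(std::popcount(mask & below));
}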
void Translator::V_BFM_B32(const GcnInst& inst) {

View file

@@ -147,10 +147,6 @@ void Translator::IMAGE_GET_RESINFO(const GcnInst& inst) {
void Translator::IMAGE_SAMPLE(const GcnInst& inst) {
const auto& mimg = inst.control.mimg;
if (mimg.da) {
LOG_WARNING(Render_Vulkan, "Image instruction declares an array");
}
IR::VectorReg addr_reg{inst.src[0].code};
IR::VectorReg dest_reg{inst.dst[0].code};
const IR::ScalarReg tsharp_reg{inst.src[2].code * 4};
@@ -388,11 +384,11 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info);
const IR::VectorReg dst_reg{inst.src[1].code};
if (num_dwords == 1) {
ir.SetVectorReg(dst_reg, IR::F32{value});
ir.SetVectorReg(dst_reg, IR::U32{value});
return;
}
for (u32 i = 0; i < num_dwords; i++) {
ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
ir.SetVectorReg(dst_reg + i, IR::U32{ir.CompositeExtract(value, i)});
}
}
@@ -456,21 +452,18 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
const IR::VectorReg src_reg{inst.src[1].code};
switch (num_dwords) {
case 1:
value = ir.GetVectorReg<IR::F32>(src_reg);
value = ir.GetVectorReg(src_reg);
break;
case 2:
value = ir.CompositeConstruct(ir.GetVectorReg<IR::F32>(src_reg),
ir.GetVectorReg<IR::F32>(src_reg + 1));
value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1));
break;
case 3:
value = ir.CompositeConstruct(ir.GetVectorReg<IR::F32>(src_reg),
ir.GetVectorReg<IR::F32>(src_reg + 1),
ir.GetVectorReg<IR::F32>(src_reg + 2));
value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
ir.GetVectorReg(src_reg + 2));
break;
case 4:
value = ir.CompositeConstruct(
ir.GetVectorReg<IR::F32>(src_reg), ir.GetVectorReg<IR::F32>(src_reg + 1),
ir.GetVectorReg<IR::F32>(src_reg + 2), ir.GetVectorReg<IR::F32>(src_reg + 3));
value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
ir.GetVectorReg(src_reg + 2), ir.GetVectorReg(src_reg + 3));
break;
}
const IR::Value handle =
@@ -518,6 +511,15 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
const IR::VectorReg vaddr{inst.src[0].code};
const IR::VectorReg vdata{inst.src[1].code};
const IR::ScalarReg srsrc{inst.src[2].code * 4};
const IR::Value address = [&] -> IR::Value {
if (mubuf.idxen && mubuf.offen) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
}
if (mubuf.idxen || mubuf.offen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();
const IR::U32 soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
@@ -527,7 +529,6 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
info.offset_enable.Assign(mubuf.offen);
IR::Value vdata_val = ir.GetVectorReg<Shader::IR::U32>(vdata);
const IR::U32 address = ir.GetVectorReg(vaddr);
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(srsrc), ir.GetScalarReg(srsrc + 1),
ir.GetScalarReg(srsrc + 2), ir.GetScalarReg(srsrc + 3));

View file

@@ -37,12 +37,13 @@ struct BufferResource {
u32 dword_offset;
IR::Type used_types;
AmdGpu::Buffer inline_cbuf;
bool is_gds_buffer{};
bool is_instance_data{};
bool is_written{};
bool IsStorage(AmdGpu::Buffer buffer) const noexcept {
static constexpr size_t MaxUboSize = 65536;
return buffer.GetSize() > MaxUboSize || is_written;
return buffer.GetSize() > MaxUboSize || is_written || is_gds_buffer;
}
constexpr AmdGpu::Buffer GetSharp(const Info& info) const noexcept;

View file

@@ -147,6 +147,7 @@ public:
/// Intrusively store the value of a register in the block.
std::array<Value, NumScalarRegs> ssa_sreg_values;
std::array<Value, NumScalarRegs> ssa_sbit_values;
std::array<Value, NumVectorRegs> ssa_vreg_values;
bool has_multiple_predecessors{false};

View file

@@ -313,21 +313,21 @@ U32 IREmitter::ReadConst(const Value& base, const U32& offset) {
return Inst<U32>(Opcode::ReadConst, base, offset);
}
F32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {
return Inst<F32>(Opcode::ReadConstBuffer, handle, index);
U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {
return Inst<U32>(Opcode::ReadConstBuffer, handle, index);
}
Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& address,
BufferInstInfo info) {
switch (num_dwords) {
case 1:
return Inst(Opcode::LoadBufferF32, Flags{info}, handle, address);
return Inst(Opcode::LoadBufferU32, Flags{info}, handle, address);
case 2:
return Inst(Opcode::LoadBufferF32x2, Flags{info}, handle, address);
return Inst(Opcode::LoadBufferU32x2, Flags{info}, handle, address);
case 3:
return Inst(Opcode::LoadBufferF32x3, Flags{info}, handle, address);
return Inst(Opcode::LoadBufferU32x3, Flags{info}, handle, address);
case 4:
return Inst(Opcode::LoadBufferF32x4, Flags{info}, handle, address);
return Inst(Opcode::LoadBufferU32x4, Flags{info}, handle, address);
default:
UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
}
@@ -341,17 +341,16 @@ void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& ad
const Value& data, BufferInstInfo info) {
switch (num_dwords) {
case 1:
Inst(data.Type() == Type::F32 ? Opcode::StoreBufferF32 : Opcode::StoreBufferU32,
Flags{info}, handle, address, data);
Inst(Opcode::StoreBufferU32, Flags{info}, handle, address, data);
break;
case 2:
Inst(Opcode::StoreBufferF32x2, Flags{info}, handle, address, data);
Inst(Opcode::StoreBufferU32x2, Flags{info}, handle, address, data);
break;
case 3:
Inst(Opcode::StoreBufferF32x3, Flags{info}, handle, address, data);
Inst(Opcode::StoreBufferU32x3, Flags{info}, handle, address, data);
break;
case 4:
Inst(Opcode::StoreBufferF32x4, Flags{info}, handle, address, data);
Inst(Opcode::StoreBufferU32x4, Flags{info}, handle, address, data);
break;
default:
UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
@@ -410,6 +409,14 @@ void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, con
Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
}
U32 IREmitter::DataAppend(const U32& counter) {
return Inst<U32>(Opcode::DataAppend, counter, Imm32(0));
}
U32 IREmitter::DataConsume(const U32& counter) {
return Inst<U32>(Opcode::DataConsume, counter, Imm32(0));
}
U32 IREmitter::LaneId() {
return Inst<U32>(Opcode::LaneId);
}

View file

@@ -90,7 +90,7 @@ public:
[[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed);
[[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
[[nodiscard]] F32 ReadConstBuffer(const Value& handle, const U32& index);
[[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index);
[[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,
BufferInstInfo info);
@@ -120,6 +120,8 @@ public:
[[nodiscard]] Value BufferAtomicSwap(const Value& handle, const Value& address,
const Value& value, BufferInstInfo info);
[[nodiscard]] U32 DataAppend(const U32& counter);
[[nodiscard]] U32 DataConsume(const U32& counter);
[[nodiscard]] U32 LaneId();
[[nodiscard]] U32 WarpId();
[[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index);

View file

@@ -51,12 +51,11 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::Discard:
case Opcode::DiscardCond:
case Opcode::SetAttribute:
case Opcode::StoreBufferF32:
case Opcode::StoreBufferF32x2:
case Opcode::StoreBufferF32x3:
case Opcode::StoreBufferF32x4:
case Opcode::StoreBufferFormatF32:
case Opcode::StoreBufferU32:
case Opcode::StoreBufferU32x2:
case Opcode::StoreBufferU32x3:
case Opcode::StoreBufferU32x4:
case Opcode::StoreBufferFormatF32:
case Opcode::BufferAtomicIAdd32:
case Opcode::BufferAtomicSMin32:
case Opcode::BufferAtomicUMin32:
@@ -68,6 +67,8 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::BufferAtomicOr32:
case Opcode::BufferAtomicXor32:
case Opcode::BufferAtomicSwap32:
case Opcode::DataAppend:
case Opcode::DataConsume:
case Opcode::WriteSharedU128:
case Opcode::WriteSharedU64:
case Opcode::WriteSharedU32:

View file

@@ -17,8 +17,7 @@ OPCODE(DiscardCond, Void, U1,
// Constant memory operations
OPCODE(ReadConst, U32, U32x2, U32, )
OPCODE(ReadConstBuffer, F32, Opaque, U32, )
OPCODE(ReadConstBufferU32, U32, Opaque, U32, )
OPCODE(ReadConstBuffer, U32, Opaque, U32, )
// Barriers
OPCODE(Barrier, Void, )
@@ -77,21 +76,19 @@ OPCODE(UndefU32, U32,
OPCODE(UndefU64, U64, )
// Buffer operations
OPCODE(LoadBufferF32, F32, Opaque, Opaque, )
OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, )
OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, )
OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, )
OPCODE(LoadBufferU32, U32, Opaque, Opaque, )
OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, )
OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, )
OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, )
OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, )
OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32x4, )
OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, )
OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, )
OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, )
OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, )
OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, )
OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, )
OPCODE(StoreBufferU32x4, Void, Opaque, Opaque, U32x4, )
OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, U32x4, )
// Buffer atomic operations
OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 )
@@ -101,7 +98,7 @@ OPCODE(BufferAtomicDec32, U32, Opaq
OPCODE(BufferAtomicAnd32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicOr32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicXor32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicSwap32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicSwap32, U32, Opaque, Opaque, U32, )
// Vector utility
OPCODE(CompositeConstructU32x2, U32x2, U32, U32, )
@@ -345,3 +342,5 @@ OPCODE(QuadShuffle, U32, U32,
OPCODE(ReadFirstLane, U32, U32, )
OPCODE(ReadLane, U32, U32, U32 )
OPCODE(WriteLane, U32, U32, U32, U32 )
OPCODE(DataAppend, U32, U32, U32 )
OPCODE(DataConsume, U32, U32, U32 )

View file

@@ -3,7 +3,6 @@
#include <algorithm>
#include <boost/container/small_vector.hpp>
#include "common/alignment.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/breadth_first_search.h"
@@ -42,11 +41,10 @@ bool IsBufferAtomic(const IR::Inst& inst) {
bool IsBufferStore(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::StoreBufferF32:
case IR::Opcode::StoreBufferF32x2:
case IR::Opcode::StoreBufferF32x3:
case IR::Opcode::StoreBufferF32x4:
case IR::Opcode::StoreBufferU32:
case IR::Opcode::StoreBufferU32x2:
case IR::Opcode::StoreBufferU32x3:
case IR::Opcode::StoreBufferU32x4:
return true;
default:
return IsBufferAtomic(inst);
@@ -55,25 +53,28 @@ bool IsBufferStore(const IR::Inst& inst) {
bool IsBufferInstruction(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::LoadBufferF32:
case IR::Opcode::LoadBufferF32x2:
case IR::Opcode::LoadBufferF32x3:
case IR::Opcode::LoadBufferF32x4:
case IR::Opcode::LoadBufferU32:
case IR::Opcode::LoadBufferU32x2:
case IR::Opcode::LoadBufferU32x3:
case IR::Opcode::LoadBufferU32x4:
case IR::Opcode::ReadConstBuffer:
case IR::Opcode::ReadConstBufferU32:
return true;
default:
return IsBufferStore(inst);
}
}
bool IsDataRingInstruction(const IR::Inst& inst) {
return inst.GetOpcode() == IR::Opcode::DataAppend ||
inst.GetOpcode() == IR::Opcode::DataConsume;
}
bool IsTextureBufferInstruction(const IR::Inst& inst) {
return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
}
static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
switch (num_format) {
case AmdGpu::NumberFormat::Float:
switch (data_format) {
@@ -98,19 +99,15 @@ static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_for
IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
switch (inst.GetOpcode()) {
case IR::Opcode::LoadBufferF32:
case IR::Opcode::LoadBufferF32x2:
case IR::Opcode::LoadBufferF32x3:
case IR::Opcode::LoadBufferF32x4:
case IR::Opcode::ReadConstBuffer:
case IR::Opcode::StoreBufferF32:
case IR::Opcode::StoreBufferF32x2:
case IR::Opcode::StoreBufferF32x3:
case IR::Opcode::StoreBufferF32x4:
return IR::Type::F32;
case IR::Opcode::LoadBufferU32:
case IR::Opcode::ReadConstBufferU32:
case IR::Opcode::LoadBufferU32x2:
case IR::Opcode::LoadBufferU32x3:
case IR::Opcode::LoadBufferU32x4:
case IR::Opcode::StoreBufferU32:
case IR::Opcode::StoreBufferU32x2:
case IR::Opcode::StoreBufferU32x3:
case IR::Opcode::StoreBufferU32x4:
case IR::Opcode::ReadConstBuffer:
case IR::Opcode::BufferAtomicIAdd32:
case IR::Opcode::BufferAtomicSwap32:
return IR::Type::U32;
@@ -191,6 +188,10 @@ public:
u32 Add(const BufferResource& desc) {
const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
// Only one GDS binding can exist.
if (desc.is_gds_buffer && existing.is_gds_buffer) {
return true;
}
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset &&
desc.inline_cbuf == existing.inline_cbuf;
@@ -399,8 +400,7 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
// The address of constant buffer reads can be calculated at IR emission time.
if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer ||
inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) {
if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) {
return;
}
@@ -609,6 +609,51 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
}
}
void PatchDataRingInstruction(IR::Block& block, IR::Inst& inst, Info& info,
Descriptors& descriptors) {
// Insert gds binding in the shader if it doesn't exist already.
// The buffer is used for append/consume counters.
constexpr static AmdGpu::Buffer GdsSharp{.base_address = 1};
const u32 binding = descriptors.Add(BufferResource{
.used_types = IR::Type::U32,
.inline_cbuf = GdsSharp,
.is_gds_buffer = true,
.is_written = true,
});
const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return inst;
}
return std::nullopt;
};
// Attempt to deduce the GDS address of the counter at compile time.
const u32 gds_addr = [&] {
const IR::Value& gds_offset = inst.Arg(0);
if (gds_offset.IsImmediate()) {
// Nothing to do, offset is known.
return gds_offset.U32() & 0xFFFF;
}
const auto result = IR::BreadthFirstSearch(&inst, pred);
ASSERT_MSG(result, "Unable to track M0 source");
// M0 must be set by some user data register.
const IR::Inst* prod = gds_offset.InstRecursive();
const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg());
u32 m0_val = info.user_data[ud_reg] >> 16;
if (prod->GetOpcode() == IR::Opcode::IAdd32) {
m0_val += prod->Arg(1).U32();
}
return m0_val & 0xFFFF;
}();
// Patch instruction.
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
inst.SetArg(0, ir.Imm32(gds_addr >> 2));
inst.SetArg(1, ir.Imm32(binding));
}
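A worked example of the deduction above, with hypothetical values: if M0 is sourced from a user-data SGPR holding 0x00120000 and the tracked producer is an IAdd with immediate 16, the counter's dword index resolves as follows (sketch, not recompiler code):

#include <cstdint>

uint32_t DeduceGdsDwordIndex(uint32_t user_data_value, uint32_t iadd_imm) {
    const uint32_t m0_val = user_data_value >> 16;          // GDS base is in the high half
    const uint32_t gds_addr = (m0_val + iadd_imm) & 0xFFFF; // byte address of the counter
    return gds_addr >> 2;                                   // dword index stored in Arg(0)
}

// DeduceGdsDwordIndex(0x00120000, 16) == (0x12 + 0x10) >> 2 == 0x8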
void ResourceTrackingPass(IR::Program& program) {
// Iterate resource instructions and patch them after finding the sharp.
auto& info = program.info;
@@ -625,6 +670,10 @@ void ResourceTrackingPass(IR::Program& program) {
}
if (IsImageInstruction(inst)) {
PatchImageInstruction(*block, inst, info, descriptors);
continue;
}
if (IsDataRingInstruction(inst)) {
PatchDataRingInstruction(*block, inst, info, descriptors);
}
}
}

View file

@@ -44,8 +44,17 @@ struct GotoVariable : FlagTag {
u32 index;
};
using Variant = std::variant<IR::ScalarReg, IR::VectorReg, GotoVariable, SccFlagTag, ExecFlagTag,
VccFlagTag, VccLoTag, VccHiTag, M0Tag>;
struct ThreadBitScalar : FlagTag {
ThreadBitScalar() = default;
explicit ThreadBitScalar(IR::ScalarReg sgpr_) : sgpr{sgpr_} {}
auto operator<=>(const ThreadBitScalar&) const noexcept = default;
IR::ScalarReg sgpr;
};
using Variant = std::variant<IR::ScalarReg, IR::VectorReg, GotoVariable, ThreadBitScalar,
SccFlagTag, ExecFlagTag, VccFlagTag, VccLoTag, VccHiTag, M0Tag>;
using ValueMap = std::unordered_map<IR::Block*, IR::Value>;
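Because a given SGPR is assumed to hold either a thread bit or an ordinary scalar, never both (per the commit notes), ThreadBitScalar can resolve to the same ssa_sreg_values slot used for scalar registers, as the DefTable overloads below show, while remaining a distinct variant alternative so thread-bit reads are never unified with plain scalar writes. A minimal model of the idea (illustrative types, not the recompiler's):

#include <cstdint>
#include <variant>

struct ScalarReg { uint32_t index; };
struct ThreadBitScalar { uint32_t index; };
using Variable = std::variant<ScalarReg, ThreadBitScalar>;

// Both alternatives resolve to the same storage slot, which is safe only
// because a register is never used both ways; the alternative itself still
// determines the value type (U1 vs U32) used for phis and undefs.
uint32_t SlotOf(const Variable& var) {
    return std::visit([](const auto& v) { return v.index; }, var);
}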
struct DefTable {
@@ -70,6 +79,13 @@ struct DefTable {
goto_vars[variable.index].insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, ThreadBitScalar variable) {
return block->ssa_sreg_values[RegIndex(variable.sgpr)];
}
void SetDef(IR::Block* block, ThreadBitScalar variable, const IR::Value& value) {
block->ssa_sreg_values[RegIndex(variable.sgpr)] = value;
}
const IR::Value& Def(IR::Block* block, SccFlagTag) {
return scc_flag[block];
}
@@ -173,7 +189,7 @@ public:
}
template <typename Type>
IR::Value ReadVariable(Type variable, IR::Block* root_block, bool is_thread_bit = false) {
IR::Value ReadVariable(Type variable, IR::Block* root_block) {
boost::container::small_vector<ReadState<Type>, 64> stack{
ReadState<Type>(nullptr),
ReadState<Type>(root_block),
@@ -201,7 +217,7 @@ public:
} else if (!block->IsSsaSealed()) {
// Incomplete CFG
IR::Inst* phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)};
phi->SetFlags(is_thread_bit ? IR::Type::U1 : IR::TypeOf(UndefOpcode(variable)));
phi->SetFlags(IR::TypeOf(UndefOpcode(variable)));
incomplete_phis[block].insert_or_assign(variable, phi);
stack.back().result = IR::Value{&*phi};
@@ -214,7 +230,7 @@ public:
} else {
// Break potential cycles with operandless phi
IR::Inst* const phi{&*block->PrependNewInst(block->begin(), IR::Opcode::Phi)};
phi->SetFlags(is_thread_bit ? IR::Type::U1 : IR::TypeOf(UndefOpcode(variable)));
phi->SetFlags(IR::TypeOf(UndefOpcode(variable)));
WriteVariable(variable, block, IR::Value{phi});
@@ -263,9 +279,7 @@ private:
template <typename Type>
IR::Value AddPhiOperands(Type variable, IR::Inst& phi, IR::Block* block) {
for (IR::Block* const imm_pred : block->ImmPredecessors()) {
const bool is_thread_bit =
std::is_same_v<Type, IR::ScalarReg> && phi.Flags<IR::Type>() == IR::Type::U1;
phi.AddPhiOperand(imm_pred, ReadVariable(variable, imm_pred, is_thread_bit));
phi.AddPhiOperand(imm_pred, ReadVariable(variable, imm_pred));
}
return TryRemoveTrivialPhi(phi, block, UndefOpcode(variable));
}
@@ -313,7 +327,11 @@ private:
void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
const IR::Opcode opcode{inst.GetOpcode()};
switch (opcode) {
case IR::Opcode::SetThreadBitScalarReg:
case IR::Opcode::SetThreadBitScalarReg: {
const IR::ScalarReg reg{inst.Arg(0).ScalarReg()};
pass.WriteVariable(ThreadBitScalar{reg}, block, inst.Arg(1));
break;
}
case IR::Opcode::SetScalarRegister: {
const IR::ScalarReg reg{inst.Arg(0).ScalarReg()};
pass.WriteVariable(reg, block, inst.Arg(1));
@@ -345,11 +363,15 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
case IR::Opcode::SetM0:
pass.WriteVariable(M0Tag{}, block, inst.Arg(0));
break;
case IR::Opcode::GetThreadBitScalarReg:
case IR::Opcode::GetThreadBitScalarReg: {
const IR::ScalarReg reg{inst.Arg(0).ScalarReg()};
const IR::Value value = pass.ReadVariable(ThreadBitScalar{reg}, block);
inst.ReplaceUsesWith(value);
break;
}
case IR::Opcode::GetScalarRegister: {
const IR::ScalarReg reg{inst.Arg(0).ScalarReg()};
const bool thread_bit = opcode == IR::Opcode::GetThreadBitScalarReg;
const IR::Value value = pass.ReadVariable(reg, block, thread_bit);
const IR::Value value = pass.ReadVariable(reg, block);
inst.ReplaceUsesWith(value);
break;
}