shader_recompiler: Proper support for inst-typed buffer format operations. (#2469)

This commit is contained in:
squidbus 2025-02-21 03:01:18 -08:00 committed by GitHub
parent 6860bb7349
commit 9424047214
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 167 additions and 207 deletions

View file

@@ -277,10 +277,9 @@ public:
// Buffer Memory
// MUBUF / MTBUF
void BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst);
void BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst);
void BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst);
void BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst);
void BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed, const GcnInst& inst);
void BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
const GcnInst& inst);
void BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst);
// Image Memory

View file

@@ -11,59 +11,59 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
// Buffer load operations
case Opcode::TBUFFER_LOAD_FORMAT_X:
return BUFFER_LOAD(1, true, inst);
return BUFFER_LOAD(1, true, false, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XY:
return BUFFER_LOAD(2, true, inst);
return BUFFER_LOAD(2, true, false, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
return BUFFER_LOAD(3, true, inst);
return BUFFER_LOAD(3, true, false, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
return BUFFER_LOAD(4, true, inst);
return BUFFER_LOAD(4, true, false, inst);
case Opcode::BUFFER_LOAD_FORMAT_X:
return BUFFER_LOAD_FORMAT(1, inst);
return BUFFER_LOAD(1, false, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_XY:
return BUFFER_LOAD_FORMAT(2, inst);
return BUFFER_LOAD(2, false, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_XYZ:
return BUFFER_LOAD_FORMAT(3, inst);
return BUFFER_LOAD(3, false, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_XYZW:
return BUFFER_LOAD_FORMAT(4, inst);
return BUFFER_LOAD(4, false, true, inst);
case Opcode::BUFFER_LOAD_DWORD:
return BUFFER_LOAD(1, false, inst);
return BUFFER_LOAD(1, false, false, inst);
case Opcode::BUFFER_LOAD_DWORDX2:
return BUFFER_LOAD(2, false, inst);
return BUFFER_LOAD(2, false, false, inst);
case Opcode::BUFFER_LOAD_DWORDX3:
return BUFFER_LOAD(3, false, inst);
return BUFFER_LOAD(3, false, false, inst);
case Opcode::BUFFER_LOAD_DWORDX4:
return BUFFER_LOAD(4, false, inst);
return BUFFER_LOAD(4, false, false, inst);
// Buffer store operations
case Opcode::BUFFER_STORE_FORMAT_X:
return BUFFER_STORE_FORMAT(1, inst);
return BUFFER_STORE(1, false, true, inst);
case Opcode::BUFFER_STORE_FORMAT_XY:
return BUFFER_STORE_FORMAT(2, inst);
return BUFFER_STORE(2, false, true, inst);
case Opcode::BUFFER_STORE_FORMAT_XYZ:
return BUFFER_STORE_FORMAT(3, inst);
return BUFFER_STORE(3, false, true, inst);
case Opcode::BUFFER_STORE_FORMAT_XYZW:
return BUFFER_STORE_FORMAT(4, inst);
return BUFFER_STORE(4, false, true, inst);
case Opcode::TBUFFER_STORE_FORMAT_X:
return BUFFER_STORE(1, true, inst);
return BUFFER_STORE(1, true, false, inst);
case Opcode::TBUFFER_STORE_FORMAT_XY:
return BUFFER_STORE(2, true, inst);
return BUFFER_STORE(2, true, false, inst);
case Opcode::TBUFFER_STORE_FORMAT_XYZ:
return BUFFER_STORE(3, true, inst);
return BUFFER_STORE(3, true, false, inst);
case Opcode::TBUFFER_STORE_FORMAT_XYZW:
return BUFFER_STORE(4, true, inst);
return BUFFER_STORE(4, true, false, inst);
case Opcode::BUFFER_STORE_DWORD:
return BUFFER_STORE(1, false, inst);
return BUFFER_STORE(1, false, false, inst);
case Opcode::BUFFER_STORE_DWORDX2:
return BUFFER_STORE(2, false, inst);
return BUFFER_STORE(2, false, false, inst);
case Opcode::BUFFER_STORE_DWORDX3:
return BUFFER_STORE(3, false, inst);
return BUFFER_STORE(3, false, false, inst);
case Opcode::BUFFER_STORE_DWORDX4:
return BUFFER_STORE(4, false, inst);
return BUFFER_STORE(4, false, false, inst);
// Buffer atomic operations
case Opcode::BUFFER_ATOMIC_ADD:
@@ -165,7 +165,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
}
}
void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const bool is_ring = mubuf.glc && mubuf.slc;
const IR::VectorReg vaddr{inst.src[0].code};
@@ -195,66 +196,38 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(is_typed);
if (is_typed) {
buffer_info.typed.Assign(is_inst_typed || is_buffer_typed);
if (is_inst_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
(dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
buffer_info.inst_data_fmt.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
buffer_info.inst_num_fmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
} else {
buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
}
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
const IR::VectorReg dst_reg{inst.src[1].code};
if (num_dwords == 1) {
ir.SetVectorReg(dst_reg, IR::U32{value});
return;
}
for (u32 i = 0; i < num_dwords; i++) {
ir.SetVectorReg(dst_reg + i, IR::U32{ir.CompositeExtract(value, i)});
if (buffer_info.typed) {
const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
for (u32 i = 0; i < num_dwords; i++) {
ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
}
} else {
const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
if (num_dwords == 1) {
ir.SetVectorReg(dst_reg, IR::U32{value});
return;
}
for (u32 i = 0; i < num_dwords; i++) {
ir.SetVectorReg(dst_reg + i, IR::U32{ir.CompositeExtract(value, i)});
}
}
}
void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
const IR::Value address = [&] -> IR::Value {
if (mubuf.idxen && mubuf.offen) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
}
if (mubuf.idxen || mubuf.offen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();
const IR::Value soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
IR::BufferInstInfo buffer_info{};
buffer_info.index_enable.Assign(mubuf.idxen);
buffer_info.offset_enable.Assign(mubuf.offen);
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(true);
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
const IR::VectorReg dst_reg{inst.src[1].code};
for (u32 i = 0; i < num_dwords; i++) {
ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
}
}
void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const bool is_ring = mubuf.glc && mubuf.slc;
const IR::VectorReg vaddr{inst.src[0].code};
@@ -285,80 +258,38 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(is_typed);
if (is_typed) {
buffer_info.typed.Assign(is_inst_typed || is_buffer_typed);
if (is_inst_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
(dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
buffer_info.inst_data_fmt.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
buffer_info.inst_num_fmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
} else {
buffer_info.inst_data_fmt.Assign(AmdGpu::DataFormat::FormatInvalid);
}
IR::Value value{};
const IR::VectorReg src_reg{inst.src[1].code};
switch (num_dwords) {
case 1:
value = ir.GetVectorReg(src_reg);
break;
case 2:
value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1));
break;
case 3:
value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
ir.GetVectorReg(src_reg + 2));
break;
case 4:
value = ir.CompositeConstruct(ir.GetVectorReg(src_reg), ir.GetVectorReg(src_reg + 1),
ir.GetVectorReg(src_reg + 2), ir.GetVectorReg(src_reg + 3));
break;
}
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
}
void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
const IR::Value address = [&] -> IR::Value {
if (mubuf.idxen && mubuf.offen) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
}
if (mubuf.idxen || mubuf.offen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();
const IR::Value soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
IR::BufferInstInfo buffer_info{};
buffer_info.index_enable.Assign(mubuf.idxen);
buffer_info.offset_enable.Assign(mubuf.offen);
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
buffer_info.typed.Assign(true);
const IR::VectorReg src_reg{inst.src[1].code};
std::array<IR::F32, 4> comps{};
boost::container::static_vector<IR::Value, 4> comps;
for (u32 i = 0; i < num_dwords; i++) {
comps[i] = ir.GetVectorReg<IR::F32>(src_reg + i);
const auto src_reg_i = src_reg + i;
if (buffer_info.typed) {
comps.push_back(ir.GetVectorReg<IR::F32>(src_reg_i));
} else {
comps.push_back(ir.GetVectorReg<IR::U32>(src_reg_i));
}
}
for (u32 i = num_dwords; i < 4; i++) {
comps[i] = ir.Imm32(0.f);
if (buffer_info.typed) {
for (u32 i = num_dwords; i < 4; i++) {
comps.push_back(ir.Imm32(0.f));
}
ir.StoreBufferFormat(handle, address, ir.CompositeConstruct(comps), buffer_info);
} else {
const auto value = num_dwords == 1 ? comps[0] : ir.CompositeConstruct(comps);
ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
}
const IR::Value value = ir.CompositeConstruct(comps[0], comps[1], comps[2], comps[3]);
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
ir.StoreBufferFormat(handle, address, value, buffer_info);
}
void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {