shader_recompiler: Add more instructions and fix a few thinhs

This commit is contained in:
raphaelthegreat 2024-06-05 22:22:34 +03:00
parent 728249f58d
commit ae7e6dafd5
18 changed files with 245 additions and 78 deletions

View file

@ -55,26 +55,48 @@ void Translator::S_ANDN2_B64(const GcnInst& inst) {
const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result{ir.LogicalAnd(src0, ir.LogicalNot(src1))};
SetDst(inst.dst[0], result);
ir.SetScc(result);
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ExecLo:
ir.SetExec(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
}
void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) {
// This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs)
// However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination
// SGPR we have a special IR opcode for SPGRs that act as thread masks.
ASSERT(inst.src[0].field == OperandField::VccLo);
const IR::U1 exec{ir.GetExec()};
const IR::U1 vcc{ir.GetVcc()};
// Mark destination SPGR as an EXEC context. This means we will use 1-bit
// IR instruction whenever it's loaded.
ASSERT(inst.dst[0].field == OperandField::ScalarGPR);
const u32 reg = inst.dst[0].code;
exec_contexts[reg] = true;
ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec);
switch (inst.dst[0].field) {
case OperandField::ScalarGPR: {
const u32 reg = inst.dst[0].code;
exec_contexts[reg] = true;
ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec);
break;
}
case OperandField::VccLo:
ir.SetVcc(exec);
break;
default:
UNREACHABLE();
}
// Update EXEC.
ASSERT(inst.src[0].field == OperandField::VccLo);
ir.SetExec(ir.LogicalAnd(exec, ir.GetVcc()));
ir.SetExec(ir.LogicalAnd(exec, vcc));
}
void Translator::S_MOV_B64(const GcnInst& inst) {
@ -114,9 +136,17 @@ void Translator::S_OR_B64(bool negate, const GcnInst& inst) {
if (negate) {
result = ir.LogicalNot(result);
}
ASSERT(inst.dst[0].field == OperandField::VccLo);
ir.SetVcc(result);
ir.SetScc(result);
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
}
void Translator::S_AND_B64(const GcnInst& inst) {
@ -135,9 +165,17 @@ void Translator::S_AND_B64(const GcnInst& inst) {
const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result = ir.LogicalAnd(src0, src1);
ASSERT(inst.dst[0].field == OperandField::VccLo);
ir.SetVcc(result);
ir.SetScc(result);
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
}
void Translator::S_ADD_I32(const GcnInst& inst) {
@ -169,6 +207,36 @@ void Translator::S_CSELECT_B32(const GcnInst& inst) {
SetDst(inst.dst[0], IR::U32{ir.Select(ir.GetScc(), src0, src1)});
}
void Translator::S_CSELECT_B64(const GcnInst& inst) {
const auto get_src = [&](const InstOperand& operand) {
switch (operand.field) {
case OperandField::VccLo:
return ir.GetVcc();
case OperandField::ExecLo:
return ir.GetExec();
case OperandField::ScalarGPR:
return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
case OperandField::ConstZero:
return ir.Imm1(false);
default:
UNREACHABLE();
}
};
const IR::U1 src0{get_src(inst.src[0])};
const IR::U1 src1{get_src(inst.src[1])};
const IR::U1 result{ir.Select(ir.GetScc(), src0, src1)};
switch (inst.dst[0].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
break;
default:
UNREACHABLE();
}
}
void Translator::S_BFE_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
@ -179,4 +247,12 @@ void Translator::S_BFE_U32(const GcnInst& inst) {
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
void Translator::S_LSHL_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
const IR::U32 result = ir.ShiftLeftLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F)));
SetDst(inst.dst[0], result);
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
} // namespace Shader::Gcn

View file

@ -5,30 +5,16 @@
namespace Shader::Gcn {
void Load(IR::IREmitter& ir, int num_dwords, const IR::Value& handle, IR::ScalarReg dst_reg,
const IR::U32U64& address) {
for (u32 i = 0; i < num_dwords; i++) {
if (handle.IsEmpty()) {
ir.SetScalarReg(dst_reg++, ir.ReadConst(address, ir.Imm32(i)));
} else {
const IR::U32 index = ir.IAdd(address, ir.Imm32(i));
ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(handle, index));
}
}
}
void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
const auto& smrd = inst.control.smrd;
ASSERT_MSG(smrd.imm, "Bindless texture loads unsupported");
const IR::ScalarReg sbase{inst.src[0].code * 2};
const IR::U32 offset =
smrd.imm ? ir.Imm32(smrd.offset * 4)
: IR::U32{ir.ShiftLeftLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)),
ir.Imm32(2))};
const IR::U64 base =
ir.PackUint2x32(ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1)));
const IR::U64 address = ir.IAdd(base, offset);
const IR::ScalarReg dst_reg{inst.dst[0].code};
Load(ir, num_dwords, {}, dst_reg, address);
const IR::Value base =
ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1));
IR::ScalarReg dst_reg{inst.dst[0].code};
for (u32 i = 0; i < num_dwords; i++) {
ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(smrd.offset + i)));
}
}
void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
@ -37,8 +23,11 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
const IR::U32 dword_offset =
smrd.imm ? ir.Imm32(smrd.offset) : ir.GetScalarReg(IR::ScalarReg(smrd.offset));
const IR::Value vsharp = ir.GetScalarReg(sbase);
const IR::ScalarReg dst_reg{inst.dst[0].code};
Load(ir, num_dwords, vsharp, dst_reg, dword_offset);
IR::ScalarReg dst_reg{inst.dst[0].code};
for (u32 i = 0; i < num_dwords; i++) {
const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));
ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(vsharp, index));
}
}
} // namespace Shader::Gcn

View file

@ -128,7 +128,11 @@ IR::U1U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) {
value = ir.GetExec();
break;
case OperandField::VccLo:
value = ir.GetVccLo();
if (force_flt) {
value = ir.BitCast<IR::F32>(ir.GetVccLo());
} else {
value = ir.GetVccLo();
}
break;
case OperandField::VccHi:
value = ir.GetVccHi();
@ -252,6 +256,12 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
break;
case Opcode::S_WAITCNT:
break;
case Opcode::S_LOAD_DWORDX4:
translator.S_LOAD_DWORD(4, inst);
break;
case Opcode::S_LOAD_DWORDX8:
translator.S_LOAD_DWORD(8, inst);
break;
case Opcode::S_BUFFER_LOAD_DWORD:
translator.S_BUFFER_LOAD_DWORD(1, inst);
break;
@ -352,9 +362,18 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
case Opcode::S_CMP_LG_U32:
translator.S_CMP(ConditionOp::LG, false, inst);
break;
case Opcode::S_CMP_LG_I32:
translator.S_CMP(ConditionOp::LG, true, inst);
break;
case Opcode::S_CMP_EQ_I32:
translator.S_CMP(ConditionOp::EQ, true, inst);
break;
case Opcode::S_CMP_EQ_U32:
translator.S_CMP(ConditionOp::EQ, false, inst);
break;
case Opcode::S_LSHL_B32:
translator.S_LSHL_B32(inst);
break;
case Opcode::V_CNDMASK_B32:
translator.V_CNDMASK_B32(inst);
break;
@ -505,13 +524,21 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
case Opcode::S_CSELECT_B32:
translator.S_CSELECT_B32(inst);
break;
case Opcode::S_CSELECT_B64:
translator.S_CSELECT_B64(inst);
break;
case Opcode::S_BFE_U32:
translator.S_BFE_U32(inst);
break;
case Opcode::V_RNDNE_F32:
translator.V_RNDNE_F32(inst);
break;
case Opcode::S_NOP:
case Opcode::S_CBRANCH_EXECZ:
case Opcode::S_CBRANCH_SCC0:
case Opcode::S_CBRANCH_SCC1:
case Opcode::S_CBRANCH_VCCNZ:
case Opcode::S_CBRANCH_VCCZ:
case Opcode::S_BRANCH:
case Opcode::S_WQM_B64:
case Opcode::V_INTERP_P1_F32:

View file

@ -46,7 +46,9 @@ public:
void S_AND_B32(const GcnInst& inst);
void S_LSHR_B32(const GcnInst& inst);
void S_CSELECT_B32(const GcnInst& inst);
void S_CSELECT_B64(const GcnInst& inst);
void S_BFE_U32(const GcnInst& inst);
void S_LSHL_B32(const GcnInst& inst);
// Scalar Memory
void S_LOAD_DWORD(int num_dwords, const GcnInst& inst);
@ -101,6 +103,7 @@ public:
void V_LSHR_B32(const GcnInst& inst);
void V_ASHRREV_I32(const GcnInst& inst);
void V_MAD_U32_U24(const GcnInst& inst);
void V_RNDNE_F32(const GcnInst& inst);
// Vector Memory
void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);

View file

@ -33,7 +33,7 @@ void Translator::V_CNDMASK_B32(const GcnInst& inst) {
const IR::VectorReg dst_reg{inst.dst[0].code};
const IR::ScalarReg flag_reg{inst.src[2].code};
const IR::U1 flag = inst.src[2].field == OperandField::ScalarGPR
? ir.INotEqual(ir.GetScalarReg(flag_reg), ir.Imm32(0U))
? ir.GetThreadBitScalarReg(flag_reg)
: ir.GetVcc();
// We can treat the instruction as integer most of the time, but when a source is
@ -85,21 +85,21 @@ void Translator::V_CVT_F32_U32(const GcnInst& inst) {
}
void Translator::V_MAD_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])};
const IR::F32 src1{GetSrc(inst.src[1])};
const IR::F32 src2{GetSrc(inst.src[2])};
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1], true)};
const IR::F32 src2{GetSrc(inst.src[2], true)};
SetDst(inst.dst[0], ir.FPFma(src0, src1, src2));
}
void Translator::V_FRACT_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])};
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::VectorReg dst_reg{inst.dst[0].code};
ir.SetVectorReg(dst_reg, ir.Fract(src0));
}
void Translator::V_ADD_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])};
const IR::F32 src1{GetSrc(inst.src[1])};
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1], true)};
SetDst(inst.dst[0], ir.FPAdd(src0, src1));
}
@ -114,14 +114,14 @@ void Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) {
void Translator::V_MED3_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::F32 src1{GetSrc(inst.src[1])};
const IR::F32 src2{GetSrc(inst.src[2])};
const IR::F32 src1{GetSrc(inst.src[1], true)};
const IR::F32 src2{GetSrc(inst.src[2], true)};
const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2);
SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx));
}
void Translator::V_FLOOR_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0])};
const IR::F32 src0{GetSrc(inst.src[0], true)};
const IR::VectorReg dst_reg{inst.dst[0].code};
ir.SetVectorReg(dst_reg, ir.FPFloor(src0));
}
@ -167,7 +167,17 @@ void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) {
UNREACHABLE();
}
}();
ir.SetVcc(result);
switch (inst.dst[1].field) {
case OperandField::VccLo:
ir.SetVcc(result);
break;
case OperandField::ScalarGPR:
ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), result);
break;
default:
UNREACHABLE();
}
}
void Translator::V_MAX_F32(const GcnInst& inst) {
@ -357,4 +367,9 @@ void Translator::V_MAD_U32_U24(const GcnInst& inst) {
V_MAD_I32_I24(inst);
}
void Translator::V_RNDNE_F32(const GcnInst& inst) {
const IR::F32 src0{GetSrc(inst.src[0], true)};
SetDst(inst.dst[0], ir.FPRoundEven(src0));
}
} // namespace Shader::Gcn