shader_recompiler: Additional scope handling and user data as push constants (#1013)

* shader_recompiler: Use push constants for user data regs
* shader: Add some GR2 instructions
* shader: Add some instructions
* shader: Add instructions for Knack
* touchups
* spirv: Better names
* buffer_cache: Ignore non-GPU-modified images
* clang format
* Add log
* more fixes
2025-07-07 09:36:21 +00:00 · 2024-09-23 09:55:43 +03:00 · 2024-09-23 09:55:43 +03:00 · ee38eec7fe
commit ee38eec7fe
parent fb5bc371cb
23 changed files with 180 additions and 87 deletions
--- a/src/shader_recompiler/frontend/control_flow_graph.cpp
+++ b/src/shader_recompiler/frontend/control_flow_graph.cpp
@ -23,7 +23,6 @@ struct Compare {

 static IR::Condition MakeCondition(const GcnInst& inst) {
    if (inst.IsCmpx()) {
-        ASSERT(inst.opcode == Opcode::V_CMPX_NE_U32);
        return IR::Condition::Execnz;
    }

@ -99,7 +98,7 @@ void CFG::EmitDivergenceLabels() {
               // with SAVEEXEC to mask the threads that didn't pass the condition
               // of initial branch.
               (inst.opcode == Opcode::S_ANDN2_B64 && inst.dst[0].field == OperandField::ExecLo) ||
-               inst.opcode == Opcode::V_CMPX_NE_U32;
+               inst.IsCmpx();
    };
    const auto is_close_scope = [](const GcnInst& inst) {
        // Closing an EXEC scope can be either a branch instruction
@ -109,7 +108,7 @@ void CFG::EmitDivergenceLabels() {
               // Sometimes compiler might insert instructions between the SAVEEXEC and the branch.
               // Those instructions need to be wrapped in the condition as well so allow branch
               // as end scope instruction.
-               inst.opcode == Opcode::S_CBRANCH_EXECZ ||
+               inst.opcode == Opcode::S_CBRANCH_EXECZ || inst.opcode == Opcode::S_ENDPGM ||
               (inst.opcode == Opcode::S_ANDN2_B64 && inst.dst[0].field == OperandField::ExecLo);
    };

@ -127,7 +126,8 @@ void CFG::EmitDivergenceLabels() {
        s32 curr_begin = -1;
        for (size_t index = GetIndex(start); index < end_index; index++) {
            const auto& inst = inst_list[index];
-            if (is_close_scope(inst) && curr_begin != -1) {
+            const bool is_close = is_close_scope(inst);
+            if ((is_close || index == end_index - 1) && curr_begin != -1) {
                // If there are no instructions inside scope don't do anything.
                if (index - curr_begin == 1) {
                    curr_begin = -1;
@ -138,8 +138,16 @@ void CFG::EmitDivergenceLabels() {
                const auto& save_inst = inst_list[curr_begin];
                const Label label = index_to_pc[curr_begin] + save_inst.length;
                AddLabel(label);
-                // Add a label to the close scope instruction as well.
-                AddLabel(index_to_pc[index]);
+                // Add a label to the close scope instruction.
+                // There are 3 cases where we need to close a scope.
+                // * Close scope instruction inside the block
+                // * Close scope instruction at the end of the block (cbranch or endpgm)
+                // * Normal instruction at the end of the block
+                // For the last case we must NOT add a label as that would cause
+                // the instruction to be separated into its own basic block.
+                if (is_close) {
+                    AddLabel(index_to_pc[index]);
+                }
                // Reset scope begin.
                curr_begin = -1;
            }
@ -194,7 +202,7 @@ void CFG::LinkBlocks() {
        const auto end_inst{block.end_inst};
        // Handle divergence block inserted here.
        if (end_inst.opcode == Opcode::S_AND_SAVEEXEC_B64 ||
-            end_inst.opcode == Opcode::S_ANDN2_B64 || end_inst.opcode == Opcode::V_CMPX_NE_U32) {
+            end_inst.opcode == Opcode::S_ANDN2_B64 || end_inst.IsCmpx()) {
            // Blocks are stored ordered by address in the set
            auto next_it = std::next(it);
            auto* target_block = &(*next_it);
--- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
@ -281,6 +281,12 @@ void Translator::S_AND_B64(NegateMode negate, const GcnInst& inst) {
            return ir.GetExec();
        case OperandField::ScalarGPR:
            return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
+        case OperandField::ConstZero:
+            return ir.Imm1(false);
+        case OperandField::SignedConstIntNeg:
+            ASSERT_MSG(-s32(operand.code) + SignedConstIntNegMin - 1 == -1,
+                       "SignedConstIntNeg must be -1");
+            return ir.Imm1(true);
        default:
            UNREACHABLE();
        }
@ -506,6 +512,8 @@ void Translator::S_NOT_B64(const GcnInst& inst) {
            return ir.GetExec();
        case OperandField::ScalarGPR:
            return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code));
+        case OperandField::ConstZero:
+            return ir.Imm1(false);
        default:
            UNREACHABLE();
        }
@ -520,6 +528,9 @@ void Translator::S_NOT_B64(const GcnInst& inst) {
    case OperandField::ScalarGPR:
        ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[0].code), result);
        break;
+    case OperandField::ExecLo:
+        ir.SetExec(result);
+        break;
    default:
        UNREACHABLE();
    }
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@ -174,7 +174,7 @@ T Translator::GetSrc(const InstOperand& operand) {
            value = ir.IAbs(value);
        }
        if (operand.input_modifier.neg) {
-            UNREACHABLE();
+            value = ir.INeg(value);
        }
    }
    return value;
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@ -155,6 +155,7 @@ public:
    void V_SUBREV_I32(const GcnInst& inst);
    void V_ADDC_U32(const GcnInst& inst);
    void V_LDEXP_F32(const GcnInst& inst);
+    void V_CVT_PKNORM_U16_F32(const GcnInst& inst);
    void V_CVT_PKRTZ_F16_F32(const GcnInst& inst);

    // VOP1
--- a/src/shader_recompiler/frontend/translate/vector_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp
@ -89,6 +89,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
        return V_ADDC_U32(inst);
    case Opcode::V_LDEXP_F32:
        return V_LDEXP_F32(inst);
+    case Opcode::V_CVT_PKNORM_U16_F32:
+        return V_CVT_PKNORM_U16_F32(inst);
    case Opcode::V_CVT_PKRTZ_F16_F32:
        return V_CVT_PKRTZ_F16_F32(inst);

@ -244,6 +246,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
        //     V_CMPX_{OP8}_I32
    case Opcode::V_CMPX_LT_I32:
        return V_CMP_U32(ConditionOp::LT, true, true, inst);
+    case Opcode::V_CMPX_EQ_I32:
+        return V_CMP_U32(ConditionOp::EQ, true, true, inst);
    case Opcode::V_CMPX_GT_I32:
        return V_CMP_U32(ConditionOp::GT, true, true, inst);
    case Opcode::V_CMPX_LG_I32:
@ -583,6 +587,15 @@ void Translator::V_LDEXP_F32(const GcnInst& inst) {
    SetDst(inst.dst[0], ir.FPLdexp(src0, src1));
 }

+void Translator::V_CVT_PKNORM_U16_F32(const GcnInst& inst) { // Scales two F32 sources by 65535, converts each to U32, and combines them into one 32-bit destination VGPR.
+    const IR::F32 src0{GetSrc<IR::F32>(inst.src[0])};
+    const IR::F32 src1{GetSrc<IR::F32>(inst.src[1])};
+    const IR::U32 dst0 = ir.ConvertFToU(32, ir.FPMul(src0, ir.Imm32(65535.f))); // NOTE(review): no clamp to [0, 1] before scaling -- confirm out-of-range/negative inputs are acceptable here.
+    const IR::U32 dst1 = ir.ConvertFToU(32, ir.FPMul(src1, ir.Imm32(65535.f))); // Same scale-and-convert applied to the second source.
+    const IR::VectorReg dst_reg{inst.dst[0].code};
+    ir.SetVectorReg(dst_reg, ir.BitFieldInsert(dst0, dst1, ir.Imm32(16), ir.Imm32(16))); // Presumably (base, insert, offset, count): dst1 goes into bits 16..31, dst0 supplies the remaining bits -- TODO confirm argument order.
+}
+
 void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) {
    const IR::Value vec_f32 =
        ir.CompositeConstruct(GetSrc<IR::F32>(inst.src[0]), GetSrc<IR::F32>(inst.src[1]));
@ -1046,6 +1059,11 @@ void Translator::V_LSHL_B64(const GcnInst& inst) {
    const IR::U64 src0{GetSrc64(inst.src[0])};
    const IR::U64 src1{GetSrc64(inst.src[1])};
    const IR::VectorReg dst_reg{inst.dst[0].code};
+    if (src0.IsImmediate() && src0.U64() == -1) {
+        ir.SetVectorReg(dst_reg, ir.Imm32(0xFFFFFFFF));
+        ir.SetVectorReg(dst_reg + 1, ir.Imm32(0xFFFFFFFF));
+        return;
+    }
    ASSERT_MSG(src0.IsImmediate() && src0.U64() == 0 && src1.IsImmediate() && src1.U64() == 0,
               "V_LSHL_B64 with non-zero src0 or src1 is not supported");
    ir.SetVectorReg(dst_reg, ir.Imm32(0));