shader_recompiler: Small instruction parsing refactor/bugfixes (#340)

* translator: Implement f32 to f16 convert * shader_recompiler: Add bit instructions * shader_recompiler: More data share instructions * shader_recompiler: Remove exec contexts, fix S_MOV_B64 * shader_recompiler: Split instruction parsing into categories * shader_recompiler: Better BFS search * shader_recompiler: Constant propagation pass for cmp_class_f32 * shader_recompiler: Partial readfirstlane implementation * shader_recompiler: Stub readlane/writelane only for non-compute * hack: Fix swizzle on RDR * Will properly fix this when merging this * clang format * address_space: Bump user area size to full * shader_recompiler: V_INTERP_MOV_F32 * Should work the same as spirv will emit flat decoration on demand * kernel: Add MAP_OP_MAP_FLEXIBLE * image_view: Attempt to apply storage swizzle on format * vk_scheduler: Barrier attachments on renderpass end * clang format * liverpool: cs state backup * shader_recompiler: More instructions and formats * vector_alu: Proper V_MBCNT_U32_B32 * shader_recompiler: Port some dark souls things * file_system: Implement sceKernelRename * more formats * clang format * resource_tracking_pass: Back to assert * translate: Tracedata * kernel: Remove tracy lock * Solves random crashes in Dark Souls * code: Review comments
2025-07-11 20:25:55 +00:00 · 2024-07-31 00:32:40 +03:00 · 2024-07-31 00:32:40 +03:00 · a7c9bfa5c5
commit a7c9bfa5c5
parent ac6dc20c3b
66 changed files with 1349 additions and 904 deletions
--- a/src/shader_recompiler/ir/breadth_first_search.h
+++ b/src/shader_recompiler/ir/breadth_first_search.h
@ -12,16 +12,16 @@
 namespace Shader::IR {

 template <typename Pred>
-auto BreadthFirstSearch(const Value& value, Pred&& pred)
-    -> std::invoke_result_t<Pred, const Inst*> {
-    if (value.IsImmediate()) {
-        // Nothing to do with immediates
-        return std::nullopt;
+auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t<Pred, const Inst*> {
+    // Most often case the instruction is the desired already.
+    if (const std::optional result = pred(inst)) {
+        return result;
    }
+
    // Breadth-first search visiting the right most arguments first
    boost::container::small_vector<const Inst*, 2> visited;
    std::queue<const Inst*> queue;
-    queue.push(value.InstRecursive());
+    queue.push(inst);

    while (!queue.empty()) {
        // Pop one instruction from the queue
@ -49,4 +49,14 @@ auto BreadthFirstSearch(const Value& value, Pred&& pred)
    return std::nullopt;
 }

+// Value-based entry point for the search.
+// Immediates have no producing instruction, so they yield std::nullopt; any other value is
+// resolved with InstRecursive() and handed to the instruction-based overload together with
+// the caller's predicate.
+template <typename Pred>
+auto BreadthFirstSearch(const Value& value, Pred&& pred)
+    -> std::invoke_result_t<Pred, const Inst*> {
+    if (!value.IsImmediate()) {
+        // Start the walk from the instruction that produced this value
+        return BreadthFirstSearch(value.InstRecursive(), pred);
+    }
+    // Nothing to search behind an immediate
+    return std::nullopt;
+}
+
 } // namespace Shader::IR
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@ -278,7 +278,7 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
    case 32:
        return Inst<U32>(Opcode::LoadSharedU32, offset);
    case 64:
-        return Inst<U64>(Opcode::LoadSharedU64, offset);
+        return Inst(Opcode::LoadSharedU64, offset);
    case 128:
        return Inst(Opcode::LoadSharedU128, offset);
    default:
@ -373,6 +373,10 @@ U32 IREmitter::LaneId() {
    return Inst<U32>(Opcode::LaneId);
 }

+// Builds an instruction with the WarpId opcode (no operands) and returns its U32 result.
+U32 IREmitter::WarpId() {
+    const U32 warp_id = Inst<U32>(Opcode::WarpId);
+    return warp_id;
+}
+
 U32 IREmitter::QuadShuffle(const U32& value, const U32& index) {
    return Inst<U32>(Opcode::QuadShuffle, value, index);
 }
@ -876,6 +880,10 @@ U1 IREmitter::FPIsInf(const F32F64& value) {
    }
 }

+// Builds an FPCmpClass32 instruction from a 32-bit float `value` and a 32-bit `op` operand
+// and returns its U1 result.
+U1 IREmitter::FPCmpClass32(const F32& value, const U32& op) {
+    const U1 class_match = Inst<U1>(Opcode::FPCmpClass32, value, op);
+    return class_match;
+}
+
 U1 IREmitter::FPOrdered(const F32F64& lhs, const F32F64& rhs) {
    if (lhs.Type() != rhs.Type()) {
        UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type());
@ -1088,6 +1096,10 @@ U32 IREmitter::FindUMsb(const U32& value) {
    return Inst<U32>(Opcode::FindUMsb32, value);
 }

+// Builds a FindILsb32 instruction operating on `value` and returns its U32 result.
+U32 IREmitter::FindILsb(const U32& value) {
+    const U32 lsb_index = Inst<U32>(Opcode::FindILsb32, value);
+    return lsb_index;
+}
+
 U32 IREmitter::SMin(const U32& a, const U32& b) {
    return Inst<U32>(Opcode::SMin32, a, b);
 }
@ -1274,6 +1286,11 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) {
        default:
            break;
        }
+        // Leave the 16-bit case explicitly: without this break, a 16-bit request whose
+        // source type was not matched above would fall through into the 32-bit case and
+        // could emit ConvertU32U16, returning a value of the wrong width.
+        break;
+    case 32:
+        switch (value.Type()) {
+        case Type::U16:
+            // Widen a 16-bit value to 32 bits.
+            return Inst<U32>(Opcode::ConvertU32U16, value);
+        default:
+            // Other source types are not handled for a 32-bit result.
+            break;
+        }
+        break;
    default:
        break;
    }
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@ -95,6 +95,7 @@ public:
                     BufferInstInfo info);

    [[nodiscard]] U32 LaneId();
+    [[nodiscard]] U32 WarpId();
    [[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index);

    [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2);
@ -150,6 +151,7 @@ public:
    [[nodiscard]] U1 FPGreaterThan(const F32F64& lhs, const F32F64& rhs, bool ordered = true);
    [[nodiscard]] U1 FPIsNan(const F32F64& value);
    [[nodiscard]] U1 FPIsInf(const F32F64& value);
+    [[nodiscard]] U1 FPCmpClass32(const F32& value, const U32& op);
    [[nodiscard]] U1 FPOrdered(const F32F64& lhs, const F32F64& rhs);
    [[nodiscard]] U1 FPUnordered(const F32F64& lhs, const F32F64& rhs);
    [[nodiscard]] F32F64 FPMax(const F32F64& lhs, const F32F64& rhs, bool is_legacy = false);
@ -179,6 +181,7 @@ public:

    [[nodiscard]] U32 FindSMsb(const U32& value);
    [[nodiscard]] U32 FindUMsb(const U32& value);
+    [[nodiscard]] U32 FindILsb(const U32& value);
    [[nodiscard]] U32 SMin(const U32& a, const U32& b);
    [[nodiscard]] U32 UMin(const U32& a, const U32& b);
    [[nodiscard]] U32 IMin(const U32& a, const U32& b, bool is_signed);
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@ -219,6 +219,7 @@ OPCODE(FPIsNan32,                                           U1,             F32,
 OPCODE(FPIsNan64,                                           U1,             F64,                                                                            )
 OPCODE(FPIsInf32,                                           U1,             F32,                                                                            )
 OPCODE(FPIsInf64,                                           U1,             F64,                                                                            )
+OPCODE(FPCmpClass32,                                        U1,             F32,            U32                                                             )

 // Integer operations
 OPCODE(IAdd32,                                              U32,            U32,            U32,                                                            )
@ -254,6 +255,7 @@ OPCODE(BitwiseNot32,                                        U32,            U32,

 OPCODE(FindSMsb32,                                          U32,            U32,                                                                            )
 OPCODE(FindUMsb32,                                          U32,            U32,                                                                            )
+OPCODE(FindILsb32,                                          U32,            U32,                                                                            )
 OPCODE(SMin32,                                              U32,            U32,            U32,                                                            )
 OPCODE(UMin32,                                              U32,            U32,            U32,                                                            )
 OPCODE(SMax32,                                              U32,            U32,            U32,                                                            )
@ -293,6 +295,7 @@ OPCODE(ConvertF64S32,                                       F64,            U32,
 OPCODE(ConvertF64U32,                                       F64,            U32,                                                                            )
 OPCODE(ConvertF32U16,                                       F32,            U16,                                                                            )
 OPCODE(ConvertU16U32,                                       U16,            U32,                                                                            )
+OPCODE(ConvertU32U16,                                       U32,            U16,                                                                            )

 // Image operations
 OPCODE(ImageSampleImplicitLod,                              F32x4,          Opaque,         Opaque,         Opaque,         Opaque,                         )
@ -323,4 +326,5 @@ OPCODE(ImageAtomicExchange32,                               U32,            Opaq

 // Warp operations
 OPCODE(LaneId,                                              U32,                                                                                            )
+OPCODE(WarpId,                                              U32,                                                                                            )
 OPCODE(QuadShuffle,                                         U32,            U32,            U32                                                             )
--- a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
+++ b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp
@ -238,6 +238,18 @@ void FoldBooleanConvert(IR::Inst& inst) {
    }
 }

+// Rewrites an FPCmpClass32 instruction into FPIsNan32 or FPIsInf32 based on its class mask.
+// The mask (argument 1) must be an immediate. A mask that contains neither the full NaN
+// group nor the full Infinity group is treated as unreachable.
+// NOTE(review): a mask that sets bits beyond the matched group (e.g. NaN together with
+// Infinity) is folded as if only the first matching group was requested - confirm intended.
+void FoldCmpClass(IR::Inst& inst) {
+    ASSERT_MSG(inst.Arg(1).IsImmediate(), "Unable to resolve compare operation");
+    const auto requested = static_cast<IR::FloatClassFunc>(inst.Arg(1).U32());
+    // True when every bit of `group` is present in the requested mask.
+    const auto covers = [requested](IR::FloatClassFunc group) {
+        return (requested & group) == group;
+    };
+    // NaN is checked first, so it wins when both groups are present.
+    if (covers(IR::FloatClassFunc::NaN)) {
+        inst.ReplaceOpcode(IR::Opcode::FPIsNan32);
+        return;
+    }
+    if (covers(IR::FloatClassFunc::Infinity)) {
+        inst.ReplaceOpcode(IR::Opcode::FPIsInf32);
+        return;
+    }
+    UNREACHABLE();
+}
+
 void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::IAdd32:
@ -251,6 +263,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
    case IR::Opcode::IMul32:
        FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; });
        return;
+    case IR::Opcode::FPCmpClass32:
+        FoldCmpClass(inst);
+        return;
    case IR::Opcode::ShiftRightArithmetic32:
        FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return static_cast<u32>(a >> b); });
        return;
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@ -2,7 +2,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include <algorithm>
-#include <deque>
 #include <boost/container/small_vector.hpp>
 #include "shader_recompiler/ir/basic_block.h"
 #include "shader_recompiler/ir/breadth_first_search.h"
@ -273,9 +272,18 @@ std::pair<const IR::Inst*, bool> TryDisableAnisoLod0(const IR::Inst* inst) {
 }

 SharpLocation TrackSharp(const IR::Inst* inst) {
-    while (inst->GetOpcode() == IR::Opcode::Phi) {
-        inst = inst->Arg(0).InstRecursive();
-    }
+    // Search until we find a potential sharp source.
+    const auto pred0 = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
+        if (inst->GetOpcode() == IR::Opcode::GetUserData ||
+            inst->GetOpcode() == IR::Opcode::ReadConst) {
+            return inst;
+        }
+        return std::nullopt;
+    };
+    const auto result = IR::BreadthFirstSearch(inst, pred0);
+    ASSERT_MSG(result, "Unable to track sharp source");
+    inst = result.value();
+    // If its from user data not much else to do.
    if (inst->GetOpcode() == IR::Opcode::GetUserData) {
        return SharpLocation{
            .sgpr_base = u32(IR::ScalarReg::Max),
@ -289,14 +297,14 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
    const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive();

    // Retrieve SGPR pair that holds sbase
-    const auto pred = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
+    const auto pred1 = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
        if (inst->GetOpcode() == IR::Opcode::GetUserData) {
            return inst->Arg(0).ScalarReg();
        }
        return std::nullopt;
    };
-    const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred);
-    const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred);
+    const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1);
+    const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1);
    ASSERT_MSG(base0 && base1, "Nested resource loads not supported");

    // Return retrieved location.
@ -456,36 +464,26 @@ IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value&
 }

 void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
-    std::deque<IR::Inst*> insts{&inst};
-    const auto& pred = [](auto opcode) -> bool {
-        return (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
-                opcode == IR::Opcode::ReadConst ||               // IMAGE_LOAD (image only)
-                opcode == IR::Opcode::GetUserData);
+    const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
+        const auto opcode = inst->GetOpcode();
+        if (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
+            opcode == IR::Opcode::ReadConst ||               // IMAGE_LOAD (image only)
+            opcode == IR::Opcode::GetUserData) {
+            return inst;
+        }
+        return std::nullopt;
    };
-
-    IR::Inst* producer{};
-    while (!insts.empty() && (producer = insts.front(), !pred(producer->GetOpcode()))) {
-        for (auto arg_idx = 0u; arg_idx < producer->NumArgs(); ++arg_idx) {
-            const auto arg = producer->Arg(arg_idx);
-            if (arg.TryInstRecursive()) {
-                insts.push_back(arg.InstRecursive());
-            }
-        }
-        insts.pop_front();
-    }
-    ASSERT(pred(producer->GetOpcode()));
-    auto [tsharp_handle, ssharp_handle] = [&] -> std::pair<IR::Inst*, IR::Inst*> {
-        if (producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2) {
-            return std::make_pair(producer->Arg(0).InstRecursive(),
-                                  producer->Arg(1).InstRecursive());
-        }
-        return std::make_pair(producer, nullptr);
-    }();
+    const auto result = IR::BreadthFirstSearch(&inst, pred);
+    ASSERT_MSG(result, "Unable to find image sharp source");
+    const IR::Inst* producer = result.value();
+    const bool has_sampler = producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2;
+    const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer;

    // Read image sharp.
    const auto tsharp = TrackSharp(tsharp_handle);
    const auto image = info.ReadUd<AmdGpu::Image>(tsharp.sgpr_base, tsharp.dword_offset);
    const auto inst_info = inst.Flags<IR::TextureInstInfo>();
+    ASSERT(image.GetType() != AmdGpu::ImageType::Invalid);
    u32 image_binding = descriptors.Add(ImageResource{
        .sgpr_base = tsharp.sgpr_base,
        .dword_offset = tsharp.dword_offset,
@ -496,17 +494,32 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
    });

    // Read sampler sharp. This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions
-    if (ssharp_handle) {
+    const u32 sampler_binding = [&] {
+        if (!has_sampler) {
+            return 0U;
+        }
+        const IR::Value& handle = producer->Arg(1);
+        // Inline sampler resource.
+        if (handle.IsImmediate()) {
+            LOG_WARNING(Render_Vulkan, "Inline sampler detected");
+            return descriptors.Add(SamplerResource{
+                .sgpr_base = std::numeric_limits<u32>::max(),
+                .dword_offset = 0,
+                .inline_sampler = AmdGpu::Sampler{.raw0 = handle.U32()},
+            });
+        }
+        // Normal sampler resource.
+        const auto ssharp_handle = handle.InstRecursive();
        const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle);
        const auto ssharp = TrackSharp(ssharp_ud);
-        const u32 sampler_binding = descriptors.Add(SamplerResource{
+        return descriptors.Add(SamplerResource{
            .sgpr_base = ssharp.sgpr_base,
            .dword_offset = ssharp.dword_offset,
            .associated_image = image_binding,
            .disable_aniso = disable_aniso,
        });
-        image_binding |= (sampler_binding << 16);
-    }
+    }();
+    image_binding |= (sampler_binding << 16);

    // Patch image handle
    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@ -607,7 +620,7 @@ void ResourceTrackingPass(IR::Program& program) {
    // Iterate resource instructions and patch them after finding the sharp.
    auto& info = program.info;
    Descriptors descriptors{info.buffers, info.images, info.samplers};
-    for (IR::Block* const block : program.post_order_blocks) {
+    for (IR::Block* const block : program.blocks) {
        for (IR::Inst& inst : block->Instructions()) {
            if (IsBufferInstruction(inst)) {
                PatchBufferInstruction(*block, inst, info, descriptors);
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@ -20,11 +20,19 @@ void Visit(Info& info, IR::Inst& inst) {
    case IR::Opcode::LoadSharedU8:
    case IR::Opcode::WriteSharedU8:
        info.uses_shared_u8 = true;
+        info.uses_shared = true;
        break;
    case IR::Opcode::LoadSharedS16:
    case IR::Opcode::LoadSharedU16:
    case IR::Opcode::WriteSharedU16:
        info.uses_shared_u16 = true;
+        info.uses_shared = true;
+        break;
+    case IR::Opcode::LoadSharedU32:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::WriteSharedU64:
+        info.uses_shared = true;
        break;
    case IR::Opcode::ConvertF32F16:
    case IR::Opcode::BitCastF16U16:
--- a/src/shader_recompiler/ir/reg.h
+++ b/src/shader_recompiler/ir/reg.h
@ -5,6 +5,7 @@

 #include "common/assert.h"
 #include "common/bit_field.h"
+#include "common/enum.h"
 #include "common/types.h"
 #include "video_core/amdgpu/pixel_format.h"

@ -24,6 +25,23 @@ enum class FpDenormMode : u32 {
    InOutAllow = 3,
 };

+// Bit flags naming floating-point value classes. Each of the first ten enumerators occupies
+// a single bit (bit 0 through bit 9); NaN and Infinity are convenience combinations.
+enum class FloatClassFunc : u32 {
+    SignalingNan = 0x001,
+    QuietNan = 0x002,
+    NegativeInfinity = 0x004,
+    NegativeNormal = 0x008,
+    NegativeDenorm = 0x010,
+    NegativeZero = 0x020,
+    PositiveZero = 0x040,
+    PositiveDenorm = 0x080,
+    PositiveNormal = 0x100,
+    PositiveInfinity = 0x200,
+
+    // Either kind of NaN.
+    NaN = SignalingNan | QuietNan,
+    // Infinity of either sign.
+    Infinity = PositiveInfinity | NegativeInfinity,
+};
+// Presumably provides the bitwise operators for this enum (macro from common/enum.h) - confirm.
+DECLARE_ENUM_FLAG_OPERATORS(FloatClassFunc)
+
 union Mode {
    BitField<0, 4, FpRoundMode> fp_round;
    BitField<4, 2, FpDenormMode> fp_denorm_single;