Tessellation (#1528)

* shader_recompiler: Tessellation WIP * fix compiler errors after merge DONT MERGE set log file to /dev/null DONT MERGE linux pthread bb fix save work DONT MERGE dump ir save more work fix mistake with ES shader skip list add input patch control points dynamic state random stuff * WIP Tessellation partial implementation. Squash commits * test: make local/tcs use attr arrays * attr arrays in TCS/TES * dont define empty attr arrays * switch to special opcodes for tess tcs/tes reads and tcs writes * impl tcs/tes read attr insts * rebase fix * save some work * save work probably broken and slow * put Vertex LogicalStage after TCS and TES to fix bindings * more refactors * refactor pattern matching and optimize modulos (disabled) * enable modulo opt * copyright * rebase fixes * remove some prints * remove some stuff * Add TCS/TES support for shader patching and use LogicalStage * refactor and handle wider DS instructions * get rid of GetAttributes for special tess constants reads. Immediately replace some upon seeing readconstbuffer. Gets rid of some extra passes over IR * stop relying on GNMX HsConstants struct. Change runtime_info.hs_info and some regs * delete some more stuff * update comments for current implementation * some cleanup * uint error * more cleanup * remove patch control points dynamic state (because runtime_info already depends on it) * fix potential problem with determining passthrough --------- Co-authored-by: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com>
2025-06-26 20:36:16 +00:00 · 2024-12-14 02:56:17 -08:00 · 2024-12-14 02:56:17 -08:00 · 3c0c921ef5
commit 3c0c921ef5
parent 3e22622508
54 changed files with 2146 additions and 189 deletions
--- a/src/shader_recompiler/frontend/tessellation.h
+++ b/src/shader_recompiler/frontend/tessellation.h
@ -0,0 +1,38 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "common/types.h"
+
+namespace Shader {
+
+struct TessellationDataConstantBuffer { // nine dword fields; TessConstantAttribute names them by index
+    u32 ls_stride;         // also the fallback value for hs_cp_stride when m_cpStride is 0
+    u32 hs_cp_stride;      // HullStateConstants::m_cpStride != 0 ? HullStateConstants::m_cpStride :
+                           // ls_stride
+    u32 num_patches;       // number of patches submitted in the threadgroup
+    u32 hs_output_base;    // HullStateConstants::m_cpStride != 0 ?
+                           // HullStateConstants::m_numInputCP * ls_stride * num_patches : 0
+                           // i.e. 0 when passthrough. NOTE(review): the condition was originally written as "m_numInputCP::m_cpStride"; presumably m_cpStride is meant - confirm
+    u32 patch_const_size;  // 16 * num_patch_attrs
+    u32 patch_const_base;  // hs_output_base + patch_output_size
+    u32 patch_output_size; // output_cp_stride * num_output_cp_per_patch
+    f32 off_chip_tessellation_factor_threshold; // the only float field; the rest are u32
+    u32 first_edge_tess_factor_index;
+};
+
+// Assign names to dword fields of TessellationDataConstantBuffer (enumerators follow the struct's field order)
+enum class TessConstantAttribute : u32 {
+    LsStride,                           // dword 0: ls_stride
+    HsCpStride,                         // dword 1: hs_cp_stride
+    HsNumPatch,                         // dword 2: num_patches
+    HsOutputBase,                       // dword 3: hs_output_base
+    PatchConstSize,                     // dword 4: patch_const_size
+    PatchConstBase,                     // dword 5: patch_const_base
+    PatchOutputSize,                    // dword 6: patch_output_size
+    OffChipTessellationFactorThreshold, // dword 7: off_chip_tessellation_factor_threshold (f32)
+    FirstEdgeTessFactorIndex,           // dword 8: first_edge_tess_factor_index
+};
+
+} // namespace Shader
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@ -1,8 +1,8 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
-
 #include "shader_recompiler/frontend/translate/translate.h"
 #include "shader_recompiler/ir/reg.h"
+#include "shader_recompiler/runtime_info.h"

 namespace Shader::Gcn {

@ -73,10 +73,11 @@ void Translator::EmitDataShare(const GcnInst& inst) {
 void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) {
    const IR::U32 value{GetSrc(inst.src[0])};

-    if (info.stage != Stage::Compute) {
-        SetDst(inst.dst[0], value);
-    } else {
+    if (info.l_stage == LogicalStage::Compute ||
+        info.l_stage == LogicalStage::TessellationControl) { // only compute and TCS emit a real ReadFirstLane
        SetDst(inst.dst[0], ir.ReadFirstLane(value));
+    } else {
+        SetDst(inst.dst[0], value); // every other logical stage copies the source value unchanged
    }
 }

--- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
@ -1,6 +1,8 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

+#include <bit>
+#include "common/assert.h"
 #include "shader_recompiler/frontend/translate/translate.h"

 namespace Shader::Gcn {
@ -78,8 +80,10 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
            return S_BFM_B32(inst);
        case Opcode::S_MUL_I32:
            return S_MUL_I32(inst);
+        case Opcode::S_BFE_I32:
+            return S_BFE(inst, true);
        case Opcode::S_BFE_U32:
-            return S_BFE_U32(inst);
+            return S_BFE(inst, false);
        case Opcode::S_ABSDIFF_I32:
            return S_ABSDIFF_I32(inst);

@ -434,12 +438,12 @@ void Translator::S_MUL_I32(const GcnInst& inst) {
    SetDst(inst.dst[0], ir.IMul(GetSrc(inst.src[0]), GetSrc(inst.src[1])));
 }

-void Translator::S_BFE_U32(const GcnInst& inst) {
+void Translator::S_BFE(const GcnInst& inst, bool is_signed) { // shared by S_BFE_I32 (is_signed = true) and S_BFE_U32 (is_signed = false)
    const IR::U32 src0{GetSrc(inst.src[0])};
    const IR::U32 src1{GetSrc(inst.src[1])};
    const IR::U32 offset{ir.BitwiseAnd(src1, ir.Imm32(0x1F))};
    const IR::U32 count{ir.BitFieldExtract(src1, ir.Imm32(16), ir.Imm32(7))};
-    const IR::U32 result{ir.BitFieldExtract(src0, offset, count)};
+    const IR::U32 result{ir.BitFieldExtract(src0, offset, count, is_signed)}; // is_signed is forwarded as-is; presumably selects sign-extension - confirm against IREmitter::BitFieldExtract
    SetDst(inst.dst[0], result);
    ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
 }
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@ -8,6 +8,8 @@
 #include "shader_recompiler/frontend/fetch_shader.h"
 #include "shader_recompiler/frontend/translate/translate.h"
 #include "shader_recompiler/info.h"
+#include "shader_recompiler/ir/attribute.h"
+#include "shader_recompiler/ir/reg.h"
 #include "shader_recompiler/runtime_info.h"
 #include "video_core/amdgpu/resource.h"
 #include "video_core/amdgpu/types.h"
@ -34,9 +36,8 @@ void Translator::EmitPrologue() {
    }

    IR::VectorReg dst_vreg = IR::VectorReg::V0;
-    switch (info.stage) {
-    case Stage::Vertex:
-    case Stage::Export:
+    switch (info.l_stage) {
+    case LogicalStage::Vertex:
        // v0: vertex ID, always present
        ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::VertexId));
        // v1: instance ID, step rate 0
@ -52,7 +53,7 @@ void Translator::EmitPrologue() {
            ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId));
        }
        break;
-    case Stage::Fragment:
+    case LogicalStage::Fragment:
        dst_vreg = IR::VectorReg::V0;
        if (runtime_info.fs_info.addr_flags.persp_sample_ena) {
            ++dst_vreg; // I
@ -122,7 +123,30 @@ void Translator::EmitPrologue() {
            }
        }
        break;
-    case Stage::Compute:
+    case LogicalStage::TessellationControl: {
+        // Should be laid out like:
+        // [0:8]: patch id within VGT
+        // [8:12]: output control point id
+        ir.SetVectorReg(IR::VectorReg::V1,
+                        ir.GetAttributeU32(IR::Attribute::PackedHullInvocationInfo));
+        // TODO PrimitiveId is probably V2 but haven't seen it yet
+        break;
+    }
+    case LogicalStage::TessellationEval:
+        ir.SetVectorReg(IR::VectorReg::V0,
+                        ir.GetAttribute(IR::Attribute::TessellationEvaluationPointU));
+        ir.SetVectorReg(IR::VectorReg::V1,
+                        ir.GetAttribute(IR::Attribute::TessellationEvaluationPointV));
+        // V2 is similar to PrimitiveID but not the same. It seems to only be used in
+        // compiler-generated address calculations. It's probably the patch id within the
+        // patches running locally on a given VGT (or CU, whichever is the granularity of LDS
+        // memory).
+        // Set to 0. See explanation in comment describing hull/domain passes
+        ir.SetVectorReg(IR::VectorReg::V2, ir.Imm32(0u));
+        // V3 is the actual PrimitiveID as intended by the shader author.
+        ir.SetVectorReg(IR::VectorReg::V3, ir.GetAttributeU32(IR::Attribute::PrimitiveId));
+        break;
+    case LogicalStage::Compute:
        ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 0));
        ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 1));
        ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 2));
@ -137,7 +161,7 @@ void Translator::EmitPrologue() {
            ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 2));
        }
        break;
-    case Stage::Geometry:
+    case LogicalStage::Geometry:
        switch (runtime_info.gs_info.out_primitive[0]) {
        case AmdGpu::GsOutputPrimitiveType::TriangleStrip:
            ir.SetVectorReg(IR::VectorReg::V3, ir.Imm32(2u)); // vertex 2
@ -152,7 +176,7 @@ void Translator::EmitPrologue() {
        ir.SetVectorReg(IR::VectorReg::V2, ir.GetAttributeU32(IR::Attribute::PrimitiveId));
        break;
    default:
-        throw NotImplementedException("Unknown shader stage");
+        UNREACHABLE_MSG("Unknown shader stage");
    }
 }

@ -503,7 +527,8 @@ void Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list, Inf

        // Special case for emitting fetch shader.
        if (inst.opcode == Opcode::S_SWAPPC_B64) {
-            ASSERT(info.stage == Stage::Vertex || info.stage == Stage::Export);
+            ASSERT(info.stage == Stage::Vertex || info.stage == Stage::Export ||
+                   info.stage == Stage::Local);
            translator.EmitFetch(inst);
            continue;
        }
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@ -94,7 +94,8 @@ public:
    void S_ASHR_I32(const GcnInst& inst);
    void S_BFM_B32(const GcnInst& inst);
    void S_MUL_I32(const GcnInst& inst);
-    void S_BFE_U32(const GcnInst& inst);
+    void S_BFE(const GcnInst& inst, bool is_signed);
+    void S_BFE_I32(const GcnInst& inst);
    void S_ABSDIFF_I32(const GcnInst& inst);
    void S_NOT_B32(const GcnInst& inst);

@ -217,7 +218,7 @@ public:

    // VOP3a
    void V_MAD_F32(const GcnInst& inst);
-    void V_MAD_I32_I24(const GcnInst& inst, bool is_signed = false);
+    void V_MAD_I32_I24(const GcnInst& inst, bool is_signed = true);
    void V_MAD_U32_U24(const GcnInst& inst);
    void V_CUBEID_F32(const GcnInst& inst);
    void V_CUBESC_F32(const GcnInst& inst);
--- a/src/shader_recompiler/frontend/translate/vector_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp
@ -1060,8 +1060,14 @@ void Translator::V_CUBEMA_F32(const GcnInst& inst) {

 void Translator::V_BFE_U32(bool is_signed, const GcnInst& inst) {
    const IR::U32 src0{GetSrc(inst.src[0])};
-    const IR::U32 src1{ir.BitwiseAnd(GetSrc(inst.src[1]), ir.Imm32(0x1F))};
-    const IR::U32 src2{ir.BitwiseAnd(GetSrc(inst.src[2]), ir.Imm32(0x1F))};
+    IR::U32 src1{GetSrc(inst.src[1])};
+    IR::U32 src2{GetSrc(inst.src[2])};
+    if (!src1.IsImmediate()) { // mask to 5 bits only when the operand is not an immediate
+        src1 = ir.BitwiseAnd(src1, ir.Imm32(0x1F));
+    }
+    if (!src2.IsImmediate()) { // NOTE(review): immediates now reach BitFieldExtract unmasked, so a value above 0x1F is no longer truncated - confirm intended
+        src2 = ir.BitwiseAnd(src2, ir.Imm32(0x1F));
+    }
    SetDst(inst.dst[0], ir.BitFieldExtract(src0, src1, src2, is_signed));
 }

--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@ -189,7 +189,8 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
    buffer_info.index_enable.Assign(mtbuf.idxen);
    buffer_info.offset_enable.Assign(mtbuf.offen);
    buffer_info.inst_offset.Assign(mtbuf.offset);
-    buffer_info.ring_access.Assign(is_ring);
+    buffer_info.globally_coherent.Assign(mtbuf.glc);
+    buffer_info.system_coherent.Assign(mtbuf.slc);
    if (is_typed) {
        const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
        const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
@ -247,11 +248,15 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
    const IR::ScalarReg sharp{inst.src[2].code * 4};
    const IR::Value soffset{GetSrc(inst.src[3])};

-    if (info.stage != Stage::Export && info.stage != Stage::Geometry) {
+    if (info.stage != Stage::Export && info.stage != Stage::Hull && info.stage != Stage::Geometry) {
        ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0,
                   "Non immediate offset not supported");
    }

+    if (info.stage == Stage::Hull) {
+        // printf("here\n"); // break
+    }
+
    IR::Value address = [&] -> IR::Value {
        if (is_ring) {
            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
@ -269,7 +274,8 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
    buffer_info.index_enable.Assign(mtbuf.idxen);
    buffer_info.offset_enable.Assign(mtbuf.offen);
    buffer_info.inst_offset.Assign(mtbuf.offset);
-    buffer_info.ring_access.Assign(is_ring);
+    buffer_info.globally_coherent.Assign(mtbuf.glc);
+    buffer_info.system_coherent.Assign(mtbuf.slc);
    if (is_typed) {
        const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
        const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);