shader_recompiler: Optimize general case of buffer addressing (#3159)

* shader_recompiler: Simplify dma types

Only U32 is needed for S_LOAD_DWORD

* shader_recompiler: Perform address shift on IR level

Buffer instructions now expect the address in the data units they operate on. Doing the shift at the IR level lets us optimize some operations away in the common case.
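
The win is plain integer arithmetic. A minimal standalone C++ sketch of the identity the optimizer exploits (illustrative only, not the recompiler's IR):

    #include <cassert>
    #include <cstdint>

    // Dword (4-byte) access: byte address with the now-explicit ">> 2" shift.
    uint32_t shifted(uint32_t index) {
        return (index * 16u + 32u) >> 2;
    }

    // The shift folds away because stride 16 and offset 32 are multiples of 4.
    uint32_t folded(uint32_t index) {
        return index * 4u + 8u;
    }

    int main() {
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            assert(shifted(i) == folded(i));
        }
        return 0;
    }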

* shader_recompiler: Optimize common buffer access pattern
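
In rough terms (a hedged model with hypothetical helper names; the committed check lives in CalculateBufferAddress below): when the address has the form "index * const_stride + inst_offset" and both terms are aligned to the element size, the shift is applied to each term at compile time instead of to the whole address at runtime.

    #include <cstdint>

    // Hypothetical model: both paths yield the same element index for
    // in-bounds addresses; the fast path needs no runtime shift because the
    // alignment check guarantees no low bits are discarded.
    uint32_t generic_path(uint32_t index, uint32_t stride, uint32_t offset, uint32_t shift) {
        return (index * stride + offset) >> shift; // shift the full byte address
    }

    uint32_t fast_path(uint32_t index, uint32_t stride, uint32_t offset, uint32_t shift) {
        const uint32_t mask = (1u << shift) - 1u;
        if ((stride & mask) != 0 || (offset & mask) != 0) {
            return generic_path(index, stride, offset, shift); // alignment not met
        }
        return index * (stride >> shift) + (offset >> shift); // folded per term
    }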

* emit_spirv: Use 32-bit integer ops for fault buffer

Not many GPUs have 8-bit bitwise OR operations, so emulating them would likely add driver overhead.
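
For illustration, a CPU-side sketch of the word-based alternative, assuming one fault bit per page (hypothetical model, not the emitted SPIR-V):

    #include <cstdint>
    #include <vector>

    // Mark a faulted page using only 32-bit operations: locate the 32-bit
    // word holding the page's bit, then OR the bit in. A byte-granular
    // version would need the 8-bit bitwise ops that many GPUs lack.
    void MarkFault(std::vector<uint32_t>& fault_buffer, uint32_t page) {
        const uint32_t word_index = page >> 5; // 32 fault bits per word
        const uint32_t bit_index = page & 31u; // bit position within the word
        fault_buffer[word_index] |= 1u << bit_index;
    }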

* resource_tracking_pass: Fix texel buffer shift
TheTurtle 2025-06-26 12:14:36 +03:00 committed by GitHub
parent 6eaec7a004
commit a49b13fe66
12 changed files with 271 additions and 233 deletions


@@ -105,6 +105,49 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
     }
 }
 
+u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadBufferU8:
+    case IR::Opcode::StoreBufferU8:
+        return 0;
+    case IR::Opcode::LoadBufferU16:
+    case IR::Opcode::StoreBufferU16:
+        return 1;
+    case IR::Opcode::LoadBufferU64:
+    case IR::Opcode::StoreBufferU64:
+    case IR::Opcode::BufferAtomicIAdd64:
+        return 3;
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::StoreBufferFormatF32: {
+        switch (data_format) {
+        case AmdGpu::DataFormat::Format8:
+            return 0;
+        case AmdGpu::DataFormat::Format8_8:
+        case AmdGpu::DataFormat::Format16:
+            return 1;
+        case AmdGpu::DataFormat::Format8_8_8_8:
+        case AmdGpu::DataFormat::Format16_16:
+        case AmdGpu::DataFormat::Format10_11_11:
+        case AmdGpu::DataFormat::Format2_10_10_10:
+        case AmdGpu::DataFormat::Format16_16_16_16:
+        case AmdGpu::DataFormat::Format32:
+        case AmdGpu::DataFormat::Format32_32:
+        case AmdGpu::DataFormat::Format32_32_32:
+        case AmdGpu::DataFormat::Format32_32_32_32:
+            return 2;
+        default:
+            return 0;
+        }
+        break;
+    }
+    case IR::Opcode::ReadConstBuffer:
+        // Provided address is already in dwords
+        return 0;
+    default:
+        return 2;
+    }
+}
+
 bool IsImageAtomicInstruction(const IR::Inst& inst) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::ImageAtomicIAdd32:
@@ -496,6 +539,22 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
 
 IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
                                const AmdGpu::Buffer& buffer, u32 stride) {
     const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+    const u32 inst_offset = inst_info.inst_offset.Value();
+    const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
+    const auto data_format = is_inst_typed
+                                 ? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value())
+                                 : buffer.GetDataFmt();
+    const u32 shift = BufferAddressShift(inst, data_format);
+    const u32 mask = (1 << shift) - 1;
+    // If address calculation is of the form "index * const_stride + offset" with a constant
+    // offset and both const_stride and offset divisible by the element size, apply the shift directly.
+    if (inst_info.index_enable && !inst_info.offset_enable && !buffer.swizzle_enable &&
+        !buffer.add_tid_enable && (stride & mask) == 0 && (inst_offset & mask) == 0) {
+        // buffer_offset = index * (const_stride >> shift) + (inst_offset >> shift)
+        const IR::U32 index = IR::U32{inst.Arg(1)};
+        return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)), ir.Imm32(inst_offset >> shift));
+    }
+
     // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
     IR::U32 index = ir.Imm32(0U);
@@ -512,7 +571,7 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
         index = ir.IAdd(index, thread_id);
     }
     // offset = (inst_offen ? vgpr_offset : 0) + inst_offset
-    IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
+    IR::U32 offset = ir.Imm32(inst_offset);
     if (inst_info.offset_enable) {
         const IR::U32 vgpr_offset = inst_info.index_enable
                                         ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
@@ -545,6 +604,9 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
         // buffer_offset = index * const_stride + offset
         buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
     }
 
+    if (shift != 0) {
+        buffer_offset = ir.ShiftRightLogical(buffer_offset, ir.Imm32(shift));
+    }
     return buffer_offset;
 }


@@ -102,7 +102,7 @@ void Visit(Info& info, const IR::Inst& inst) {
         info.uses_lane_id = true;
         break;
     case IR::Opcode::ReadConst:
-        if (info.readconst_types == Info::ReadConstType::None) {
+        if (!info.uses_dma) {
             info.buffers.push_back({
                 .used_types = IR::Type::U32,
                 // We can't guarantee that flatbuf will not grow past UBO
@@ -116,7 +116,7 @@ void Visit(Info& info, const IR::Inst& inst) {
         } else {
             info.readconst_types |= Info::ReadConstType::Dynamic;
         }
-        info.dma_types |= IR::Type::U32;
+        info.uses_dma = true;
         break;
     case IR::Opcode::PackUfloat10_11_11:
         info.uses_pack_10_11_11 = true;
@@ -130,21 +130,22 @@
 }
 
 void CollectShaderInfoPass(IR::Program& program) {
+    auto& info = program.info;
     for (IR::Block* const block : program.post_order_blocks) {
         for (IR::Inst& inst : block->Instructions()) {
-            Visit(program.info, inst);
+            Visit(info, inst);
         }
     }
 
-    if (program.info.dma_types != IR::Type::Void) {
-        program.info.buffers.push_back({
+    if (info.uses_dma) {
+        info.buffers.push_back({
             .used_types = IR::Type::U64,
             .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
             .buffer_type = BufferType::BdaPagetable,
             .is_written = true,
         });
-        program.info.buffers.push_back({
-            .used_types = IR::Type::U8,
+        info.buffers.push_back({
+            .used_types = IR::Type::U32,
             .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE),
             .buffer_type = BufferType::FaultBuffer,
             .is_written = true,