shader_recompiler: Optimize general case of buffer addressing (#3159)

* shader_recompiler: Simplify dma types

Only U32 is needed for S_LOAD_DWORD

* shader_recompiler: Perform address shift on IR level

Buffer instructions now expect the address in the data units they operate on. Doing the shift at the IR level lets us optimize some operations away in the common case.
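
The win is plain integer arithmetic. A minimal standalone C++ sketch of the identity the optimizer exploits (illustrative only, not the recompiler's IR):

    #include <cassert>
    #include <cstdint>

    // Dword (4-byte) access: byte address with the now-explicit ">> 2" shift.
    uint32_t shifted(uint32_t index) {
        return (index * 16u + 32u) >> 2;
    }

    // The shift folds away because stride 16 and offset 32 are multiples of 4.
    uint32_t folded(uint32_t index) {
        return index * 4u + 8u;
    }

    int main() {
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            assert(shifted(i) == folded(i));
        }
        return 0;
    }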

* shader_recompiler: Optimize common buffer access pattern
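
In rough terms (a hedged model with hypothetical helper names; the committed check lives in CalculateBufferAddress below): when the address has the form "index * const_stride + inst_offset" and both terms are aligned to the element size, the shift is applied to each term at compile time instead of to the whole address at runtime.

    #include <cstdint>

    // Hypothetical model: both paths yield the same element index for
    // in-bounds addresses; the fast path needs no runtime shift because the
    // alignment check guarantees no low bits are discarded.
    uint32_t generic_path(uint32_t index, uint32_t stride, uint32_t offset, uint32_t shift) {
        return (index * stride + offset) >> shift; // shift the full byte address
    }

    uint32_t fast_path(uint32_t index, uint32_t stride, uint32_t offset, uint32_t shift) {
        const uint32_t mask = (1u << shift) - 1u;
        if ((stride & mask) != 0 || (offset & mask) != 0) {
            return generic_path(index, stride, offset, shift); // alignment not met
        }
        return index * (stride >> shift) + (offset >> shift); // folded per term
    }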

* emit_spirv: Use 32-bit integer ops for fault buffer

Not many GPUs have 8-bit bitwise OR operations, so emulating them would likely add driver overhead.
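
For illustration, a CPU-side sketch of the word-based alternative, assuming one fault bit per page (hypothetical model, not the emitted SPIR-V):

    #include <cstdint>
    #include <vector>

    // Mark a faulted page using only 32-bit operations: locate the 32-bit
    // word holding the page's bit, then OR the bit in. A byte-granular
    // version would need the 8-bit bitwise ops that many GPUs lack.
    void MarkFault(std::vector<uint32_t>& fault_buffer, uint32_t page) {
        const uint32_t word_index = page >> 5; // 32 fault bits per word
        const uint32_t bit_index = page & 31u; // bit position within the word
        fault_buffer[word_index] |= 1u << bit_index;
    }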

* resource_tracking_pass: Fix texel buffer shift
TheTurtle 2025-06-26 12:14:36 +03:00 committed by GitHub
parent 6eaec7a004
commit a49b13fe66
12 changed files with 271 additions and 233 deletions


@@ -105,6 +105,49 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
     }
 }
 
+u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadBufferU8:
+    case IR::Opcode::StoreBufferU8:
+        return 0;
+    case IR::Opcode::LoadBufferU16:
+    case IR::Opcode::StoreBufferU16:
+        return 1;
+    case IR::Opcode::LoadBufferU64:
+    case IR::Opcode::StoreBufferU64:
+    case IR::Opcode::BufferAtomicIAdd64:
+        return 3;
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::StoreBufferFormatF32: {
+        switch (data_format) {
+        case AmdGpu::DataFormat::Format8:
+            return 0;
+        case AmdGpu::DataFormat::Format8_8:
+        case AmdGpu::DataFormat::Format16:
+            return 1;
+        case AmdGpu::DataFormat::Format8_8_8_8:
+        case AmdGpu::DataFormat::Format16_16:
+        case AmdGpu::DataFormat::Format10_11_11:
+        case AmdGpu::DataFormat::Format2_10_10_10:
+        case AmdGpu::DataFormat::Format16_16_16_16:
+        case AmdGpu::DataFormat::Format32:
+        case AmdGpu::DataFormat::Format32_32:
+        case AmdGpu::DataFormat::Format32_32_32:
+        case AmdGpu::DataFormat::Format32_32_32_32:
+            return 2;
+        default:
+            return 0;
+        }
+        break;
+    }
+    case IR::Opcode::ReadConstBuffer:
+        // Provided address is already in dwords
+        return 0;
+    default:
+        return 2;
+    }
+}
+
 bool IsImageAtomicInstruction(const IR::Inst& inst) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::ImageAtomicIAdd32:
@@ -496,6 +539,22 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
 
 IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
                                const AmdGpu::Buffer& buffer, u32 stride) {
     const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+    const u32 inst_offset = inst_info.inst_offset.Value();
+    const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
+    const auto data_format = is_inst_typed
+                                 ? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value())
+                                 : buffer.GetDataFmt();
+    const u32 shift = BufferAddressShift(inst, data_format);
+    const u32 mask = (1 << shift) - 1;
+    // If address calculation is of the form "index * const_stride + offset" with a constant
+    // offset and both const_stride and offset divisible by the element size, apply the shift directly.
+    if (inst_info.index_enable && !inst_info.offset_enable && !buffer.swizzle_enable &&
+        !buffer.add_tid_enable && (stride & mask) == 0 && (inst_offset & mask) == 0) {
+        // buffer_offset = index * (const_stride >> shift) + (inst_offset >> shift)
+        const IR::U32 index = IR::U32{inst.Arg(1)};
+        return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)), ir.Imm32(inst_offset >> shift));
+    }
+
     // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
     IR::U32 index = ir.Imm32(0U);
@@ -512,7 +571,7 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
         index = ir.IAdd(index, thread_id);
     }
     // offset = (inst_offen ? vgpr_offset : 0) + inst_offset
-    IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
+    IR::U32 offset = ir.Imm32(inst_offset);
     if (inst_info.offset_enable) {
         const IR::U32 vgpr_offset = inst_info.index_enable
                                         ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
@@ -545,6 +604,9 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
         // buffer_offset = index * const_stride + offset
         buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
     }
 
+    if (shift != 0) {
+        buffer_offset = ir.ShiftRightLogical(buffer_offset, ir.Imm32(shift));
+    }
     return buffer_offset;
 }


@@ -102,7 +102,7 @@ void Visit(Info& info, const IR::Inst& inst) {
         info.uses_lane_id = true;
         break;
     case IR::Opcode::ReadConst:
-        if (info.readconst_types == Info::ReadConstType::None) {
+        if (!info.uses_dma) {
             info.buffers.push_back({
                 .used_types = IR::Type::U32,
                 // We can't guarantee that flatbuf will not grow past UBO
@@ -116,7 +116,7 @@ void Visit(Info& info, const IR::Inst& inst) {
         } else {
             info.readconst_types |= Info::ReadConstType::Dynamic;
         }
-        info.dma_types |= IR::Type::U32;
+        info.uses_dma = true;
         break;
     case IR::Opcode::PackUfloat10_11_11:
         info.uses_pack_10_11_11 = true;
@@ -130,21 +130,22 @@
 }
 
 void CollectShaderInfoPass(IR::Program& program) {
+    auto& info = program.info;
     for (IR::Block* const block : program.post_order_blocks) {
         for (IR::Inst& inst : block->Instructions()) {
-            Visit(program.info, inst);
+            Visit(info, inst);
         }
     }
 
-    if (program.info.dma_types != IR::Type::Void) {
-        program.info.buffers.push_back({
+    if (info.uses_dma) {
+        info.buffers.push_back({
             .used_types = IR::Type::U64,
             .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
             .buffer_type = BufferType::BdaPagetable,
             .is_written = true,
         });
-        program.info.buffers.push_back({
-            .used_types = IR::Type::U8,
+        info.buffers.push_back({
+            .used_types = IR::Type::U32,
             .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE),
             .buffer_type = BufferType::FaultBuffer,
             .is_written = true,