mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2025-07-12 12:45:56 +00:00
shader_recompiler: Optimize general case of buffer addressing (#3159)
Some checks are pending
Build and Release / reuse (push) Waiting to run
Build and Release / clang-format (push) Waiting to run
Build and Release / get-info (push) Waiting to run
Build and Release / windows-sdl (push) Blocked by required conditions
Build and Release / windows-qt (push) Blocked by required conditions
Build and Release / macos-sdl (push) Blocked by required conditions
Build and Release / macos-qt (push) Blocked by required conditions
Build and Release / linux-sdl (push) Blocked by required conditions
Build and Release / linux-qt (push) Blocked by required conditions
Build and Release / linux-sdl-gcc (push) Blocked by required conditions
Build and Release / linux-qt-gcc (push) Blocked by required conditions
Build and Release / pre-release (push) Blocked by required conditions
Some checks are pending
Build and Release / reuse (push) Waiting to run
Build and Release / clang-format (push) Waiting to run
Build and Release / get-info (push) Waiting to run
Build and Release / windows-sdl (push) Blocked by required conditions
Build and Release / windows-qt (push) Blocked by required conditions
Build and Release / macos-sdl (push) Blocked by required conditions
Build and Release / macos-qt (push) Blocked by required conditions
Build and Release / linux-sdl (push) Blocked by required conditions
Build and Release / linux-qt (push) Blocked by required conditions
Build and Release / linux-sdl-gcc (push) Blocked by required conditions
Build and Release / linux-qt-gcc (push) Blocked by required conditions
Build and Release / pre-release (push) Blocked by required conditions
* shader_recompiler: Simplify dma types

  Only U32 is needed for S_LOAD_DWORD

* shader_recompiler: Perform address shift on IR level

  Buffer instructions now expect the address in the data unit they work on. Doing the shift on the IR level will allow us to optimize some operations away in the common case.

* shader_recompiler: Optimize common buffer access pattern

* emit_spirv: Use 32-bit integer ops for fault buffer

  Not many GPUs have 8-bit bitwise OR operations, so that would probably require some overhead to emulate from the driver.

* resource_tracking_pass: Fix texel buffer shift
This commit is contained in:
parent
6eaec7a004
commit
a49b13fe66
12 changed files with 271 additions and 233 deletions
|
@ -105,6 +105,49 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
|
|||
}
|
||||
}
|
||||
|
||||
u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) {
|
||||
switch (inst.GetOpcode()) {
|
||||
case IR::Opcode::LoadBufferU8:
|
||||
case IR::Opcode::StoreBufferU8:
|
||||
return 0;
|
||||
case IR::Opcode::LoadBufferU16:
|
||||
case IR::Opcode::StoreBufferU16:
|
||||
return 1;
|
||||
case IR::Opcode::LoadBufferU64:
|
||||
case IR::Opcode::StoreBufferU64:
|
||||
case IR::Opcode::BufferAtomicIAdd64:
|
||||
return 3;
|
||||
case IR::Opcode::LoadBufferFormatF32:
|
||||
case IR::Opcode::StoreBufferFormatF32: {
|
||||
switch (data_format) {
|
||||
case AmdGpu::DataFormat::Format8:
|
||||
return 0;
|
||||
case AmdGpu::DataFormat::Format8_8:
|
||||
case AmdGpu::DataFormat::Format16:
|
||||
return 1;
|
||||
case AmdGpu::DataFormat::Format8_8_8_8:
|
||||
case AmdGpu::DataFormat::Format16_16:
|
||||
case AmdGpu::DataFormat::Format10_11_11:
|
||||
case AmdGpu::DataFormat::Format2_10_10_10:
|
||||
case AmdGpu::DataFormat::Format16_16_16_16:
|
||||
case AmdGpu::DataFormat::Format32:
|
||||
case AmdGpu::DataFormat::Format32_32:
|
||||
case AmdGpu::DataFormat::Format32_32_32:
|
||||
case AmdGpu::DataFormat::Format32_32_32_32:
|
||||
return 2;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case IR::Opcode::ReadConstBuffer:
|
||||
// Provided address is already in dwords
|
||||
return 0;
|
||||
default:
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
bool IsImageAtomicInstruction(const IR::Inst& inst) {
|
||||
switch (inst.GetOpcode()) {
|
||||
case IR::Opcode::ImageAtomicIAdd32:
|
||||
|
@ -496,6 +539,22 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
|
|||
IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
|
||||
const AmdGpu::Buffer& buffer, u32 stride) {
|
||||
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
|
||||
const u32 inst_offset = inst_info.inst_offset.Value();
|
||||
const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
|
||||
const auto data_format = is_inst_typed
|
||||
? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value())
|
||||
: buffer.GetDataFmt();
|
||||
const u32 shift = BufferAddressShift(inst, data_format);
|
||||
const u32 mask = (1 << shift) - 1;
|
||||
|
||||
// If address calculation is of the form "index * const_stride + offset" with offset constant
|
||||
// and both const_stride and offset are divisible with the element size, apply shift directly.
|
||||
if (inst_info.index_enable && !inst_info.offset_enable && !buffer.swizzle_enable &&
|
||||
!buffer.add_tid_enable && (stride & mask) == 0 && (inst_offset & mask) == 0) {
|
||||
// buffer_offset = index * (const_stride >> shift) + (inst_offset >> shift)
|
||||
const IR::U32 index = IR::U32{inst.Arg(1)};
|
||||
return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)), ir.Imm32(inst_offset >> shift));
|
||||
}
|
||||
|
||||
// index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
|
||||
IR::U32 index = ir.Imm32(0U);
|
||||
|
@ -512,7 +571,7 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
|
|||
index = ir.IAdd(index, thread_id);
|
||||
}
|
||||
// offset = (inst_offen ? vgpr_offset : 0) + inst_offset
|
||||
IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
|
||||
IR::U32 offset = ir.Imm32(inst_offset);
|
||||
if (inst_info.offset_enable) {
|
||||
const IR::U32 vgpr_offset = inst_info.index_enable
|
||||
? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
|
||||
|
@ -545,6 +604,9 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
|
|||
// buffer_offset = index * const_stride + offset
|
||||
buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
|
||||
}
|
||||
if (shift != 0) {
|
||||
buffer_offset = ir.ShiftRightLogical(buffer_offset, ir.Imm32(shift));
|
||||
}
|
||||
return buffer_offset;
|
||||
}
|
||||
|
||||
|
|
|
@ -102,7 +102,7 @@ void Visit(Info& info, const IR::Inst& inst) {
|
|||
info.uses_lane_id = true;
|
||||
break;
|
||||
case IR::Opcode::ReadConst:
|
||||
if (info.readconst_types == Info::ReadConstType::None) {
|
||||
if (!info.uses_dma) {
|
||||
info.buffers.push_back({
|
||||
.used_types = IR::Type::U32,
|
||||
// We can't guarantee that flatbuf will not grow past UBO
|
||||
|
@ -116,7 +116,7 @@ void Visit(Info& info, const IR::Inst& inst) {
|
|||
} else {
|
||||
info.readconst_types |= Info::ReadConstType::Dynamic;
|
||||
}
|
||||
info.dma_types |= IR::Type::U32;
|
||||
info.uses_dma = true;
|
||||
break;
|
||||
case IR::Opcode::PackUfloat10_11_11:
|
||||
info.uses_pack_10_11_11 = true;
|
||||
|
@ -130,21 +130,22 @@ void Visit(Info& info, const IR::Inst& inst) {
|
|||
}
|
||||
|
||||
void CollectShaderInfoPass(IR::Program& program) {
|
||||
auto& info = program.info;
|
||||
for (IR::Block* const block : program.post_order_blocks) {
|
||||
for (IR::Inst& inst : block->Instructions()) {
|
||||
Visit(program.info, inst);
|
||||
Visit(info, inst);
|
||||
}
|
||||
}
|
||||
|
||||
if (program.info.dma_types != IR::Type::Void) {
|
||||
program.info.buffers.push_back({
|
||||
if (info.uses_dma) {
|
||||
info.buffers.push_back({
|
||||
.used_types = IR::Type::U64,
|
||||
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
|
||||
.buffer_type = BufferType::BdaPagetable,
|
||||
.is_written = true,
|
||||
});
|
||||
program.info.buffers.push_back({
|
||||
.used_types = IR::Type::U8,
|
||||
info.buffers.push_back({
|
||||
.used_types = IR::Type::U32,
|
||||
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE),
|
||||
.buffer_type = BufferType::FaultBuffer,
|
||||
.is_written = true,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue