shader_recompiler: Optimize general case of buffer addressing (#3159)

* shader_recompiler: Simplify dma types

Only U32 is needed for S_LOAD_DWORD
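
A minimal sketch of the tracking change (hypothetical struct layout, not the project's exact code): once the only DMA reads left are the 32-bit ones issued for S_LOAD_DWORD, the per-type bitmask collapses into a single flag.

    struct Info {
        // IR::Type dma_types; // before: bitmask of every type read via DMA
        bool uses_dma{};       // after: a single flag; DMA reads are always U32
    };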

* shader_recompiler: Perform address shift on IR level

Buffer instructions now expect the address in the data units they operate on. Doing the shift at IR level lets us optimize some operations away in the common case.
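
A plain C++ illustration of why this helps (function names here are invented for the example): a dword buffer access usually computes its byte address as index << 2, so once the backend consumes a dword index the IR contains (index << 2) >> 2, which constant folding reduces back to index.

    #include <cstdint>

    // Before: buffer ops took byte addresses and the backend shifted on
    // every access, so the shift could never be elided.
    uint32_t DwordIndexBefore(uint32_t index) {
        const uint32_t byte_addr = index << 2; // shader's address math
        return byte_addr >> 2;                 // backend's per-access shift
    }

    // After: both shifts are visible to IR-level folding, which cancels
    // (x << 2) >> 2 to x when the producer is that same left shift.
    uint32_t DwordIndexAfter(uint32_t index) {
        return index; // shifts optimized away in the common case
    }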

* shader_recompiler: Optimize common buffer access pattern

* emit_spirv: Use 32-bit integer ops for fault buffer

Not many GPUs have 8-bit bitwise OR operations, so those would likely require some overhead for the driver to emulate.
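
The new marking math in plain C++ (the raw-pointer signature is illustrative; the real code emits SPIR-V): each page maps to one bit, addressed in 32-bit words instead of bytes.

    #include <cstdint>

    // One bit per page, stored in dwords (mirrors the diff below).
    void MarkPageFault(uint32_t* fault_buffer, uint32_t page) {
        const uint32_t word = page >> 5;  // page / 32 selects the dword
        const uint32_t bit = page & 31u;  // page % 32 selects the bit
        fault_buffer[word] |= 1u << bit;  // was an 8-bit load/or/store
    }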

* resource_tracking_pass: Fix texel buffer shift
Authored by TheTurtle on 2025-06-26 12:14:36 +03:00, committed by GitHub
parent 6eaec7a004
commit a49b13fe66
12 changed files with 271 additions and 233 deletions


@@ -71,7 +71,7 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
                          Bindings& binding_)
     : Sirit::Module(profile_.supported_spirv), info{info_}, runtime_info{runtime_info_},
       profile{profile_}, stage{info.stage}, l_stage{info.l_stage}, binding{binding_} {
-    if (info.dma_types != IR::Type::Void) {
+    if (info.uses_dma) {
         SetMemoryModel(spv::AddressingModel::PhysicalStorageBuffer64, spv::MemoryModel::GLSL450);
     } else {
         SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450);
@@ -169,34 +169,8 @@ void EmitContext::DefineArithmeticTypes() {
     if (info.uses_fp64) {
         frexp_result_f64 = Name(TypeStruct(F64[1], S32[1]), "frexp_result_f64");
     }
-    if (True(info.dma_types & IR::Type::F64)) {
-        physical_pointer_types[PointerType::F64] =
-            TypePointer(spv::StorageClass::PhysicalStorageBuffer, F64[1]);
-    }
-    if (True(info.dma_types & IR::Type::U64)) {
-        physical_pointer_types[PointerType::U64] =
-            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U64);
-    }
-    if (True(info.dma_types & IR::Type::F32)) {
-        physical_pointer_types[PointerType::F32] =
-            TypePointer(spv::StorageClass::PhysicalStorageBuffer, F32[1]);
-    }
-    if (True(info.dma_types & IR::Type::U32)) {
-        physical_pointer_types[PointerType::U32] =
-            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]);
-    }
-    if (True(info.dma_types & IR::Type::F16)) {
-        physical_pointer_types[PointerType::F16] =
-            TypePointer(spv::StorageClass::PhysicalStorageBuffer, F16[1]);
-    }
-    if (True(info.dma_types & IR::Type::U16)) {
-        physical_pointer_types[PointerType::U16] =
-            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U16);
-    }
-    if (True(info.dma_types & IR::Type::U8)) {
-        physical_pointer_types[PointerType::U8] =
-            TypePointer(spv::StorageClass::PhysicalStorageBuffer, U8);
+    if (info.uses_dma) {
+        physical_pointer_type_u32 = TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]);
     }
 }
@@ -239,7 +213,7 @@ Id EmitContext::GetBufferSize(const u32 sharp_idx) {
     // Can this be done with memory access? Like we do now with ReadConst
     const auto& srt_flatbuf = buffers[flatbuf_index];
     ASSERT(srt_flatbuf.buffer_type == BufferType::Flatbuf);
-    const auto [id, pointer_type] = srt_flatbuf[PointerType::U32];
+    const auto [id, pointer_type] = srt_flatbuf.Alias(PointerType::U32);
     const auto rsrc1{
         OpLoad(U32[1], OpAccessChain(pointer_type, id, u32_zero_value, ConstU32(sharp_idx + 1)))};
@@ -255,39 +229,70 @@ Id EmitContext::GetBufferSize(const u32 sharp_idx) {
 }

 void EmitContext::DefineBufferProperties() {
+    if (!profile.needs_buffer_offsets && profile.supports_robust_buffer_access) {
+        return;
+    }
     for (u32 i = 0; i < buffers.size(); i++) {
-        BufferDefinition& buffer = buffers[i];
-        const u32 binding = buffer.binding;
+        auto& buffer = buffers[i];
+        const auto& desc = info.buffers[i];
         if (buffer.buffer_type != BufferType::Guest) {
             continue;
         }
-        const u32 half = PushData::BufOffsetIndex + (binding >> 4);
-        const u32 comp = (binding & 0xf) >> 2;
-        const u32 offset = (binding & 0x3) << 3;
-        const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]),
-                                   push_data_block, ConstU32(half), ConstU32(comp))};
-        const Id value{OpLoad(U32[1], ptr)};
-        buffer.offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U));
-        Name(buffer.offset, fmt::format("buf{}_off", binding));
-        buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U));
-        Name(buffer.offset_dwords, fmt::format("buf{}_dword_off", binding));
-        // Only need to load size if performing bounds checks and the buffer is both guest and not
-        // inline.
-        if (!profile.supports_robust_buffer_access && buffer.buffer_type == BufferType::Guest) {
-            const BufferResource& desc = info.buffers[i];
-            if (desc.sharp_idx == std::numeric_limits<u32>::max()) {
-                buffer.size = ConstU32(desc.inline_cbuf.GetSize());
-            } else {
-                buffer.size = GetBufferSize(desc.sharp_idx);
-            }
-            Name(buffer.size, fmt::format("buf{}_size", binding));
-            buffer.size_shorts = OpShiftRightLogical(U32[1], buffer.size, ConstU32(1U));
-            Name(buffer.size_shorts, fmt::format("buf{}_short_size", binding));
-            buffer.size_dwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(2U));
-            Name(buffer.size_dwords, fmt::format("buf{}_dword_size", binding));
-            buffer.size_qwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(3U));
-            Name(buffer.size_qwords, fmt::format("buf{}_qword_size", binding));
+        const u32 binding = buffer.binding;
+        // Only load and apply buffer offsets if host GPU alignment is larger than guest.
+        if (profile.needs_buffer_offsets) {
+            const u32 half = PushData::BufOffsetIndex + (binding >> 4);
+            const u32 comp = (binding & 0xf) >> 2;
+            const u32 offset = (binding & 0x3) << 3;
+            const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]),
+                                       push_data_block, ConstU32(half), ConstU32(comp))};
+            const Id value{OpLoad(U32[1], ptr)};
+            const Id buf_offset{OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U))};
+            Name(buf_offset, fmt::format("buf{}_off", binding));
+            buffer.Offset(PointerSize::B8) = buf_offset;
+            if (True(desc.used_types & IR::Type::U16)) {
+                const Id buf_word_offset{OpShiftRightLogical(U32[1], buf_offset, ConstU32(1U))};
+                Name(buf_word_offset, fmt::format("buf{}_word_off", binding));
+                buffer.Offset(PointerSize::B16) = buf_word_offset;
+            }
+            if (True(desc.used_types & IR::Type::U32)) {
+                const Id buf_dword_offset{OpShiftRightLogical(U32[1], buf_offset, ConstU32(2U))};
+                Name(buf_dword_offset, fmt::format("buf{}_dword_off", binding));
+                buffer.Offset(PointerSize::B32) = buf_dword_offset;
+            }
+            if (True(desc.used_types & IR::Type::U64)) {
+                const Id buf_qword_offset{OpShiftRightLogical(U32[1], buf_offset, ConstU32(3U))};
+                Name(buf_qword_offset, fmt::format("buf{}_qword_off", binding));
+                buffer.Offset(PointerSize::B64) = buf_qword_offset;
+            }
+        }
+        // Only load size if performing bounds checks.
+        if (!profile.supports_robust_buffer_access) {
+            const Id buf_size{desc.sharp_idx == std::numeric_limits<u32>::max()
+                                  ? ConstU32(desc.inline_cbuf.GetSize())
+                                  : GetBufferSize(desc.sharp_idx)};
+            Name(buf_size, fmt::format("buf{}_size", binding));
+            buffer.Size(PointerSize::B8) = buf_size;
+            if (True(desc.used_types & IR::Type::U16)) {
+                const Id buf_word_size{OpShiftRightLogical(U32[1], buf_size, ConstU32(1U))};
+                Name(buf_word_size, fmt::format("buf{}_short_size", binding));
+                buffer.Size(PointerSize::B16) = buf_word_size;
+            }
+            if (True(desc.used_types & IR::Type::U32)) {
+                const Id buf_dword_size{OpShiftRightLogical(U32[1], buf_size, ConstU32(2U))};
+                Name(buf_dword_size, fmt::format("buf{}_dword_size", binding));
+                buffer.Size(PointerSize::B32) = buf_dword_size;
+            }
+            if (True(desc.used_types & IR::Type::U64)) {
+                const Id buf_qword_size{OpShiftRightLogical(U32[1], buf_size, ConstU32(3U))};
+                Name(buf_qword_size, fmt::format("buf{}_qword_size", binding));
+                buffer.Size(PointerSize::B64) = buf_qword_size;
+            }
         }
     }
 }
@@ -779,8 +784,7 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
 };

 void EmitContext::DefineBuffers() {
-    if (!profile.supports_robust_buffer_access &&
-        info.readconst_types == Info::ReadConstType::None) {
+    if (!profile.supports_robust_buffer_access && !info.uses_dma) {
         // In case Flatbuf has not already been bound by IR and is needed
         // to query buffer sizes, bind it now.
         info.buffers.push_back({
@@ -809,23 +813,23 @@ void EmitContext::DefineBuffers() {
         // Define aliases depending on the shader usage.
         auto& spv_buffer = buffers.emplace_back(binding.buffer++, desc.buffer_type);
         if (True(desc.used_types & IR::Type::U64)) {
-            spv_buffer[PointerType::U64] =
+            spv_buffer.Alias(PointerType::U64) =
                 DefineBuffer(is_storage, desc.is_written, 3, desc.buffer_type, U64);
         }
         if (True(desc.used_types & IR::Type::U32)) {
-            spv_buffer[PointerType::U32] =
+            spv_buffer.Alias(PointerType::U32) =
                 DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, U32[1]);
         }
         if (True(desc.used_types & IR::Type::F32)) {
-            spv_buffer[PointerType::F32] =
+            spv_buffer.Alias(PointerType::F32) =
                 DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, F32[1]);
         }
         if (True(desc.used_types & IR::Type::U16)) {
-            spv_buffer[PointerType::U16] =
+            spv_buffer.Alias(PointerType::U16) =
                 DefineBuffer(is_storage, desc.is_written, 1, desc.buffer_type, U16);
         }
         if (True(desc.used_types & IR::Type::U8)) {
-            spv_buffer[PointerType::U8] =
+            spv_buffer.Alias(PointerType::U8) =
                 DefineBuffer(is_storage, desc.is_written, 0, desc.buffer_type, U8);
         }
         ++binding.unified;
@@ -1154,7 +1158,7 @@ Id EmitContext::DefineGetBdaPointer() {
     const auto page{OpShiftRightLogical(U64, address, caching_pagebits)};
     const auto page32{OpUConvert(U32[1], page)};
     const auto& bda_buffer{buffers[bda_pagetable_index]};
-    const auto [bda_buffer_id, bda_pointer_type] = bda_buffer[PointerType::U64];
+    const auto [bda_buffer_id, bda_pointer_type] = bda_buffer.Alias(PointerType::U64);
     const auto bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)};
     const auto bda{OpLoad(U64, bda_ptr)};
@@ -1166,14 +1170,14 @@ Id EmitContext::DefineGetBdaPointer() {
     // First time acces, mark as fault
     AddLabel(fault_label);
     const auto& fault_buffer{buffers[fault_buffer_index]};
-    const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
-    const auto page_div8{OpShiftRightLogical(U32[1], page32, ConstU32(3U))};
-    const auto page_mod8{OpBitwiseAnd(U32[1], page32, ConstU32(7U))};
-    const auto page_mask{OpShiftLeftLogical(U8, u8_one_value, page_mod8)};
+    const auto [fault_buffer_id, fault_pointer_type] = fault_buffer.Alias(PointerType::U32);
+    const auto page_div32{OpShiftRightLogical(U32[1], page32, ConstU32(5U))};
+    const auto page_mod32{OpBitwiseAnd(U32[1], page32, ConstU32(31U))};
+    const auto page_mask{OpShiftLeftLogical(U32[1], u32_one_value, page_mod32)};
     const auto fault_ptr{
-        OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div8)};
-    const auto fault_value{OpLoad(U8, fault_ptr)};
-    const auto fault_value_masked{OpBitwiseOr(U8, fault_value, page_mask)};
+        OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div32)};
+    const auto fault_value{OpLoad(U32[1], fault_ptr)};
+    const auto fault_value_masked{OpBitwiseOr(U32[1], fault_value, page_mask)};
     OpStore(fault_ptr, fault_value_masked);

     // Return null pointer
@@ -1211,14 +1215,15 @@ Id EmitContext::DefineReadConst(bool dynamic) {
     const auto offset_bytes{OpShiftLeftLogical(U32[1], offset, ConstU32(2U))};
     const auto addr{OpIAdd(U64, base_addr, OpUConvert(U64, offset_bytes))};

-    const auto result = EmitMemoryRead(U32[1], addr, [&]() {
+    const auto result = EmitDwordMemoryRead(addr, [&]() {
         if (dynamic) {
             return u32_zero_value;
         } else {
             const auto& flatbuf_buffer{buffers[flatbuf_index]};
             ASSERT(flatbuf_buffer.binding >= 0 &&
                    flatbuf_buffer.buffer_type == BufferType::Flatbuf);
-            const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
+            const auto [flatbuf_buffer_id, flatbuf_pointer_type] =
+                flatbuf_buffer.Alias(PointerType::U32);
             const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value,
                                          flatbuf_offset)};
             return OpLoad(U32[1], ptr);
@@ -1239,7 +1244,7 @@ void EmitContext::DefineFunctions() {
         uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32");
         uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32");
     }
-    if (info.dma_types != IR::Type::Void) {
+    if (info.uses_dma) {
         get_bda_pointer = DefineGetBdaPointer();
     }