shader_recompiler: Reduce cases where shared memory to buffer pass is needed. (#3082)

Author: squidbus, committed via GitHub on 2025-06-11 13:24:41 -07:00
Commit: c71dc740e2 (parent: 69a50fa713)
9 changed files with 232 additions and 49 deletions

@@ -870,6 +870,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/ir/passes/ring_access_elimination.cpp
     src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+    src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
     src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
     src/shader_recompiler/ir/abstract_syntax_list.cpp

@@ -303,7 +303,8 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
         ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
         ctx.AddExtension("SPV_KHR_physical_storage_buffer");
     }
-    if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) {
+    const auto shared_type_count = std::popcount(static_cast<u32>(info.shared_types));
+    if (shared_type_count > 1 && profile.supports_workgroup_explicit_memory_layout) {
         ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout");
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR);
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR);

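The popcount gate above can be read as: the aliasing extension is only worth requesting when two or more element widths view the same workgroup memory, since a single typed view has nothing to alias with. A minimal standalone sketch of that predicate (illustrative names and flag values, not recompiler API):

    #include <bit>
    #include <cstdint>

    // Bits mirror the idea of the IR::Type flags; the values are illustrative.
    enum : uint32_t { kU16 = 1u << 0, kU32 = 1u << 1, kU64 = 1u << 2 };

    // True when the WorkgroupMemoryExplicitLayout capability should be requested:
    // more than one element width in use, and the host exposes the extension.
    bool NeedsExplicitLayout(uint32_t shared_type_mask, bool extension_supported) {
        return std::popcount(shared_type_mask) > 1 && extension_supported;
    }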
@@ -979,32 +979,46 @@ void EmitContext::DefineImagesAndSamplers() {
 }
 
 void EmitContext::DefineSharedMemory() {
-    if (!info.uses_shared) {
+    const auto num_types = std::popcount(static_cast<u32>(info.shared_types));
+    if (num_types == 0) {
         return;
     }
     ASSERT(info.stage == Stage::Compute);
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    const auto make_type = [&](Id element_type, u32 element_size) {
+    const auto make_type = [&](IR::Type type, Id element_type, u32 element_size,
+                               std::string_view name) {
+        if (False(info.shared_types & type)) {
+            // Skip unused shared memory types.
+            return std::make_tuple(Id{}, Id{}, Id{});
+        }
         const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)};
         const Id array_type{TypeArray(element_type, ConstU32(num_elements))};
         Decorate(array_type, spv::Decoration::ArrayStride, element_size);
         const Id struct_type{TypeStruct(array_type)};
         MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u);
-        Decorate(struct_type, spv::Decoration::Block);
         const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type);
         const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type);
         const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup);
-        Decorate(variable, spv::Decoration::Aliased);
+        Name(variable, name);
         interfaces.push_back(variable);
+        if (num_types > 1) {
+            Decorate(struct_type, spv::Decoration::Block);
+            Decorate(variable, spv::Decoration::Aliased);
+        }
         return std::make_tuple(variable, element_pointer, pointer);
     };
-    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u);
-    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u);
-    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u);
+    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) =
+        make_type(IR::Type::U16, U16, 2u, "shared_mem_u16");
+    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) =
+        make_type(IR::Type::U32, U32[1], 4u, "shared_mem_u32");
+    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) =
+        make_type(IR::Type::U64, U64, 8u, "shared_mem_u64");
 }
 
 Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {

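For intuition, the Block/Aliased path makes the typed arrays overlay the same workgroup storage, much like members of a C++ union, while the single-type path is just one plain array. A rough analogy (not emitter output; sizes assume a hypothetical 4096-byte allocation):

    #include <cstdint>

    // All three members span the same 4096 bytes, so a store through u32[i]
    // is visible through u16[2*i] and u16[2*i+1] -- this overlap is what the
    // Aliased decoration expresses at the SPIR-V level.
    union SharedMemoryViews {
        uint16_t u16[2048];
        uint32_t u32[1024];
        uint64_t u64[512];
    };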
@@ -214,7 +214,7 @@ struct Info {
     bool uses_lane_id{};
     bool uses_group_quad{};
     bool uses_group_ballot{};
-    bool uses_shared{};
+    IR::Type shared_types{};
     bool uses_fp16{};
     bool uses_fp64{};
     bool uses_pack_10_11_11{};

@@ -28,6 +28,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
                              const Profile& profile);
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile);
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile);

@@ -35,12 +35,28 @@ void Visit(Info& info, const IR::Inst& inst) {
         break;
     }
     case IR::Opcode::LoadSharedU16:
-    case IR::Opcode::LoadSharedU32:
-    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU16:
+        info.shared_types |= IR::Type::U16;
+        break;
+    case IR::Opcode::LoadSharedU32:
     case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicISub32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicInc32:
+    case IR::Opcode::SharedAtomicDec32:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicXor32:
+        info.shared_types |= IR::Type::U32;
+        break;
+    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU64:
-        info.uses_shared = true;
+    case IR::Opcode::SharedAtomicIAdd64:
+        info.shared_types |= IR::Type::U64;
         break;
     case IR::Opcode::ConvertF16F32:
     case IR::Opcode::ConvertF32F16:

@@ -0,0 +1,127 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/profile.h"
+
+namespace Shader::Optimization {
+
+static bool Requires16BitSharedAtomic(const IR::Inst& inst) {
+    // Nothing yet
+    return false;
+}
+
+static bool Requires64BitSharedAtomic(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::SharedAtomicIAdd64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static bool IsNon32BitSharedLoadStore(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadSharedU16:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU16:
+    case IR::Opcode::WriteSharedU64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+IR::Type CalculateSpecialSharedAtomicTypes(IR::Program& program) {
+    IR::Type extra_atomic_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (Requires16BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U16;
+            }
+            if (Requires64BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U64;
+            }
+        }
+    }
+    return extra_atomic_types;
+}
+
+// Simplifies down U16 and U64 shared memory operations to U32 when aliasing is not supported and
+// atomics of the same type are not used.
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile) {
+    if (program.info.stage != Stage::Compute || profile.supports_workgroup_explicit_memory_layout) {
+        return;
+    }
+
+    const auto atomic_types = CalculateSpecialSharedAtomicTypes(program);
+    if (True(atomic_types & IR::Type::U16) && True(atomic_types & IR::Type::U64)) {
+        // If both other atomic types are used, there is nothing to do.
+        return;
+    }
+
+    // Iterate through shared load/store U16/U64 instructions, replacing with
+    // equivalent U32 ops when the types are not needed for atomics.
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsNon32BitSharedLoadStore(inst)) {
+                continue;
+            }
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            const IR::U32 offset{inst.Arg(0)};
+            if (False(atomic_types & IR::Type::U16)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU16: {
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{ir.LoadShared(32, false, dword_offset)};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 value{ir.BitFieldExtract(dword_value, bit_offset, ir.Imm32(16U))};
+                    inst.ReplaceUsesWithAndRemove(ir.UConvert(16, value));
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU16: {
+                    const IR::U32 value{ir.UConvert(32, IR::U16{inst.Arg(1)})};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{
+                        ir.LoadShared(32, false, ir.BitwiseAnd(offset, dword_offset))};
+                    const IR::U32 new_dword_value{
+                        ir.BitFieldInsert(dword_value, value, bit_offset, ir.Imm32(16U))};
+                    ir.WriteShared(32, new_dword_value, dword_offset);
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+            if (False(atomic_types & IR::Type::U64)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU64: {
+                    const IR::U32 value0{ir.LoadShared(32, false, offset)};
+                    const IR::U32 value1{ir.LoadShared(32, false, ir.IAdd(offset, ir.Imm32(4U)))};
+                    const IR::Value value{ir.PackUint2x32(ir.CompositeConstruct(value0, value1))};
+                    inst.ReplaceUsesWithAndRemove(value);
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU64: {
+                    const IR::Value value{ir.UnpackUint2x32(IR::U64{inst.Arg(1)})};
+                    const IR::U32 value0{ir.CompositeExtract(value, 0)};
+                    const IR::U32 value1{ir.CompositeExtract(value, 1)};
+                    ir.WriteShared(32, value0, offset);
+                    ir.WriteShared(32, value1, ir.IAdd(offset, ir.Imm32(4U)));
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+        }
+    }
+}
+
+} // namespace Shader::Optimization

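To make the offset arithmetic concrete, here is a host-side model of the U16 lowering the pass emits in IR (illustrative helpers, not recompiler code; shared memory is modeled as an array of 32-bit words, and 16-bit accesses are assumed 2-byte aligned):

    #include <cstdint>

    uint16_t LoadShared16(const uint32_t* shared_mem, uint32_t byte_offset) {
        const uint32_t dword_offset = byte_offset & ~3u;     // containing 32-bit word
        const uint32_t bit_offset = (byte_offset & 2u) * 8u; // 0 = low half, 16 = high half
        return static_cast<uint16_t>(shared_mem[dword_offset / 4] >> bit_offset);
    }

    void WriteShared16(uint32_t* shared_mem, uint32_t byte_offset, uint16_t value) {
        const uint32_t dword_offset = byte_offset & ~3u;
        const uint32_t bit_offset = (byte_offset & 2u) * 8u;
        uint32_t dword = shared_mem[dword_offset / 4];        // load the containing word
        dword &= ~(0xFFFFu << bit_offset);                    // clear the target half
        dword |= static_cast<uint32_t>(value) << bit_offset;  // insert the new half
        shared_mem[dword_offset / 4] = dword;                 // store it back
    }

The U64 cases follow the same pattern with no masking needed: a 64-bit access simply becomes two adjacent 32-bit accesses, matching the PackUint2x32/UnpackUint2x32 pairs above. Note that the 16-bit store is a non-atomic read-modify-write of the whole word, which is why the rewrite only applies when no atomics of that width exist: an atomic could not be lowered to such a sequence.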
@@ -34,20 +34,74 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     }
 }
 
+IR::Type CalculateSharedMemoryTypes(IR::Program& program) {
+    IR::Type used_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU16:
+            case IR::Opcode::WriteSharedU16:
+                used_types |= IR::Type::U16;
+                break;
+            case IR::Opcode::LoadSharedU32:
+            case IR::Opcode::WriteSharedU32:
+            case IR::Opcode::SharedAtomicIAdd32:
+            case IR::Opcode::SharedAtomicISub32:
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32:
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32:
+            case IR::Opcode::SharedAtomicInc32:
+            case IR::Opcode::SharedAtomicDec32:
+            case IR::Opcode::SharedAtomicAnd32:
+            case IR::Opcode::SharedAtomicOr32:
+            case IR::Opcode::SharedAtomicXor32:
+                used_types |= IR::Type::U32;
+                break;
+            case IR::Opcode::LoadSharedU64:
+            case IR::Opcode::WriteSharedU64:
+            case IR::Opcode::SharedAtomicIAdd64:
+                used_types |= IR::Type::U64;
+                break;
+            default:
+                break;
+            }
+        }
+    }
+    return used_types;
+}
+
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile) {
     if (program.info.stage != Stage::Compute) {
         return;
     }
-    // Only perform the transform if there is shared memory and either host shared memory is
-    // insufficient or the device does not support VK_KHR_workgroup_memory_explicit_layout
+    // Run this pass if:
+    // * There are shared memory instructions.
+    // * One of the following is true:
+    //   * Requested shared memory size is too large for the host shared memory.
+    //   * Workgroup explicit memory is not supported and multiple shared memory types are used.
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    if (shared_memory_size == 0 || (shared_memory_size <= profile.max_shared_memory_size &&
-                                    profile.supports_workgroup_explicit_memory_layout)) {
+    const auto used_types = CalculateSharedMemoryTypes(program);
+    if (used_types == IR::Type::Void || (shared_memory_size <= profile.max_shared_memory_size &&
+                                         (profile.supports_workgroup_explicit_memory_layout ||
+                                          std::popcount(static_cast<u32>(used_types)) == 1))) {
         return;
     }
+    // Add a buffer binding for shared memory storage buffer.
     const u32 binding = static_cast<u32>(program.info.buffers.size());
-    IR::Type used_types{};
+    program.info.buffers.push_back({
+        .used_types = used_types,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (!IsSharedAccess(inst)) {
@@ -58,29 +112,21 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
             const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
                                            ir.Imm32(shared_memory_size));
             const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
+            // Replace shared atomics first
             switch (inst.GetOpcode()) {
             case IR::Opcode::SharedAtomicIAdd32:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
-                continue;
             case IR::Opcode::SharedAtomicIAdd64:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U64;
                 continue;
             case IR::Opcode::SharedAtomicISub32:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicSMin32:
             case IR::Opcode::SharedAtomicUMin32: {
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicSMax32:
@@ -88,73 +134,49 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicInc32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicDec32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicAnd32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicOr32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicXor32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             default:
                 break;
             }
             // Replace shared operations.
             switch (inst.GetOpcode()) {
             case IR::Opcode::LoadSharedU16:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::LoadSharedU32:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::LoadSharedU64:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {}));
-                used_types |= IR::Type::U64;
                 break;
             case IR::Opcode::WriteSharedU16:
                 ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::WriteSharedU32:
                 ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
                 inst.Invalidate();
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::WriteSharedU64:
                 ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U64;
                 break;
             default:
                 break;
             }
         }
     }
-    // Add buffer binding for shared memory storage buffer.
-    program.info.buffers.push_back({
-        .used_types = used_types,
-        .inline_cbuf = AmdGpu::Buffer::Null(),
-        .buffer_type = BufferType::SharedMemory,
-        .is_written = true,
-    });
 }
 
 } // namespace Shader::Optimization

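Two pieces of this pass are worth restating outside the IR plumbing: the gating condition, and the per-workgroup addressing that lets a single storage buffer stand in for every workgroup's LDS. A hedged sketch (simplified field types, illustrative names, not recompiler API):

    #include <bit>
    #include <cstdint>

    // Mirrors the early-out above: run the pass only if shared memory is used
    // and the host either lacks the space or cannot alias multiple widths.
    bool NeedsStorageBufferFallback(uint32_t used_type_mask, uint32_t shared_memory_size,
                                    uint32_t max_shared_memory_size,
                                    bool supports_explicit_layout) {
        if (used_type_mask == 0) {
            return false; // no shared memory instructions at all
        }
        if (shared_memory_size > max_shared_memory_size) {
            return true; // requested LDS exceeds the host limit
        }
        // A single width needs no aliasing; multiple widths without the
        // explicit-layout extension must fall back to a storage buffer.
        return !supports_explicit_layout && std::popcount(used_type_mask) > 1;
    }

    // Mirrors the WorkgroupIndex * shared_memory_size + offset computation:
    // each workgroup owns a private slice of the one shared-memory buffer,
    // so shared offsets never collide between concurrently running groups.
    uint32_t BufferAddress(uint32_t workgroup_index, uint32_t shared_memory_size,
                           uint32_t shared_byte_offset) {
        return workgroup_index * shared_memory_size + shared_byte_offset;
    }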
@@ -78,6 +78,7 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);
     Shader::Optimization::LowerBufferFormatToRaw(program);
+    Shader::Optimization::SharedMemorySimplifyPass(program, profile);
     Shader::Optimization::SharedMemoryToStoragePass(program, runtime_info, profile);
     Shader::Optimization::SharedMemoryBarrierPass(program, runtime_info, profile);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
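The ordering of the two new calls is what delivers the commit's goal: SharedMemorySimplifyPass runs first, so on a device without VK_KHR_workgroup_memory_explicit_layout a shader mixing, say, 16-bit and 32-bit shared accesses (and no 16-bit atomics) collapses to a single 32-bit type before SharedMemoryToStoragePass evaluates its condition, and real workgroup memory can be kept. A worked trace of the type mask (illustrative flag values):

    #include <bit>
    #include <cstdint>

    int main() {
        constexpr uint32_t kU16 = 1u << 0, kU32 = 1u << 1; // illustrative flag values
        uint32_t shared_types = kU16 | kU32; // collected from the IR: popcount == 2
        // SharedMemorySimplifyPass rewrites the U16 loads/stores as U32 ops,
        // so the mask recollected from the IR afterwards is:
        shared_types = kU32;                 // popcount == 1
        // With one width left, SharedMemoryToStoragePass leaves the shader on
        // real workgroup memory instead of lowering it to a storage buffer.
        return std::popcount(shared_types);  // 1
    }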