diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c2739d22..0d89524cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -870,6 +870,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
                       src/shader_recompiler/ir/passes/ring_access_elimination.cpp
                       src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
                       src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+                      src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
                       src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
                       src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
                       src/shader_recompiler/ir/abstract_syntax_list.cpp
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 37d7eea35..93fb81df4 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -303,7 +303,8 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
         ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
         ctx.AddExtension("SPV_KHR_physical_storage_buffer");
     }
-    if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) {
+    const auto shared_type_count = std::popcount(static_cast<u32>(info.shared_types));
+    if (shared_type_count > 1 && profile.supports_workgroup_explicit_memory_layout) {
         ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout");
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR);
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index c47a75739..0a8f78f72 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -979,32 +979,46 @@ void EmitContext::DefineImagesAndSamplers() {
 }
 
 void EmitContext::DefineSharedMemory() {
-    if (!info.uses_shared) {
+    const auto num_types = std::popcount(static_cast<u32>(info.shared_types));
+    if (num_types == 0) {
         return;
     }
     ASSERT(info.stage == Stage::Compute);
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    const auto make_type = [&](Id element_type, u32 element_size) {
+    const auto make_type = [&](IR::Type type, Id element_type, u32 element_size,
+                               std::string_view name) {
+        if (False(info.shared_types & type)) {
+            // Skip unused shared memory types.
+            return std::make_tuple(Id{}, Id{}, Id{});
+        }
+
         const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)};
         const Id array_type{TypeArray(element_type, ConstU32(num_elements))};
         Decorate(array_type, spv::Decoration::ArrayStride, element_size);
         const Id struct_type{TypeStruct(array_type)};
         MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u);
-        Decorate(struct_type, spv::Decoration::Block);
         const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type);
         const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type);
         const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup);
-        Decorate(variable, spv::Decoration::Aliased);
+        Name(variable, name);
         interfaces.push_back(variable);
+        if (num_types > 1) {
+            Decorate(struct_type, spv::Decoration::Block);
+            Decorate(variable, spv::Decoration::Aliased);
+        }
+
         return std::make_tuple(variable, element_pointer, pointer);
     };
-    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u);
-    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u);
-    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u);
+    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) =
+        make_type(IR::Type::U16, U16, 2u, "shared_mem_u16");
+    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) =
+        make_type(IR::Type::U32, U32[1], 4u, "shared_mem_u32");
+    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) =
+        make_type(IR::Type::U64, U64, 8u, "shared_mem_u64");
 }
 
 Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index e14c7988d..f25111350 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -214,7 +214,7 @@ struct Info {
     bool uses_lane_id{};
     bool uses_group_quad{};
     bool uses_group_ballot{};
-    bool uses_shared{};
+    IR::Type shared_types{};
     bool uses_fp16{};
     bool uses_fp64{};
     bool uses_pack_10_11_11{};
diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h
index 06e4ac850..57d36f6df 100644
--- a/src/shader_recompiler/ir/passes/ir_passes.h
+++ b/src/shader_recompiler/ir/passes/ir_passes.h
@@ -28,6 +28,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
                              const Profile& profile);
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile);
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile);
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index ba8d1cca6..4cd16d18f 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -35,12 +35,28 @@ void Visit(Info& info, const IR::Inst& inst) {
         break;
     }
     case IR::Opcode::LoadSharedU16:
-    case IR::Opcode::LoadSharedU32:
-    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU16:
+        info.shared_types |= IR::Type::U16;
+        break;
+    case IR::Opcode::LoadSharedU32:
     case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicISub32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicInc32:
+    case IR::Opcode::SharedAtomicDec32:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicXor32:
+        info.shared_types |= IR::Type::U32;
+        break;
+    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU64:
-        info.uses_shared = true;
+    case IR::Opcode::SharedAtomicIAdd64:
+        info.shared_types |= IR::Type::U64;
         break;
     case IR::Opcode::ConvertF16F32:
     case IR::Opcode::ConvertF32F16:
diff --git a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
new file mode 100644
index 000000000..0f80a3b28
--- /dev/null
+++ b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
@@ -0,0 +1,127 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/profile.h"
+
+namespace Shader::Optimization {
+
+static bool Requires16BitSharedAtomic(const IR::Inst& inst) {
+    // Nothing yet
+    return false;
+}
+
+static bool Requires64BitSharedAtomic(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::SharedAtomicIAdd64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static bool IsNon32BitSharedLoadStore(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadSharedU16:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU16:
+    case IR::Opcode::WriteSharedU64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+IR::Type CalculateSpecialSharedAtomicTypes(IR::Program& program) {
+    IR::Type extra_atomic_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (Requires16BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U16;
+            }
+            if (Requires64BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U64;
+            }
+        }
+    }
+    return extra_atomic_types;
+}
+
+// Simplifies U16 and U64 shared memory operations down to U32 when aliasing is not
+// supported and atomics of the same type are not used.
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile) {
+    if (program.info.stage != Stage::Compute ||
+        profile.supports_workgroup_explicit_memory_layout) {
+        return;
+    }
+
+    const auto atomic_types = CalculateSpecialSharedAtomicTypes(program);
+    if (True(atomic_types & IR::Type::U16) && True(atomic_types & IR::Type::U64)) {
+        // If both other atomic types are used, there is nothing to do.
+        return;
+    }
+
+    // Iterate through shared load/store U16/U64 instructions, replacing them with
+    // equivalent U32 operations when the types are not needed for atomics.
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsNon32BitSharedLoadStore(inst)) {
+                continue;
+            }
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            const IR::U32 offset{inst.Arg(0)};
+            if (False(atomic_types & IR::Type::U16)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU16: {
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{ir.LoadShared(32, false, dword_offset)};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 value{ir.BitFieldExtract(dword_value, bit_offset, ir.Imm32(16U))};
+                    inst.ReplaceUsesWithAndRemove(ir.UConvert(16, value));
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU16: {
+                    const IR::U32 value{ir.UConvert(32, IR::U16{inst.Arg(1)})};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{ir.LoadShared(32, false, dword_offset)};
+                    const IR::U32 new_dword_value{
+                        ir.BitFieldInsert(dword_value, value, bit_offset, ir.Imm32(16U))};
+                    ir.WriteShared(32, new_dword_value, dword_offset);
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+            if (False(atomic_types & IR::Type::U64)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU64: {
+                    const IR::U32 value0{ir.LoadShared(32, false, offset)};
+                    const IR::U32 value1{ir.LoadShared(32, false, ir.IAdd(offset, ir.Imm32(4U)))};
+                    const IR::Value value{ir.PackUint2x32(ir.CompositeConstruct(value0, value1))};
+                    inst.ReplaceUsesWithAndRemove(value);
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU64: {
+                    const IR::Value value{ir.UnpackUint2x32(IR::U64{inst.Arg(1)})};
+                    const IR::U32 value0{ir.CompositeExtract(value, 0)};
+                    const IR::U32 value1{ir.CompositeExtract(value, 1)};
+                    ir.WriteShared(32, value0, offset);
+                    ir.WriteShared(32, value1, ir.IAdd(offset, ir.Imm32(4U)));
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+        }
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
index 839a8ddc5..a6900e180 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
@@ -34,20 +34,74 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     }
 }
 
+IR::Type CalculateSharedMemoryTypes(IR::Program& program) {
+    IR::Type used_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU16:
+            case IR::Opcode::WriteSharedU16:
+                used_types |= IR::Type::U16;
+                break;
+            case IR::Opcode::LoadSharedU32:
+            case IR::Opcode::WriteSharedU32:
+            case IR::Opcode::SharedAtomicIAdd32:
+            case IR::Opcode::SharedAtomicISub32:
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32:
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32:
+            case IR::Opcode::SharedAtomicInc32:
+            case IR::Opcode::SharedAtomicDec32:
+            case IR::Opcode::SharedAtomicAnd32:
+            case IR::Opcode::SharedAtomicOr32:
+            case IR::Opcode::SharedAtomicXor32:
+                used_types |= IR::Type::U32;
+                break;
+            case IR::Opcode::LoadSharedU64:
+            case IR::Opcode::WriteSharedU64:
+            case IR::Opcode::SharedAtomicIAdd64:
+                used_types |= IR::Type::U64;
+                break;
+            default:
+                break;
+            }
+        }
+    }
+    return used_types;
+}
+
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile) {
     if (program.info.stage != Stage::Compute) {
         return;
     }
-    // Only perform the transform if there is shared memory and either host shared memory is
-    // insufficient or the device does not support VK_KHR_workgroup_memory_explicit_layout
+
+    // Run this pass if:
+    // * There are shared memory instructions.
+    // * One of the following is true:
+    //   * Requested shared memory size is too large for the host shared memory.
+    //   * Workgroup explicit memory is not supported and multiple shared memory types are used.
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    if (shared_memory_size == 0 || (shared_memory_size <= profile.max_shared_memory_size &&
-                                    profile.supports_workgroup_explicit_memory_layout)) {
+    const auto used_types = CalculateSharedMemoryTypes(program);
+    if (used_types == IR::Type::Void || (shared_memory_size <= profile.max_shared_memory_size &&
+                                         (profile.supports_workgroup_explicit_memory_layout ||
+                                          std::popcount(static_cast<u32>(used_types)) == 1))) {
         return;
     }
+
+    // Add a buffer binding for the shared memory storage buffer.
     const u32 binding = static_cast<u32>(program.info.buffers.size());
-    IR::Type used_types{};
+    program.info.buffers.push_back({
+        .used_types = used_types,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
+
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (!IsSharedAccess(inst)) {
@@ -58,29 +112,21 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
             const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
                                            ir.Imm32(shared_memory_size));
             const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
-            // Replace shared atomics first
             switch (inst.GetOpcode()) {
             case IR::Opcode::SharedAtomicIAdd32:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
-                continue;
             case IR::Opcode::SharedAtomicIAdd64:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U64;
                 continue;
             case IR::Opcode::SharedAtomicISub32:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicSMin32:
             case IR::Opcode::SharedAtomicUMin32: {
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicSMax32:
@@ -88,73 +134,49 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
             case IR::Opcode::SharedAtomicUMax32: {
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicInc32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicDec32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicAnd32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicOr32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicXor32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
-            default:
-                break;
-            }
-            // Replace shared operations.
-            switch (inst.GetOpcode()) {
             case IR::Opcode::LoadSharedU16:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::LoadSharedU32:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::LoadSharedU64:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {}));
-                used_types |= IR::Type::U64;
                 break;
             case IR::Opcode::WriteSharedU16:
                 ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::WriteSharedU32:
                 ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
                 inst.Invalidate();
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::WriteSharedU64:
                 ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U64;
                 break;
             default:
                 break;
             }
         }
     }
-    // Add buffer binding for shared memory storage buffer.
-    program.info.buffers.push_back({
-        .used_types = used_types,
-        .inline_cbuf = AmdGpu::Buffer::Null(),
-        .buffer_type = BufferType::SharedMemory,
-        .is_written = true,
-    });
 }
 
 } // namespace Shader::Optimization
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index 9f92857d6..e17fb1c9e 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -78,6 +78,7 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);
     Shader::Optimization::LowerBufferFormatToRaw(program);
+    Shader::Optimization::SharedMemorySimplifyPass(program, profile);
     Shader::Optimization::SharedMemoryToStoragePass(program, runtime_info, profile);
     Shader::Optimization::SharedMemoryBarrierPass(program, runtime_info, profile);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
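
Note: the U16/U64-to-U32 lowering introduced in shared_memory_simplify_pass.cpp can be modeled outside the IR with ordinary C++ over a uint32_t array. The sketch below is illustrative only (lds, load_u16, and the other names are hypothetical, not part of this patch); it mirrors the BitFieldExtract/BitFieldInsert and PackUint2x32/UnpackUint2x32 sequences the pass emits.

#include <cstdint>
#include <cstdio>

static uint32_t lds[64]{}; // stand-in for the u32-typed workgroup array

// LoadSharedU16 lowering: load the containing dword, then extract 16 bits.
static uint16_t load_u16(uint32_t offset) {
    const uint32_t dword = lds[(offset & ~3u) / 4];
    const uint32_t bit_offset = (offset & 2u) * 8u; // 0 or 16
    return static_cast<uint16_t>(dword >> bit_offset);
}

// WriteSharedU16 lowering: non-atomic read-modify-write of the containing dword.
static void write_u16(uint32_t offset, uint16_t value) {
    uint32_t& dword = lds[(offset & ~3u) / 4];
    const uint32_t bit_offset = (offset & 2u) * 8u;
    dword = (dword & ~(0xffffu << bit_offset)) | (uint32_t{value} << bit_offset);
}

// LoadSharedU64 lowering: two adjacent dword loads packed into one value.
static uint64_t load_u64(uint32_t offset) {
    return (uint64_t{lds[offset / 4 + 1]} << 32) | lds[offset / 4];
}

// WriteSharedU64 lowering: unpack into two adjacent dword stores.
static void write_u64(uint32_t offset, uint64_t value) {
    lds[offset / 4] = static_cast<uint32_t>(value);
    lds[offset / 4 + 1] = static_cast<uint32_t>(value >> 32);
}

int main() {
    write_u16(6, 0xbeef); // lands in the upper half of dword 1
    write_u64(8, 0x1122334455667788ull);
    std::printf("%04x %016llx\n", static_cast<unsigned>(load_u16(6)),
                static_cast<unsigned long long>(load_u64(8)));
}

The 16-bit store is a non-atomic read-modify-write of the containing dword, which is why the pass refuses to lower a type for which shared atomics exist (see CalculateSpecialSharedAtomicTypes): two lanes touching different halves of the same dword could otherwise race.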