diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 3e2cea9e5..aaa2bb526 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -120,10 +120,8 @@ Id EmitUndefU32(EmitContext& ctx);
 Id EmitUndefU64(EmitContext& ctx);
 Id EmitLoadSharedU32(EmitContext& ctx, Id offset);
 Id EmitLoadSharedU64(EmitContext& ctx, Id offset);
-Id EmitLoadSharedU128(EmitContext& ctx, Id offset);
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value);
 void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value);
-void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
index 6ab213864..550b95f3d 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
@@ -38,24 +38,6 @@ Id EmitLoadSharedU64(EmitContext& ctx, Id offset) {
     }
 }
 
-Id EmitLoadSharedU128(EmitContext& ctx, Id offset) {
-    const Id shift_id{ctx.ConstU32(2U)};
-    const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    std::array<Id, 4> values{};
-    for (u32 i = 0; i < 4; ++i) {
-        const Id index{i == 0 ? base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))};
-        if (ctx.info.has_emulated_shared_memory) {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, index)};
-            values[i] = ctx.OpLoad(ctx.U32[1], pointer);
-        } else {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
-            values[i] = ctx.OpLoad(ctx.U32[1], pointer);
-        }
-    }
-    return ctx.OpCompositeConstruct(ctx.U32[4], values);
-}
-
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) {
     const Id shift{ctx.ConstU32(2U)};
     const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
@@ -88,20 +70,4 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) {
     }
 }
 
-void EmitWriteSharedU128(EmitContext& ctx, Id offset, Id value) {
-    const Id shift{ctx.ConstU32(2U)};
-    const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    for (u32 i = 0; i < 4; ++i) {
-        const Id index{i == 0 ? base_index : ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(i))};
-        if (ctx.info.has_emulated_shared_memory) {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, index)};
-            ctx.OpStore(pointer, ctx.OpCompositeExtract(ctx.U32[1], value, i));
-        } else {
-            const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
-            ctx.OpStore(pointer, ctx.OpCompositeExtract(ctx.U32[1], value, i));
-        }
-    }
-}
-
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 4d5e817b4..d676d205d 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -813,6 +813,8 @@ void EmitContext::DefineSharedMemory() {
     if (!info.uses_shared) {
         return;
     }
+    ASSERT(info.stage == Stage::Compute);
+
     const u32 max_shared_memory_size = profile.max_shared_memory_size;
     u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
     if (shared_memory_size == 0) {
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index b32eb6833..57d428a49 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -233,7 +233,8 @@ struct Info {
     }
 
     void AddBindings(Backend::Bindings& bnd) const {
-        const auto total_buffers = buffers.size() + (has_readconst ? 1 : 0);
+        const auto total_buffers =
+            buffers.size() + (has_readconst ? 1 : 0) + (has_emulated_shared_memory ? 1 : 0);
         bnd.buffer += total_buffers;
         bnd.unified += total_buffers + images.size() + samplers.size();
         bnd.user_data += ud_mask.NumRegs();
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 7e3d0f937..06c01878d 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -308,8 +308,6 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
         return Inst(Opcode::LoadSharedU32, offset);
     case 64:
         return Inst(Opcode::LoadSharedU64, offset);
-    case 128:
-        return Inst(Opcode::LoadSharedU128, offset);
     default:
         UNREACHABLE_MSG("Invalid bit size {}", bit_size);
     }
@@ -323,9 +321,6 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset)
     case 64:
         Inst(Opcode::WriteSharedU64, offset, value);
         break;
-    case 128:
-        Inst(Opcode::WriteSharedU128, offset, value);
-        break;
     default:
         UNREACHABLE_MSG("Invalid bit size {}", bit_size);
     }
diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp
index fdbc019e3..580156f5b 100644
--- a/src/shader_recompiler/ir/microinstruction.cpp
+++ b/src/shader_recompiler/ir/microinstruction.cpp
@@ -78,7 +78,6 @@ bool Inst::MayHaveSideEffects() const noexcept {
    case Opcode::BufferAtomicSwap32:
    case Opcode::DataAppend:
    case Opcode::DataConsume:
-   case Opcode::WriteSharedU128:
    case Opcode::WriteSharedU64:
    case Opcode::WriteSharedU32:
    case Opcode::SharedAtomicIAdd32:
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index 0d87430d2..d5e17631b 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -32,10 +32,8 @@ OPCODE(EmitPrimitive, Void,
 // Shared memory operations
 OPCODE(LoadSharedU32, U32, U32, )
 OPCODE(LoadSharedU64, U32x2, U32, )
-OPCODE(LoadSharedU128, U32x4, U32, )
 OPCODE(WriteSharedU32, Void, U32, U32, )
 OPCODE(WriteSharedU64, Void, U32, U32x2, )
-OPCODE(WriteSharedU128, Void, U32, U32x4, )
 
 // Shared atomic operations
 OPCODE(SharedAtomicIAdd32, U32, U32, U32, )
diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp
index b41e38339..fced4b362 100644
--- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp
+++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp
@@ -225,10 +225,8 @@ private:
         switch (use.user->GetOpcode()) {
         case IR::Opcode::LoadSharedU32:
         case IR::Opcode::LoadSharedU64:
-        case IR::Opcode::LoadSharedU128:
         case IR::Opcode::WriteSharedU32:
-        case IR::Opcode::WriteSharedU64:
-        case IR::Opcode::WriteSharedU128: {
+        case IR::Opcode::WriteSharedU64: {
            u32 counter = inst->Flags<u32>();
            inst->SetFlags(counter + inc);
            // Stop here
@@ -435,12 +433,9 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
            }
 
            case IR::Opcode::WriteSharedU32:
-           case IR::Opcode::WriteSharedU64:
-           case IR::Opcode::WriteSharedU128: {
+           case IR::Opcode::WriteSharedU64: {
                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-               const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32
-                                          ? 1
-                                          : (opcode == IR::Opcode::WriteSharedU64 ? 2 : 4);
+               const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2;
                const IR::U32 addr{inst.Arg(0)};
                const IR::U32 data{inst.Arg(1).Resolve()};
 
@@ -480,15 +475,12 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
                break;
            }
 
-           case IR::Opcode::LoadSharedU32: {
-           case IR::Opcode::LoadSharedU64:
-           case IR::Opcode::LoadSharedU128:
+           case IR::Opcode::LoadSharedU32:
+           case IR::Opcode::LoadSharedU64: {
                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
                const IR::U32 addr{inst.Arg(0)};
                const AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info);
-               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32
-                                          ? 1
-                                          : (opcode == IR::Opcode::LoadSharedU64 ? 2 : 4);
+               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32 ? 1 : 2;
                ASSERT_MSG(region == AttributeRegion::InputCP || region == AttributeRegion::OutputCP,
                           "Unhandled read of patchconst attribute in hull shader");
 
@@ -562,14 +554,11 @@ void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
            const auto opcode = inst.GetOpcode();
            switch (inst.GetOpcode()) {
-           case IR::Opcode::LoadSharedU32: {
-           case IR::Opcode::LoadSharedU64:
-           case IR::Opcode::LoadSharedU128:
+           case IR::Opcode::LoadSharedU32:
+           case IR::Opcode::LoadSharedU64: {
                const IR::U32 addr{inst.Arg(0)};
                AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info);
-               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32
-                                          ? 1
-                                          : (opcode == IR::Opcode::LoadSharedU64 ? 2 : 4);
+               const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32 ? 1 : 2;
                const auto GetInput = [&](IR::U32 addr, u32 off_dw) -> IR::F32 {
                    if (region == AttributeRegion::OutputCP) {
                        return ReadTessControlPointAttribute(
@@ -611,10 +600,8 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
            switch (inst.GetOpcode()) {
            case IR::Opcode::LoadSharedU32:
            case IR::Opcode::LoadSharedU64:
-           case IR::Opcode::LoadSharedU128:
            case IR::Opcode::WriteSharedU32:
-           case IR::Opcode::WriteSharedU64:
-           case IR::Opcode::WriteSharedU128: {
+           case IR::Opcode::WriteSharedU64: {
                IR::Value addr = inst.Arg(0);
                auto read_const_buffer = IR::BreadthFirstSearch(
                    addr, [](IR::Inst* maybe_tess_const) -> std::optional<IR::Inst*> {
diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h
index 0d6816ae0..3c98579a0 100644
--- a/src/shader_recompiler/ir/passes/ir_passes.h
+++ b/src/shader_recompiler/ir/passes/ir_passes.h
@@ -20,7 +20,7 @@ void FlattenExtendedUserdataPass(IR::Program& program);
 void ResourceTrackingPass(IR::Program& program);
 void CollectShaderInfoPass(IR::Program& program);
 void LowerBufferFormatToRaw(IR::Program& program);
-void LowerSharedMemToRegisters(IR::Program& program);
+void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info);
 void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
                            Stage stage);
 void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);
diff --git a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
index c109f3595..23963a991 100644
--- a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
+++ b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
@@ -1,38 +1,81 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#include <boost/container/small_vector.hpp>
+#include <unordered_map>
+
+#include "shader_recompiler/ir/ir_emitter.h"
 #include "shader_recompiler/ir/program.h"
 
 namespace Shader::Optimization {
 
-void LowerSharedMemToRegisters(IR::Program& program) {
-    boost::container::small_vector<IR::Inst*, 8> ds_writes;
-    Info& info{program.info};
+static bool IsSharedMemoryInst(const IR::Inst& inst) {
+    const auto opcode = inst.GetOpcode();
+    return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 ||
+           opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64;
+}
+
+static u32 GetSharedMemImmOffset(const IR::Inst& inst) {
+    const auto* address = inst.Arg(0).InstRecursive();
+    ASSERT(address->GetOpcode() == IR::Opcode::IAdd32);
+    const auto ir_offset = address->Arg(1);
+    ASSERT(ir_offset.IsImmediate());
+    const auto offset = ir_offset.U32();
+    // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between
+    // each register to account for 4 bytes per register times 64 threads per group. Ensure that
+    // this assumption holds, as if it does not, this approach may need to be revised.
+    ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
+    return offset;
+}
+
+static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadSharedU32:
+        inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr));
+        break;
+    case IR::Opcode::LoadSharedU64:
+        inst.ReplaceUsesWithAndRemove(
+            ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1)));
+        break;
+    case IR::Opcode::WriteSharedU32:
+        ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)});
+        inst.Invalidate();
+        break;
+    case IR::Opcode::WriteSharedU64: {
+        const auto value = inst.Arg(1);
+        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)});
+        ir.SetVectorReg(vgpr + 1, IR::U32{ir.CompositeExtract(value, 1)});
+        inst.Invalidate();
+        break;
+    }
+    default:
+        UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode());
+    }
+}
+
+void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) {
+    u32 next_vgpr_num = runtime_info.num_allocated_vgprs;
+    std::unordered_map<u32, IR::VectorReg> vgpr_map;
+    const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) {
+        const auto [it, is_new] = vgpr_map.try_emplace(offset);
+        if (is_new) {
+            ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
+            const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
+            it->second = new_vgpr;
+        }
+        return it->second;
+    };
+
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
-            const auto opcode = inst.GetOpcode();
-            if (opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64) {
-                ds_writes.emplace_back(&inst);
+            if (!IsSharedMemoryInst(inst)) {
                 continue;
             }
-            if (opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64) {
-                // Search for write instruction with same offset
-                const IR::Inst* prod = inst.Arg(0).InstRecursive();
-                const auto it = std::ranges::find_if(ds_writes, [&](const IR::Inst* write) {
-                    const IR::Inst* write_prod = write->Arg(0).InstRecursive();
-                    return write_prod->Arg(1).U32() == prod->Arg(1).U32();
-                });
-                ASSERT(it != ds_writes.end());
-                // Replace data read with value written.
-                inst.ReplaceUsesWithAndRemove((*it)->Arg(1));
-            }
+            const auto offset = GetSharedMemImmOffset(inst);
+            const auto vgpr = get_vgpr(offset);
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            ConvertSharedMemToVgpr(ir, inst, vgpr);
         }
     }
-    // We should have eliminated everything. Invalidate data write instructions.
-    for (const auto inst : ds_writes) {
-        inst->Invalidate();
-    }
 }
 
 } // namespace Shader::Optimization
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index a9f7aeb40..5a6d1d775 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -65,6 +65,10 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
 
     // Run optimization passes
     const auto stage = program.info.stage;
+    if (stage == Stage::Fragment) {
+        // Before the SSA pass, as it rewrites shared memory accesses to VGPR load/store.
+        Shader::Optimization::LowerSharedMemToRegisters(program, runtime_info);
+    }
     Shader::Optimization::SsaRewritePass(program.post_order_blocks);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
     if (info.l_stage == LogicalStage::TessellationControl) {
@@ -82,9 +86,6 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     }
     Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
     Shader::Optimization::RingAccessElimination(program, runtime_info, stage);
-    if (stage != Stage::Compute) {
-        Shader::Optimization::LowerSharedMemToRegisters(program);
-    }
     Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 8b1d5d8b3..ac6aac7b3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -535,6 +535,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings
             .descriptorType = vk::DescriptorType::eStorageBuffer,
             .pBufferInfo = &buffer_infos.back(),
         });
+        ++binding.buffer;
     }
 
     // Bind the flattened user data buffer as a UBO so it's accessible to the shader
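
The core of this change is the new lowering in lower_shared_mem_to_registers.cpp: instead of pairing each DS read with a previously seen DS write at the same offset, every shared memory access whose immediate offset matches the register-spill pattern is remapped onto a dedicated VGPR allocated past the ones the shader already uses. Below is a minimal standalone sketch of that offset-to-VGPR mapping, with plain integers standing in for the recompiler's IR types; the 256-byte stride, the 256-register cap, and `num_allocated_vgprs` mirror the assumptions asserted by the pass, while everything else here is illustrative only.

```cpp
// Standalone sketch (hypothetical simplified types, not the emulator's IR classes)
// of the mapping used above: each distinct 256-byte-aligned shared memory offset is
// assigned the next free VGPR, and repeated accesses to the same offset reuse it.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>

using u32 = std::uint32_t;

int main() {
    const u32 num_allocated_vgprs = 8; // pretend the translated shader uses v0..v7
    u32 next_vgpr = num_allocated_vgprs;
    std::unordered_map<u32, u32> vgpr_map;

    const auto get_vgpr = [&](u32 offset) {
        // One dword per lane and 64 lanes per group -> spill slots are 256 bytes apart.
        assert(offset % 256 == 0);
        const auto [it, is_new] = vgpr_map.try_emplace(offset);
        if (is_new) {
            assert(next_vgpr < 256); // GCN exposes at most 256 VGPRs
            it->second = next_vgpr++;
        }
        return it->second;
    };

    // Two spill slots; both accesses to offset 0 share one register.
    std::cout << "offset 0   -> v" << get_vgpr(0) << '\n';   // v8
    std::cout << "offset 256 -> v" << get_vgpr(256) << '\n'; // v9
    std::cout << "offset 0   -> v" << get_vgpr(0) << '\n';   // v8 again
}
```

Running the lowering before SsaRewritePass matters because it emits GetVectorReg/SetVectorReg operations, and only the subsequent SSA rewrite turns those register accesses into proper value dependencies.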