diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22a811d30..95766bc67 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -766,11 +766,11 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
                       src/shader_recompiler/ir/passes/identity_removal_pass.cpp
                       src/shader_recompiler/ir/passes/ir_passes.h
                       src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
-                      src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
                       src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
                       src/shader_recompiler/ir/passes/ring_access_elimination.cpp
                       src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
                       src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+                      src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
                       src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
                       src/shader_recompiler/ir/abstract_syntax_list.h
                       src/shader_recompiler/ir/attribute.cpp
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 3712380f5..2a5b9335e 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -242,14 +242,17 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
     ctx.AddCapability(spv::Capability::Image1D);
     ctx.AddCapability(spv::Capability::Sampled1D);
     ctx.AddCapability(spv::Capability::ImageQuery);
+    ctx.AddCapability(spv::Capability::Int8);
+    ctx.AddCapability(spv::Capability::Int16);
+    ctx.AddCapability(spv::Capability::Int64);
+    ctx.AddCapability(spv::Capability::UniformAndStorageBuffer8BitAccess);
+    ctx.AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess);
     if (info.uses_fp16) {
         ctx.AddCapability(spv::Capability::Float16);
-        ctx.AddCapability(spv::Capability::Int16);
     }
     if (info.uses_fp64) {
         ctx.AddCapability(spv::Capability::Float64);
     }
-    ctx.AddCapability(spv::Capability::Int64);
     if (info.has_storage_images) {
         ctx.AddCapability(spv::Capability::StorageImageExtendedFormats);
         ctx.AddCapability(spv::Capability::StorageImageReadWithoutFormat);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index ce65a5ccb..92cfcbb0f 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -23,10 +23,13 @@ Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value,
 Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
                    Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
-    auto& buffer = ctx.buffers[handle];
-    address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
+    const auto& buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
+    }
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
-    const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index);
+    const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
+    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
     const auto [scope, semantics]{AtomicArgs(ctx)};
     return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, value);
 }
@@ -165,17 +168,17 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co
 }
 
 Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
-    auto& buffer = ctx.buffers[binding];
-    const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value,
-                                     ctx.ConstU32(gds_addr));
+    const auto& buffer = ctx.buffers[binding];
+    const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
+    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
     const auto [scope, semantics]{AtomicArgs(ctx)};
     return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
 }
 
 Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
-    auto& buffer = ctx.buffers[binding];
-    const Id ptr = ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value,
-                                     ctx.ConstU32(gds_addr));
+    const auto& buffer = ctx.buffers[binding];
+    const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
+    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
     const auto [scope, semantics]{AtomicArgs(ctx)};
     return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);
 }
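The atomic path above and the buffer load/store helpers later in this patch share one addressing scheme: the per-buffer push-constant offset is added only when `Sirit::ValidId(buffer.offset)` holds, and the byte address is then shifted down by the element width of the alias being accessed. A minimal standalone sketch of that index math (illustrative C++, not emulator code; `ElementIndex` is a hypothetical name):

```cpp
#include <cstdint>

// A byte address becomes an element index by shifting with the element size:
// shift 2 for the 32-bit aliases, 1 for U16, 0 for U8.
constexpr uint32_t ElementIndex(uint32_t byte_address, uint32_t elem_shift) {
    return byte_address >> elem_shift;
}

static_assert(ElementIndex(16, 2) == 4);  // dword alias: byte 16 -> element 4
static_assert(ElementIndex(16, 1) == 8);  // word alias:  byte 16 -> element 8
static_assert(ElementIndex(16, 0) == 16); // byte alias: addressed directly
```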
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index ae77ed413..cc7b7e097 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -160,21 +160,25 @@ void EmitGetGotoVariable(EmitContext&) {
     UNREACHABLE_MSG("Unreachable instruction");
 }
 
+using BufferAlias = EmitContext::BufferAlias;
+
 Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) {
-    u32 flatbuf_off_dw = inst->Flags<u32>();
-    ASSERT(ctx.srt_flatbuf.binding >= 0);
-    ASSERT(flatbuf_off_dw > 0);
-    Id index = ctx.ConstU32(flatbuf_off_dw);
-    auto& buffer = ctx.srt_flatbuf;
-    const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
+    const u32 flatbuf_off_dw = inst->Flags<u32>();
+    const auto& srt_flatbuf = ctx.buffers.back();
+    ASSERT(srt_flatbuf.binding >= 0 && flatbuf_off_dw > 0 &&
+           srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
+    const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
+    const Id ptr{
+        ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(flatbuf_off_dw))};
     return ctx.OpLoad(ctx.U32[1], ptr);
 }
 
 Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
-    auto& buffer = ctx.buffers[handle];
+    const auto& buffer = ctx.buffers[handle];
     index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords);
-    const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
-    return ctx.OpLoad(buffer.data_types->Get(1), ptr);
+    const auto [id, pointer_type] = buffer[BufferAlias::U32];
+    const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
+    return ctx.OpLoad(ctx.U32[1], ptr);
 }
 
 Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
@@ -184,7 +188,7 @@ Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
                                       rate_idx == 0 ? ctx.u32_zero_value : ctx.u32_one_value));
 }
 
-Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
+static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
     if (IR::IsPosition(attr)) {
         ASSERT(attr == IR::Attribute::Position0);
         const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]);
@@ -285,6 +289,8 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
         return EmitReadStepRate(ctx, 0);
     case IR::Attribute::InstanceId1:
         return EmitReadStepRate(ctx, 1);
+    case IR::Attribute::WorkgroupIndex:
+        return ctx.workgroup_index_id;
     case IR::Attribute::WorkgroupId:
         return ctx.OpCompositeExtract(ctx.U32[1], ctx.OpLoad(ctx.U32[3], ctx.workgroup_id), comp);
     case IR::Attribute::LocalInvocationId:
@@ -396,140 +402,158 @@ void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value) {
     ctx.OpStore(pointer, value);
 }
 
-template <u32 N>
-static Id EmitLoadBufferU32xN(EmitContext& ctx, u32 handle, Id address) {
-    auto& buffer = ctx.buffers[handle];
-    address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
+template <u32 N, BufferAlias alias>
+static Id EmitLoadBufferB32xN(EmitContext& ctx, u32 handle, Id address) {
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
+    const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
+    const auto [id, pointer_type] = spv_buffer[alias];
     if constexpr (N == 1) {
-        const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
-        return ctx.OpLoad(buffer.data_types->Get(1), ptr);
+        const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
+        return ctx.OpLoad(data_types[1], ptr);
     } else {
         boost::container::static_vector<Id, N> ids;
         for (u32 i = 0; i < N; i++) {
             const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i));
-            const Id ptr{
-                ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index_i)};
-            ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr));
+            const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i)};
+            ids.push_back(ctx.OpLoad(data_types[1], ptr));
         }
-        return ctx.OpCompositeConstruct(buffer.data_types->Get(N), ids);
+        return ctx.OpCompositeConstruct(data_types[N], ids);
     }
 }
 
 Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
-    const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(3u))};
-    const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
-    const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
-    return ctx.OpBitFieldUExtract(ctx.U32[1], dword, bit_offset, ctx.ConstU32(8u));
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
+    const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
+    const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
+    return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr));
 }
 
 Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
-    const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(2u))};
-    const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
-    const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
-    return ctx.OpBitFieldUExtract(ctx.U32[1], dword, bit_offset, ctx.ConstU32(16u));
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
+    const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
+    const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
+    return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr));
 }
 
 Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
-    return EmitLoadBufferU32xN<1>(ctx, handle, address);
+    return EmitLoadBufferB32xN<1, BufferAlias::U32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
-    return EmitLoadBufferU32xN<2>(ctx, handle, address);
+    return EmitLoadBufferB32xN<2, BufferAlias::U32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
-    return EmitLoadBufferU32xN<3>(ctx, handle, address);
+    return EmitLoadBufferB32xN<3, BufferAlias::U32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
-    return EmitLoadBufferU32xN<4>(ctx, handle, address);
+    return EmitLoadBufferB32xN<4, BufferAlias::U32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return ctx.OpBitcast(ctx.F32[1], EmitLoadBufferU32(ctx, inst, handle, address));
+    return EmitLoadBufferB32xN<1, BufferAlias::F32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return ctx.OpBitcast(ctx.F32[2], EmitLoadBufferU32x2(ctx, inst, handle, address));
+    return EmitLoadBufferB32xN<2, BufferAlias::F32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return ctx.OpBitcast(ctx.F32[3], EmitLoadBufferU32x3(ctx, inst, handle, address));
+    return EmitLoadBufferB32xN<3, BufferAlias::F32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
-    return ctx.OpBitcast(ctx.F32[4], EmitLoadBufferU32x4(ctx, inst, handle, address));
+    return EmitLoadBufferB32xN<4, BufferAlias::F32>(ctx, handle, address);
 }
 
 Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     UNREACHABLE_MSG("SPIR-V instruction");
 }
 
-template <u32 N>
-static void EmitStoreBufferU32xN(EmitContext& ctx, u32 handle, Id address, Id value) {
-    auto& buffer = ctx.buffers[handle];
-    address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
+template <u32 N, BufferAlias alias>
+static void EmitStoreBufferB32xN(EmitContext& ctx, u32 handle, Id address, Id value) {
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
+    const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
+    const auto [id, pointer_type] = spv_buffer[alias];
     if constexpr (N == 1) {
-        const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
+        const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
         ctx.OpStore(ptr, value);
     } else {
         for (u32 i = 0; i < N; i++) {
             const Id index_i = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i));
-            const Id ptr =
-                ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index_i);
-            ctx.OpStore(ptr, ctx.OpCompositeExtract(buffer.data_types->Get(1), value, i));
+            const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i);
+            ctx.OpStore(ptr, ctx.OpCompositeExtract(data_types[1], value, i));
         }
     }
 }
 
 void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
-    const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(3u))};
-    const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
-    const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
-    const Id new_val{ctx.OpBitFieldInsert(ctx.U32[1], dword, value, bit_offset, ctx.ConstU32(8u))};
-    EmitStoreBufferU32xN<1>(ctx, handle, address, new_val);
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
+    const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
+    const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
+    ctx.OpStore(ptr, ctx.OpUConvert(ctx.U8, value));
 }
 
 void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
-    const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(2u))};
-    const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))};
-    const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)};
-    const Id new_val{ctx.OpBitFieldInsert(ctx.U32[1], dword, value, bit_offset, ctx.ConstU32(16u))};
-    EmitStoreBufferU32xN<1>(ctx, handle, address, new_val);
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
+    const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
+    const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
+    ctx.OpStore(ptr, ctx.OpUConvert(ctx.U16, value));
 }
 
 void EmitStoreBufferU32(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32xN<1>(ctx, handle, address, value);
+    EmitStoreBufferB32xN<1, BufferAlias::U32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32xN<2>(ctx, handle, address, value);
+    EmitStoreBufferB32xN<2, BufferAlias::U32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32xN<3>(ctx, handle, address, value);
+    EmitStoreBufferB32xN<3, BufferAlias::U32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32xN<4>(ctx, handle, address, value);
+    EmitStoreBufferB32xN<4, BufferAlias::U32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[1], value));
+    EmitStoreBufferB32xN<1, BufferAlias::F32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32x2(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[2], value));
+    EmitStoreBufferB32xN<2, BufferAlias::F32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32x3(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[3], value));
+    EmitStoreBufferB32xN<3, BufferAlias::F32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
-    EmitStoreBufferU32x4(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[4], value));
+    EmitStoreBufferB32xN<4, BufferAlias::F32>(ctx, handle, address, value);
 }
 
 void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
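The U8/U16 paths above no longer round-trip through the containing dword with `OpBitFieldUExtract`/`OpBitFieldInsert`; they index a byte- or word-aliased view of the binding directly and widen with `OpUConvert`. A self-contained sketch of why the two load strategies agree (hypothetical helper names, little-endian layout assumed as on the target hardware):

```cpp
#include <cstdint>

// Old path: fetch the containing dword, then extract the byte with a shift and mask.
constexpr uint32_t LoadU8ViaDword(uint32_t dword, uint32_t byte_address) {
    const uint32_t bit_offset = (byte_address & 3u) * 8u;
    return (dword >> bit_offset) & 0xFFu;
}

// New path: index a byte-aliased view of the same memory and zero-extend.
constexpr uint32_t LoadU8ViaByteAlias(const uint8_t* bytes, uint32_t byte_address) {
    return bytes[byte_address];
}

// Little-endian: dword 0xDDCCBBAA is the byte sequence AA BB CC DD.
constexpr uint8_t kBytes[4]{0xAA, 0xBB, 0xCC, 0xDD};
static_assert(LoadU8ViaDword(0xDDCCBBAAu, 2) == LoadU8ViaByteAlias(kBytes, 2));
```

The new store path is also race-free for adjacent bytes, since the old read-modify-write of a whole dword could clobber neighboring lanes' writes.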
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
index 550b95f3d..8b1610d61 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
@@ -9,65 +9,35 @@ namespace Shader::Backend::SPIRV {
 
 Id EmitLoadSharedU32(EmitContext& ctx, Id offset) {
     const Id shift_id{ctx.ConstU32(2U)};
     const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    if (ctx.info.has_emulated_shared_memory) {
-        const Id pointer =
-            ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index);
-        return ctx.OpLoad(ctx.U32[1], pointer);
-    } else {
-        const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index);
-        return ctx.OpLoad(ctx.U32[1], pointer);
-    }
+    const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index);
+    return ctx.OpLoad(ctx.U32[1], pointer);
 }
 
 Id EmitLoadSharedU64(EmitContext& ctx, Id offset) {
     const Id shift_id{ctx.ConstU32(2U)};
     const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
     const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))};
-    if (ctx.info.has_emulated_shared_memory) {
-        const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, base_index)};
-        const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, next_index)};
-        return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer),
-                                        ctx.OpLoad(ctx.U32[1], rhs_pointer));
-    } else {
-        const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)};
-        const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)};
-        return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer),
-                                        ctx.OpLoad(ctx.U32[1], rhs_pointer));
-    }
+    const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)};
+    const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)};
+    return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer),
+                                    ctx.OpLoad(ctx.U32[1], rhs_pointer));
 }
 
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) {
     const Id shift{ctx.ConstU32(2U)};
     const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    if (ctx.info.has_emulated_shared_memory) {
-        const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                             ctx.u32_zero_value, word_offset);
-        ctx.OpStore(pointer, value);
-    } else {
-        const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset);
-        ctx.OpStore(pointer, value);
-    }
+    const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset);
+    ctx.OpStore(pointer, value);
 }
 
 void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) {
     const Id shift{ctx.ConstU32(2U)};
     const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
     const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))};
-    if (ctx.info.has_emulated_shared_memory) {
-        const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, word_offset)};
-        const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
-                                               ctx.u32_zero_value, next_offset)};
-        ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U));
-        ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U));
-    } else {
-        const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)};
-        const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)};
-        ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U));
-        ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U));
-    }
+    const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)};
+    const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)};
+    ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U));
+    ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U));
 }
 
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
index a0a3ed8ff..724550cd6 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
@@ -11,6 +11,9 @@ void EmitPrologue(EmitContext& ctx) {
     if (ctx.stage == Stage::Fragment) {
         ctx.DefineInterpolatedAttribs();
     }
+    if (ctx.info.loads.Get(IR::Attribute::WorkgroupIndex)) {
+        ctx.DefineWorkgroupIndex();
+    }
     ctx.DefineBufferOffsets();
 }
 
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index d676d205d..da20dc691 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -5,7 +5,6 @@
 #include "common/div_ceil.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"
 #include "shader_recompiler/frontend/fetch_shader.h"
-#include "shader_recompiler/ir/passes/srt.h"
 #include "shader_recompiler/runtime_info.h"
 #include "video_core/amdgpu/types.h"
 
@@ -107,6 +106,8 @@ Id EmitContext::Def(const IR::Value& value) {
 void EmitContext::DefineArithmeticTypes() {
     void_id = Name(TypeVoid(), "void_id");
     U1[1] = Name(TypeBool(), "bool_id");
+    U8 = Name(TypeUInt(8), "u8_id");
+    U16 = Name(TypeUInt(16), "u16_id");
     if (info.uses_fp16) {
         F16[1] = Name(TypeFloat(16), "f16_id");
         U16 = Name(TypeUInt(16), "u16_id");
@@ -193,6 +194,9 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f
 
 void EmitContext::DefineBufferOffsets() {
     for (BufferDefinition& buffer : buffers) {
+        if (buffer.buffer_type != BufferType::Guest) {
+            continue;
+        }
         const u32 binding = buffer.binding;
         const u32 half = PushData::BufOffsetIndex + (binding >> 4);
         const u32 comp = (binding & 0xf) >> 2;
@@ -211,8 +215,7 @@ void EmitContext::DefineInterpolatedAttribs() {
     if (!profile.needs_manual_interpolation) {
         return;
     }
-    // Iterate all input attributes, load them and manually interpolate with barycentric
-    // coordinates.
+    // Iterate all input attributes, load them and manually interpolate.
     for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) {
         const auto& input = runtime_info.fs_info.inputs[i];
         const u32 semantic = input.param_index;
@@ -237,6 +240,20 @@ void EmitContext::DefineInterpolatedAttribs() {
     }
 }
 
+void EmitContext::DefineWorkgroupIndex() {
+    const Id workgroup_id_val{OpLoad(U32[3], workgroup_id)};
+    const Id workgroup_x{OpCompositeExtract(U32[1], workgroup_id_val, 0)};
+    const Id workgroup_y{OpCompositeExtract(U32[1], workgroup_id_val, 1)};
+    const Id workgroup_z{OpCompositeExtract(U32[1], workgroup_id_val, 2)};
+    const Id num_workgroups{OpLoad(U32[3], num_workgroups_id)};
+    const Id num_workgroups_x{OpCompositeExtract(U32[1], num_workgroups, 0)};
+    const Id num_workgroups_y{OpCompositeExtract(U32[1], num_workgroups, 1)};
+    workgroup_index_id =
+        OpIAdd(U32[1], OpIAdd(U32[1], workgroup_x, OpIMul(U32[1], workgroup_y, num_workgroups_x)),
+               OpIMul(U32[1], workgroup_z, OpIMul(U32[1], num_workgroups_x, num_workgroups_y)));
+    Name(workgroup_index_id, "workgroup_index");
+}
+
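`DefineWorkgroupIndex` flattens the 3D workgroup id into a linear index using the dispatch dimensions from `NumWorkgroups`. The emitted SPIR-V reduces to the usual row-major formula; a minimal sketch (hypothetical function name, plain C++ rather than emitter calls):

```cpp
#include <cstdint>

// index = x + y * num_x + z * num_x * num_y, row-major over the dispatch grid.
constexpr uint32_t WorkgroupIndex(uint32_t x, uint32_t y, uint32_t z,
                                  uint32_t num_x, uint32_t num_y) {
    return x + y * num_x + z * num_x * num_y;
}

// For a 4x5 grid in x/y, workgroup (1, 2, 1) flattens to 1 + 2*4 + 1*4*5 = 29.
static_assert(WorkgroupIndex(1, 2, 1, 4, 5) == 29);
```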
 Id MakeDefaultValue(EmitContext& ctx, u32 default_value) {
     switch (default_value) {
     case 0:
@@ -305,9 +322,16 @@ void EmitContext::DefineInputs() {
         break;
     }
     case LogicalStage::Fragment:
-        frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
-        frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output);
-        front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input);
+        if (info.loads.GetAny(IR::Attribute::FragCoord)) {
+            frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
+        }
+        if (info.stores.Get(IR::Attribute::Depth)) {
+            frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output);
+        }
+        if (info.loads.Get(IR::Attribute::IsFrontFace)) {
+            front_facing =
+                DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input);
+        }
         if (profile.needs_manual_interpolation) {
             gl_bary_coord_id =
                 DefineVariable(F32[3], spv::BuiltIn::BaryCoordKHR, spv::StorageClass::Input);
@@ -342,9 +366,19 @@ void EmitContext::DefineInputs() {
         }
         break;
     case LogicalStage::Compute:
-        workgroup_id = DefineVariable(U32[3], spv::BuiltIn::WorkgroupId, spv::StorageClass::Input);
-        local_invocation_id =
-            DefineVariable(U32[3], spv::BuiltIn::LocalInvocationId, spv::StorageClass::Input);
+        if (info.loads.GetAny(IR::Attribute::WorkgroupIndex) ||
+            info.loads.GetAny(IR::Attribute::WorkgroupId)) {
+            workgroup_id =
+                DefineVariable(U32[3], spv::BuiltIn::WorkgroupId, spv::StorageClass::Input);
+        }
+        if (info.loads.GetAny(IR::Attribute::WorkgroupIndex)) {
+            num_workgroups_id =
+                DefineVariable(U32[3], spv::BuiltIn::NumWorkgroups, spv::StorageClass::Input);
+        }
+        if (info.loads.GetAny(IR::Attribute::LocalInvocationId)) {
+            local_invocation_id =
+                DefineVariable(U32[3], spv::BuiltIn::LocalInvocationId, spv::StorageClass::Input);
+        }
         break;
     case LogicalStage::Geometry: {
         primitive_id = DefineVariable(U32[1], spv::BuiltIn::PrimitiveId, spv::StorageClass::Input);
@@ -588,78 +622,74 @@ void EmitContext::DefinePushDataBlock() {
     interfaces.push_back(push_data_block);
 }
 
-void EmitContext::DefineBuffers() {
-    boost::container::small_vector<Id, 4> type_ids;
-    const auto define_struct = [&](Id record_array_type, bool is_instance_data,
-                                   std::optional<std::string> explicit_name = {}) {
-        const Id struct_type{TypeStruct(record_array_type)};
-        if (std::ranges::find(type_ids, record_array_type.value, &Id::value) != type_ids.end()) {
-            return struct_type;
-        }
-        Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
-        auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage)
-                                     : fmt::format("{}_cbuf_block_f32", stage);
-        name = explicit_name.value_or(name);
-        Name(struct_type, name);
+EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_written, u32 elem_shift,
+                                                 BufferType buffer_type, Id data_type) {
+    // Define array type.
+    const Id max_num_items = ConstU32(u32(profile.max_ubo_size) >> elem_shift);
+    const Id record_array_type{is_storage ? TypeRuntimeArray(data_type)
+                                          : TypeArray(data_type, max_num_items)};
+    // Define block struct type. Don't perform decorations twice on the same Id.
+    const Id struct_type{TypeStruct(record_array_type)};
+    if (std::ranges::find(buf_type_ids, record_array_type.value, &Id::value) ==
+        buf_type_ids.end()) {
+        Decorate(record_array_type, spv::Decoration::ArrayStride, 1 << elem_shift);
         Decorate(struct_type, spv::Decoration::Block);
         MemberName(struct_type, 0, "data");
         MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
-        type_ids.push_back(record_array_type);
-        return struct_type;
-    };
-
-    if (info.has_readconst) {
-        const Id data_type = U32[1];
-        const auto storage_class = spv::StorageClass::Uniform;
-        const Id pointer_type = TypePointer(storage_class, data_type);
-        const Id record_array_type{
-            TypeArray(U32[1], ConstU32(static_cast<u32>(info.flattened_ud_buf.size())))};
-
-        const Id struct_type{define_struct(record_array_type, false, "srt_flatbuf_ty")};
-
-        const Id struct_pointer_type{TypePointer(storage_class, struct_type)};
-        const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
-        Decorate(id, spv::Decoration::Binding, binding.unified++);
-        Decorate(id, spv::Decoration::DescriptorSet, 0U);
-        Name(id, "srt_flatbuf_ubo");
-
-        srt_flatbuf = {
-            .id = id,
-            .binding = binding.buffer++,
-            .pointer_type = pointer_type,
-        };
-        interfaces.push_back(id);
+        buf_type_ids.push_back(record_array_type);
     }
+    // Define buffer binding interface.
+    const auto storage_class =
+        is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform;
+    const Id struct_pointer_type{TypePointer(storage_class, struct_type)};
+    const Id pointer_type = TypePointer(storage_class, data_type);
+    const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
+    Decorate(id, spv::Decoration::Binding, binding.unified);
+    Decorate(id, spv::Decoration::DescriptorSet, 0U);
+    if (is_storage && !is_written) {
+        Decorate(id, spv::Decoration::NonWritable);
+    }
+    switch (buffer_type) {
+    case Shader::BufferType::GdsBuffer:
+        Name(id, "gds_buffer");
+        break;
+    case Shader::BufferType::ReadConstUbo:
+        Name(id, "srt_flatbuf_ubo");
+        break;
+    case Shader::BufferType::SharedMemory:
+        Name(id, "ssbo_shmem");
+        break;
+    default:
+        Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "ubo", binding.buffer));
+    }
+    interfaces.push_back(id);
+    return {id, pointer_type};
+};
 
+void EmitContext::DefineBuffers() {
     for (const auto& desc : info.buffers) {
-        const auto sharp = desc.GetSharp(info);
-        const bool is_storage = desc.IsStorage(sharp, profile);
-        const u32 array_size = profile.max_ubo_size >> 2;
-        const auto* data_types = True(desc.used_types & IR::Type::F32) ? &F32 : &U32;
-        const Id data_type = (*data_types)[1];
-        const Id record_array_type{is_storage ? TypeRuntimeArray(data_type)
-                                              : TypeArray(data_type, ConstU32(array_size))};
-        const Id struct_type{define_struct(record_array_type, desc.is_instance_data)};
+        const auto buf_sharp = desc.GetSharp(info);
+        const bool is_storage = desc.IsStorage(buf_sharp, profile);
 
-        const auto storage_class =
-            is_storage ? spv::StorageClass::StorageBuffer : spv::StorageClass::Uniform;
-        const Id struct_pointer_type{TypePointer(storage_class, struct_type)};
-        const Id pointer_type = TypePointer(storage_class, data_type);
-        const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
-        Decorate(id, spv::Decoration::Binding, binding.unified++);
-        Decorate(id, spv::Decoration::DescriptorSet, 0U);
-        if (is_storage && !desc.is_written) {
-            Decorate(id, spv::Decoration::NonWritable);
+        // Define aliases depending on the shader usage.
+        auto& spv_buffer = buffers.emplace_back(binding.buffer++, desc.buffer_type);
+        if (True(desc.used_types & IR::Type::U32)) {
+            spv_buffer[BufferAlias::U32] =
+                DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, U32[1]);
         }
-        Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sharp_idx));
-
-        buffers.push_back({
-            .id = id,
-            .binding = binding.buffer++,
-            .data_types = data_types,
-            .pointer_type = pointer_type,
-        });
-        interfaces.push_back(id);
+        if (True(desc.used_types & IR::Type::F32)) {
+            spv_buffer[BufferAlias::F32] =
+                DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, F32[1]);
+        }
+        if (True(desc.used_types & IR::Type::U16)) {
+            spv_buffer[BufferAlias::U16] =
+                DefineBuffer(is_storage, desc.is_written, 1, desc.buffer_type, U16);
+        }
+        if (True(desc.used_types & IR::Type::U8)) {
+            spv_buffer[BufferAlias::U8] =
+                DefineBuffer(is_storage, desc.is_written, 0, desc.buffer_type, U8);
+        }
+        ++binding.unified;
     }
 }
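`DefineBuffer` derives all per-alias layout from `elem_shift`: the SPIR-V `ArrayStride` is `1 << elem_shift`, and the UBO-case array bound is `max_ubo_size >> elem_shift`. A small sketch of that relationship (illustrative types, not emulator code):

```cpp
#include <cstdint>

// elem_shift selects the element width, the array stride, and how many
// elements of that alias fit inside the profile's maximum UBO size.
struct AliasLayout {
    uint32_t elem_shift;
    constexpr uint32_t Stride() const { return 1u << elem_shift; }
    constexpr uint32_t MaxItems(uint32_t max_ubo_size) const {
        return max_ubo_size >> elem_shift;
    }
};

static_assert(AliasLayout{2}.Stride() == 4);             // U32/F32 alias
static_assert(AliasLayout{1}.MaxItems(65536) == 32768);  // U16 alias
static_assert(AliasLayout{0}.Stride() == 1);             // U8 alias
```

Note that each alias consumes a distinct global variable but all aliases of one guest buffer share a single descriptor binding, which is why `binding.unified` is incremented once per buffer rather than once per alias.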
@@ -809,51 +839,18 @@ void EmitContext::DefineImagesAndSamplers() {
 }
 
 void EmitContext::DefineSharedMemory() {
-    static constexpr size_t DefaultSharedMemSize = 2_KB;
     if (!info.uses_shared) {
         return;
     }
     ASSERT(info.stage == Stage::Compute);
-
-    const u32 max_shared_memory_size = profile.max_shared_memory_size;
-    u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    if (shared_memory_size == 0) {
-        shared_memory_size = DefaultSharedMemSize;
-    }
-
+    const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
     const u32 num_elements{Common::DivCeil(shared_memory_size, 4U)};
     const Id type{TypeArray(U32[1], ConstU32(num_elements))};
-
-    if (shared_memory_size <= max_shared_memory_size) {
-        shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type);
-        shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]);
-        shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup);
-        Name(shared_memory_u32, "shared_mem");
-        interfaces.push_back(shared_memory_u32);
-    } else {
-        shared_memory_u32_type = TypePointer(spv::StorageClass::StorageBuffer, type);
-        shared_u32 = TypePointer(spv::StorageClass::StorageBuffer, U32[1]);
-
-        Decorate(type, spv::Decoration::ArrayStride, 4);
-
-        const Id struct_type{TypeStruct(type)};
-        Name(struct_type, "shared_memory_buf");
-        Decorate(struct_type, spv::Decoration::Block);
-        MemberName(struct_type, 0, "data");
-        MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
-
-        const Id struct_pointer_type{TypePointer(spv::StorageClass::StorageBuffer, struct_type)};
-        const Id ssbo_id{AddGlobalVariable(struct_pointer_type, spv::StorageClass::StorageBuffer)};
-        Decorate(ssbo_id, spv::Decoration::Binding, binding.unified++);
-        Decorate(ssbo_id, spv::Decoration::DescriptorSet, 0U);
-        Name(ssbo_id, "shared_mem_ssbo");
-
-        shared_memory_u32 = ssbo_id;
-
-        info.has_emulated_shared_memory = true;
-        info.shared_memory_size = shared_memory_size;
-        interfaces.push_back(ssbo_id);
-    }
+    shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type);
+    shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]);
+    shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup);
+    Name(shared_memory_u32, "shared_mem");
+    interfaces.push_back(shared_memory_u32);
 }
 
 Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 23fca4212..0fe6e336c 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -8,7 +8,7 @@
 
 #include "shader_recompiler/backend/bindings.h"
 #include "shader_recompiler/info.h"
-#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/ir/value.h"
 #include "shader_recompiler/profile.h"
 
 namespace Shader::Backend::SPIRV {
@@ -45,6 +45,7 @@ public:
 
     void DefineBufferOffsets();
     void DefineInterpolatedAttribs();
+    void DefineWorkgroupIndex();
 
     [[nodiscard]] Id DefineInput(Id type, std::optional<u32> location = std::nullopt,
                                  std::optional<spv::BuiltIn> builtin = std::nullopt) {
@@ -200,8 +201,10 @@ public:
     std::array<Id, 30> patches{};
 
     Id workgroup_id{};
+    Id num_workgroups_id{};
+    Id workgroup_index_id{};
     Id local_invocation_id{};
-    Id invocation_id{}; // for instanced geoshaders or output vertices within TCS patch
+    Id invocation_id{};
     Id subgroup_local_invocation_id{};
     Id image_u32{};
 
@@ -227,18 +230,38 @@ public:
         bool is_storage = false;
     };
 
-    struct BufferDefinition {
+    enum class BufferAlias : u32 {
+        U8,
+        U16,
+        U32,
+        F32,
+        NumAlias,
+    };
+
+    struct BufferSpv {
         Id id;
-        Id offset;
-        Id offset_dwords;
-        u32 binding;
-        const VectorIds* data_types;
         Id pointer_type;
     };
 
+    struct BufferDefinition {
+        u32 binding;
+        BufferType buffer_type;
+        Id offset;
+        Id offset_dwords;
+        std::array<BufferSpv, u32(BufferAlias::NumAlias)> aliases;
+
+        const BufferSpv& operator[](BufferAlias alias) const {
+            return aliases[u32(alias)];
+        }
+
+        BufferSpv& operator[](BufferAlias alias) {
+            return aliases[u32(alias)];
+        }
+    };
+
     Bindings& binding;
+    boost::container::small_vector<Id, 16> buf_type_ids;
     boost::container::small_vector<BufferDefinition, 16> buffers;
-    BufferDefinition srt_flatbuf;
     boost::container::small_vector<TextureDefinition, 8> images;
     boost::container::small_vector<Id, 4> samplers;
 
@@ -279,6 +302,9 @@ private:
     SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id, u32 num_components,
                                     bool output);
 
+    BufferSpv DefineBuffer(bool is_storage, bool is_written, u32 elem_shift, BufferType buffer_type,
+                           Id data_type);
+
     Id DefineFloat32ToUfloatM5(u32 mantissa_bits, std::string_view name);
     Id DefineUfloatM5ToFloat32(u32 mantissa_bits, std::string_view name);
 };
diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index 62c0423dd..460f8913c 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -176,6 +176,13 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     const IR::VectorReg data0{inst.src[1].code};
     const IR::VectorReg data1{inst.src[2].code};
+    const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+    if (info.stage == Stage::Fragment) {
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
+                   "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(GetScratchVgpr(offset), ir.GetVectorReg(data0));
+        return;
+    }
     if (is_pair) {
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
@@ -195,14 +202,12 @@
                            addr1);
         }
     } else if (bit_size == 64) {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::Value data =
             ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
         ir.WriteShared(bit_size, data, addr0);
     } else {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
     }
 }
@@ -223,6 +228,13 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride
                          const GcnInst& inst) {
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     IR::VectorReg dst_reg{inst.dst[0].code};
+    const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+    if (info.stage == Stage::Fragment) {
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
+                   "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset)));
+        return;
+    }
     if (is_pair) {
         // Pair loads are either 32 or 64-bit
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
@@ -243,14 +255,12 @@
             ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)});
         }
     } else if (bit_size == 64) {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
         ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)});
         ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)});
     } else {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)};
         ir.SetVectorReg(dst_reg, data);
     }
diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp
index ece35093a..0abef2e81 100644
--- a/src/shader_recompiler/frontend/translate/export.cpp
+++ b/src/shader_recompiler/frontend/translate/export.cpp
@@ -7,7 +7,7 @@
 
 namespace Shader::Gcn {
 
-u32 SwizzleMrtComponent(const FragmentRuntimeInfo::PsColorBuffer& color_buffer, u32 comp) {
+u32 SwizzleMrtComponent(const PsColorBuffer& color_buffer, u32 comp) {
     const auto [r, g, b, a] = color_buffer.swizzle;
     const std::array swizzle_array = {r, g, b, a};
     const auto swizzled_comp_type = static_cast<AmdGpu::CompSwizzle>(swizzle_array[comp]);
@@ -16,7 +16,7 @@ u32 SwizzleMrtComponent(const FragmentRuntimeInfo::PsColorBuffer& color_buffer,
 }
 
 void Translator::ExportMrtValue(IR::Attribute attribute, u32 comp, const IR::F32& value,
-                                const FragmentRuntimeInfo::PsColorBuffer& color_buffer) {
+                                const PsColorBuffer& color_buffer) {
     auto converted = ApplyWriteNumberConversion(ir, value, color_buffer.num_conversion);
     if (color_buffer.needs_unorm_fixup) {
         // FIXME: Fix-up for GPUs where float-to-unorm rounding is off from expected.
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index 7f5504663..7f1bcb33e 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -4,7 +4,6 @@
 #include "common/config.h"
 #include "common/io_file.h"
 #include "common/path_util.h"
-#include "shader_recompiler/exception.h"
 #include "shader_recompiler/frontend/fetch_shader.h"
 #include "shader_recompiler/frontend/translate/translate.h"
 #include "shader_recompiler/info.h"
@@ -21,9 +20,14 @@
 
 namespace Shader::Gcn {
 
+static u32 next_vgpr_num;
+static std::unordered_map<u32, IR::VectorReg> vgpr_map;
+
 Translator::Translator(IR::Block* block_, Info& info_, const RuntimeInfo& runtime_info_,
                        const Profile& profile_)
-    : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {}
+    : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {
+    next_vgpr_num = vgpr_map.empty() ? runtime_info.num_allocated_vgprs : next_vgpr_num;
+}
 
 void Translator::EmitPrologue() {
     ir.Prologue();
@@ -179,8 +183,21 @@ void Translator::EmitPrologue() {
     default:
         UNREACHABLE_MSG("Unknown shader stage");
     }
+
+    // Clear any scratch vgpr mappings for next shader.
+    vgpr_map.clear();
 }
 
+IR::VectorReg Translator::GetScratchVgpr(u32 offset) {
+    const auto [it, is_new] = vgpr_map.try_emplace(offset);
+    if (is_new) {
+        ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
+        const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
+        it->second = new_vgpr;
+    }
+    return it->second;
+};
+
 template <typename T>
 T Translator::GetSrc(const InstOperand& operand) {
     constexpr bool is_float = std::is_same_v<T, IR::F32>;
@@ -490,7 +507,6 @@ void Translator::EmitFetch(const GcnInst& inst) {
         info.buffers.push_back({
             .sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4),
             .used_types = IR::Type::F32,
-            .is_instance_data = true,
            .instance_attrib = attrib.semantic,
         });
     }
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index 287885854..563881a8e 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -309,7 +309,7 @@ private:
                      const IR::F32& x_res, const IR::F32& y_res, const IR::F32& z_res);
 
     void ExportMrtValue(IR::Attribute attribute, u32 comp, const IR::F32& value,
-                        const FragmentRuntimeInfo::PsColorBuffer& color_buffer);
+                        const PsColorBuffer& color_buffer);
     void ExportMrtCompressed(IR::Attribute attribute, u32 idx, const IR::U32& value);
     void ExportMrtUncompressed(IR::Attribute attribute, u32 comp, const IR::F32& value);
     void ExportCompressed(IR::Attribute attribute, u32 idx, const IR::U32& value);
@@ -317,6 +317,8 @@ private:
 
     void LogMissingOpcode(const GcnInst& inst);
 
+    IR::VectorReg GetScratchVgpr(u32 offset);
+
 private:
     IR::IREmitter ir;
     Info& info;
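`GetScratchVgpr` backs each distinct 256-byte-aligned LDS offset with a dedicated VGPR above the shader's allocated register count, which is what lets fragment-stage `DS_READ`/`DS_WRITE` become plain register moves. A standalone sketch of the allocation strategy (free function with explicit state instead of the translator's file-scope statics):

```cpp
#include <cassert>
#include <cstdint>
#include <unordered_map>

// Each new offset gets the next free register; repeated offsets reuse their
// mapping. The 256-register cap mirrors the GCN architectural VGPR limit.
uint32_t GetScratchVgpr(uint32_t offset, uint32_t& next_vgpr,
                        std::unordered_map<uint32_t, uint32_t>& vgpr_map) {
    const auto [it, is_new] = vgpr_map.try_emplace(offset);
    if (is_new) {
        assert(next_vgpr < 256 && "Out of VGPRs");
        it->second = next_vgpr++;
    }
    return it->second;
}
```

In the patch the map is shared across `Translator` instances for one shader and cleared at the end of `EmitPrologue`, so mappings never leak into the next shader's translation.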
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index 57d428a49..13f310cf8 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #pragma once
-#include
 #include
 #include
 #include
@@ -19,7 +18,6 @@
 #include "shader_recompiler/params.h"
 #include "shader_recompiler/profile.h"
 #include "shader_recompiler/runtime_info.h"
-#include "video_core/amdgpu/liverpool.h"
 #include "video_core/amdgpu/resource.h"
 
 namespace Shader {
@@ -37,21 +35,30 @@ enum class TextureType : u32 {
 };
 constexpr u32 NUM_TEXTURE_TYPES = 7;
 
+enum class BufferType : u32 {
+    Guest,
+    ReadConstUbo,
+    GdsBuffer,
+    SharedMemory,
+};
+
 struct Info;
 
 struct BufferResource {
     u32 sharp_idx;
     IR::Type used_types;
     AmdGpu::Buffer inline_cbuf;
-    bool is_gds_buffer{};
-    bool is_instance_data{};
+    BufferType buffer_type;
     u8 instance_attrib{};
     bool is_written{};
     bool is_formatted{};
 
-    [[nodiscard]] bool IsStorage(const AmdGpu::Buffer& buffer,
-                                 const Profile& profile) const noexcept {
-        return buffer.GetSize() > profile.max_ubo_size || is_written || is_gds_buffer;
+    bool IsSpecial() const noexcept {
+        return buffer_type != BufferType::Guest;
+    }
+
+    bool IsStorage(const AmdGpu::Buffer& buffer, const Profile& profile) const noexcept {
+        return buffer.GetSize() > profile.max_ubo_size || is_written;
     }
 
     [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Info& info) const noexcept;
@@ -193,10 +200,8 @@ struct Info {
     bool uses_unpack_10_11_11{};
     bool stores_tess_level_outer{};
     bool stores_tess_level_inner{};
-    bool translation_failed{}; // indicates that shader has unsupported instructions
-    bool has_emulated_shared_memory{};
+    bool translation_failed{};
     bool has_readconst{};
-    u32 shared_memory_size{};
     u8 mrt_mask{0u};
     bool has_fetch_shader{false};
     u32 fetch_shader_sgpr_base{0u};
@@ -233,10 +238,8 @@ struct Info {
     }
 
     void AddBindings(Backend::Bindings& bnd) const {
-        const auto total_buffers =
-            buffers.size() + (has_readconst ? 1 : 0) + (has_emulated_shared_memory ? 1 : 0);
-        bnd.buffer += total_buffers;
-        bnd.unified += total_buffers + images.size() + samplers.size();
+        bnd.buffer += buffers.size();
+        bnd.unified += buffers.size() + images.size() + samplers.size();
         bnd.user_data += ud_mask.NumRegs();
     }
 
@@ -283,14 +286,3 @@ constexpr AmdGpu::Image FMaskResource::GetSharp(const Info& info) const noexcept
 }
 
 } // namespace Shader
-
-template <>
-struct fmt::formatter<Shader::Stage> {
-    constexpr auto parse(format_parse_context& ctx) {
-        return ctx.begin();
-    }
-    auto format(const Shader::Stage stage, format_context& ctx) const {
-        constexpr static std::array names = {"fs", "vs", "gs", "es", "hs", "ls", "cs"};
-        return fmt::format_to(ctx.out(), "{}", names[static_cast<u32>(stage)]);
-    }
-};
diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h
index bcb2b44a9..5117f5650 100644
--- a/src/shader_recompiler/ir/attribute.h
+++ b/src/shader_recompiler/ir/attribute.h
@@ -69,16 +69,17 @@ enum class Attribute : u64 {
     SampleIndex = 72,
     GlobalInvocationId = 73,
     WorkgroupId = 74,
-    LocalInvocationId = 75,
-    LocalInvocationIndex = 76,
-    FragCoord = 77,
-    InstanceId0 = 78, // step rate 0
-    InstanceId1 = 79, // step rate 1
-    InvocationId = 80, // TCS id in output patch and instanced geometry shader id
-    PatchVertices = 81,
-    TessellationEvaluationPointU = 82,
-    TessellationEvaluationPointV = 83,
-    PackedHullInvocationInfo = 84, // contains patch id within the VGT and invocation ID
+    WorkgroupIndex = 75,
+    LocalInvocationId = 76,
+    LocalInvocationIndex = 77,
+    FragCoord = 78,
+    InstanceId0 = 79, // step rate 0
+    InstanceId1 = 80, // step rate 1
+    InvocationId = 81, // TCS id in output patch and instanced geometry shader id
+    PatchVertices = 82,
+    TessellationEvaluationPointU = 83,
+    TessellationEvaluationPointV = 84,
+    PackedHullInvocationInfo = 85, // contains patch id within the VGT and invocation ID
     Max,
 };
diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h
index 3c98579a0..69628dbfd 100644
--- a/src/shader_recompiler/ir/passes/ir_passes.h
+++ b/src/shader_recompiler/ir/passes/ir_passes.h
@@ -20,12 +20,14 @@ void FlattenExtendedUserdataPass(IR::Program& program);
 void ResourceTrackingPass(IR::Program& program);
 void CollectShaderInfoPass(IR::Program& program);
 void LowerBufferFormatToRaw(IR::Program& program);
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info);
 void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
                            Stage stage);
 void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);
 void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
-void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile);
+void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
+                             const Profile& profile);
+void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
+                               const Profile& profile);
 
 } // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
deleted file mode 100644
index 23963a991..000000000
--- a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <unordered_map>
-
-#include "shader_recompiler/ir/ir_emitter.h"
-#include "shader_recompiler/ir/program.h"
-
-namespace Shader::Optimization {
-
-static bool IsSharedMemoryInst(const IR::Inst& inst) {
-    const auto opcode = inst.GetOpcode();
-    return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 ||
-           opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64;
-}
-
-static u32 GetSharedMemImmOffset(const IR::Inst& inst) {
-    const auto* address = inst.Arg(0).InstRecursive();
-    ASSERT(address->GetOpcode() == IR::Opcode::IAdd32);
-    const auto ir_offset = address->Arg(1);
-    ASSERT_MSG(ir_offset.IsImmediate());
-    const auto offset = ir_offset.U32();
-    // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between
-    // each register to account for 4 bytes per register times 64 threads per group. Ensure that
-    // this assumption holds, as if it does not this approach may need to be revised.
-    ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
-    return offset;
-}
-
-static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) {
-    switch (inst.GetOpcode()) {
-    case IR::Opcode::LoadSharedU32:
-        inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr));
-        break;
-    case IR::Opcode::LoadSharedU64:
-        inst.ReplaceUsesWithAndRemove(
-            ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1)));
-        break;
-    case IR::Opcode::WriteSharedU32:
-        ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)});
-        inst.Invalidate();
-        break;
-    case IR::Opcode::WriteSharedU64: {
-        const auto value = inst.Arg(1);
-        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)});
-        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 1)});
-        inst.Invalidate();
-        break;
-    }
-    default:
-        UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode());
-    }
-}
-
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) {
-    u32 next_vgpr_num = runtime_info.num_allocated_vgprs;
-    std::unordered_map<u32, IR::VectorReg> vgpr_map;
-    const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) {
-        const auto [it, is_new] = vgpr_map.try_emplace(offset);
-        if (is_new) {
-            ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
-            const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
-            it->second = new_vgpr;
-        }
-        return it->second;
-    };
-
-    for (IR::Block* const block : program.blocks) {
-        for (IR::Inst& inst : block->Instructions()) {
-            if (!IsSharedMemoryInst(inst)) {
-                continue;
-            }
-            const auto offset = GetSharedMemImmOffset(inst);
-            const auto vgpr = get_vgpr(offset);
-            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-            ConvertSharedMemToVgpr(ir, inst, vgpr);
-        }
-    }
-}
-
-} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 029558d9e..c5bfe5796 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -78,7 +78,20 @@ bool IsDataRingInstruction(const IR::Inst& inst) {
 }
 
 IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
-    return IR::Type::U32;
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadBufferU8:
+    case IR::Opcode::StoreBufferU8:
+        return IR::Type::U8;
+    case IR::Opcode::LoadBufferU16:
+    case IR::Opcode::StoreBufferU16:
+        return IR::Type::U16;
+    case IR::Opcode::LoadBufferFormatF32:
+    case IR::Opcode::StoreBufferFormatF32:
+        // Formatted buffer loads can use a variety of types.
+        return IR::Type::U32 | IR::Type::F32 | IR::Type::U16 | IR::Type::U8;
+    default:
+        return IR::Type::U32;
+    }
 }
 
 bool IsImageAtomicInstruction(const IR::Inst& inst) {
@@ -121,11 +134,9 @@ public:
 
     u32 Add(const BufferResource& desc) {
         const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
-            // Only one GDS binding can exist.
-            if (desc.is_gds_buffer && existing.is_gds_buffer) {
-                return true;
-            }
-            return desc.sharp_idx == existing.sharp_idx && desc.inline_cbuf == existing.inline_cbuf;
+            return desc.sharp_idx == existing.sharp_idx &&
+                   desc.inline_cbuf == existing.inline_cbuf &&
+                   desc.buffer_type == existing.buffer_type;
         })};
         auto& buffer = buffer_resources[index];
         buffer.used_types |= desc.used_types;
@@ -272,6 +283,7 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
         .sharp_idx = std::numeric_limits<u32>::max(),
         .used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
         .inline_cbuf = cbuf,
+        .buffer_type = BufferType::Guest,
     });
 }
 
@@ -286,6 +298,7 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
     binding = descriptors.Add(BufferResource{
         .sharp_idx = sharp,
         .used_types = BufferDataType(inst, buffer.GetNumberFmt()),
+        .buffer_type = BufferType::Guest,
         .is_written = IsBufferStore(inst),
         .is_formatted = inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
                         inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
@@ -402,13 +415,10 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
 }
 
 void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
-    // Insert gds binding in the shader if it doesn't exist already.
-    // The buffer is used for append/consume counters.
-    constexpr static AmdGpu::Buffer GdsSharp{.base_address = 1};
     const u32 binding = descriptors.Add(BufferResource{
         .used_types = IR::Type::U32,
-        .inline_cbuf = GdsSharp,
-        .is_gds_buffer = true,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::GdsBuffer,
         .is_written = true,
     });
 
@@ -420,12 +430,12 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
     };
 
     // Attempt to deduce the GDS address of counter at compile time.
-    const u32 gds_addr = [&] {
-        const IR::Value& gds_offset = inst.Arg(0);
-        if (gds_offset.IsImmediate()) {
-            // Nothing to do, offset is known.
-            return gds_offset.U32() & 0xFFFF;
-        }
+    u32 gds_addr = 0;
+    const IR::Value& gds_offset = inst.Arg(0);
+    if (gds_offset.IsImmediate()) {
+        // Nothing to do, offset is known.
+        gds_addr = gds_offset.U32() & 0xFFFF;
+    } else {
         const auto result = IR::BreadthFirstSearch(&inst, pred);
         ASSERT_MSG(result, "Unable to track M0 source");
 
@@ -436,8 +446,8 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
         if (prod->GetOpcode() == IR::Opcode::IAdd32) {
             m0_val += prod->Arg(1).U32();
         }
-        return m0_val & 0xFFFF;
-    }();
+        gds_addr = m0_val & 0xFFFF;
+    }
 
     // Patch instruction.
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
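`BufferDataType` now returns per-opcode type flags, and `Descriptors::Add` ORs them into any existing resource with the same sharp, so a single binding accumulates every alias it is accessed through. A sketch of the flag accumulation (names hypothetical; the real `IR::Type` flag set lives in the IR headers):

```cpp
#include <cstdint>
#include <type_traits>

// Each opcode contributes a flag; repeated Add() calls OR them together.
enum class Type : uint32_t { U8 = 1u << 0, U16 = 1u << 1, U32 = 1u << 2, F32 = 1u << 3 };

constexpr Type operator|(Type a, Type b) {
    using U = std::underlying_type_t<Type>;
    return static_cast<Type>(static_cast<U>(a) | static_cast<U>(b));
}
constexpr Type operator&(Type a, Type b) {
    using U = std::underlying_type_t<Type>;
    return static_cast<Type>(static_cast<U>(a) & static_cast<U>(b));
}
constexpr bool True(Type t) {
    return static_cast<uint32_t>(t) != 0;
}

// A buffer first loaded as U32 and later stored as U8 carries both flags,
// so DefineBuffers() emits both a dword alias and a byte alias for it:
static_assert(True((Type::U32 | Type::U8) & Type::U8));
```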
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index f3a1fc9a8..219378a6c 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -74,7 +74,14 @@ void Visit(Info& info, const IR::Inst& inst) { info.uses_lane_id = true; break; case IR::Opcode::ReadConst: - info.has_readconst = true; + if (!info.has_readconst) { + info.buffers.push_back({ + .used_types = IR::Type::U32, + .inline_cbuf = AmdGpu::Buffer::Null(), + .buffer_type = BufferType::ReadConstUbo, + }); + info.has_readconst = true; + } break; case IR::Opcode::PackUfloat10_11_11: info.uses_pack_10_11_11 = true; @@ -88,10 +95,9 @@ void Visit(Info& info, const IR::Inst& inst) { } void CollectShaderInfoPass(IR::Program& program) { - Info& info{program.info}; for (IR::Block* const block : program.post_order_blocks) { for (IR::Inst& inst : block->Instructions()) { - Visit(info, inst); + Visit(program.info, inst); } } } diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp index ec7d7e986..0ee52cf19 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp @@ -8,37 +8,46 @@ namespace Shader::Optimization { +static bool IsLoadShared(const IR::Inst& inst) { + return inst.GetOpcode() == IR::Opcode::LoadSharedU32 || + inst.GetOpcode() == IR::Opcode::LoadSharedU64; +} + +static bool IsWriteShared(const IR::Inst& inst) { + return inst.GetOpcode() == IR::Opcode::WriteSharedU32 || + inst.GetOpcode() == IR::Opcode::WriteSharedU64; +} + +// Inserts barriers when a shared memory write and read occur in the same basic block. static void EmitBarrierInBlock(IR::Block* block) { - // This is inteded to insert a barrier when shared memory write and read - // occur in the same basic block. Also checks if branch depth is zero as - // we don't want to insert barrier in potentially divergent code. 
diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
index ec7d7e986..0ee52cf19 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
@@ -8,37 +8,46 @@
 
 namespace Shader::Optimization {
 
+static bool IsLoadShared(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
+           inst.GetOpcode() == IR::Opcode::LoadSharedU64;
+}
+
+static bool IsWriteShared(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
+           inst.GetOpcode() == IR::Opcode::WriteSharedU64;
+}
+
+// Inserts barriers when a shared memory write and read occur in the same basic block.
 static void EmitBarrierInBlock(IR::Block* block) {
-    // This is inteded to insert a barrier when shared memory write and read
-    // occur in the same basic block. Also checks if branch depth is zero as
-    // we don't want to insert barrier in potentially divergent code.
-    bool emit_barrier_on_write = false;
-    bool emit_barrier_on_read = false;
-    const auto emit_barrier = [block](bool& emit_cond, IR::Inst& inst) {
-        if (emit_cond) {
-            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-            ir.Barrier();
-            emit_cond = false;
-        }
+    enum class BarrierAction : u32 {
+        None,
+        BarrierOnWrite,
+        BarrierOnRead,
     };
+    BarrierAction action{};
     for (IR::Inst& inst : block->Instructions()) {
-        if (inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
-            inst.GetOpcode() == IR::Opcode::LoadSharedU64) {
-            emit_barrier(emit_barrier_on_read, inst);
-            emit_barrier_on_write = true;
+        if (IsLoadShared(inst)) {
+            if (action == BarrierAction::BarrierOnRead) {
+                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+                ir.Barrier();
+            }
+            action = BarrierAction::BarrierOnWrite;
+            continue;
         }
-        if (inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
-            inst.GetOpcode() == IR::Opcode::WriteSharedU64) {
-            emit_barrier(emit_barrier_on_write, inst);
-            emit_barrier_on_read = true;
+        if (IsWriteShared(inst)) {
+            if (action == BarrierAction::BarrierOnWrite) {
+                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+                ir.Barrier();
+            }
+            action = BarrierAction::BarrierOnRead;
         }
     }
 }
 
+// Inserts a barrier after divergent conditional blocks to avoid undefined
+// behavior when some threads write and others read from shared memory.
 static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
-    // Insert a barrier after divergent conditional blocks.
-    // This avoids potential softlocks and crashes when some threads
-    // initialize shared memory and others read from it.
     const IR::U1 cond = data.if_node.cond;
     const auto insert_barrier =
         IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
@@ -56,8 +65,21 @@ static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
     }
 }
 
-void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
-    if (!program.info.uses_shared || !profile.needs_lds_barriers) {
+static constexpr u32 GcnSubgroupSize = 64;
+
+void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
+                             const Profile& profile) {
+    if (program.info.stage != Stage::Compute) {
+        return;
+    }
+    const auto& cs_info = runtime_info.cs_info;
+    const u32 shared_memory_size = cs_info.shared_memory_size;
+    const u32 threadgroup_size =
+        cs_info.workgroup_size[0] * cs_info.workgroup_size[1] * cs_info.workgroup_size[2];
+    // The compiler can only omit barriers when the local workgroup size is the same as the HW
+    // subgroup.
+    if (shared_memory_size == 0 || threadgroup_size != GcnSubgroupSize ||
+        !profile.needs_lds_barriers) {
         return;
     }
     using Type = IR::AbstractSyntaxNode::Type;
@@ -67,6 +89,8 @@ void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
             --branch_depth;
             continue;
         }
+        // Check if branch depth is zero, we don't want to insert barrier in potentially divergent
+        // code.
        if (node.type == Type::If && branch_depth++ == 0) {
             EmitBarrierInMergeBlock(node.data);
             continue;
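EmitBarrierInBlock above is a two-state machine over the instruction stream: a write arms a barrier before the next read, and a read arms one before the next write. The same logic over a mocked token sequence, as a self-contained sketch:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

enum class Op { Read, Write, Other };
enum class BarrierAction { None, BarrierOnWrite, BarrierOnRead };

// Returns the indices where a barrier would be inserted, mirroring the
// read/write alternation in EmitBarrierInBlock.
std::vector<std::size_t> PlaceBarriers(const std::vector<Op>& ops) {
    std::vector<std::size_t> barriers;
    BarrierAction action{BarrierAction::None};
    for (std::size_t i = 0; i < ops.size(); ++i) {
        if (ops[i] == Op::Read) {
            if (action == BarrierAction::BarrierOnRead) {
                barriers.push_back(i); // a write happened earlier: sync before reading
            }
            action = BarrierAction::BarrierOnWrite;
        } else if (ops[i] == Op::Write) {
            if (action == BarrierAction::BarrierOnWrite) {
                barriers.push_back(i); // a read happened earlier: sync before overwriting
            }
            action = BarrierAction::BarrierOnRead;
        }
    }
    return barriers;
}

int main() {
    // write, read, other, write -> barriers before indices 1 and 3
    const auto where = PlaceBarriers({Op::Write, Op::Read, Op::Other, Op::Write});
    for (const auto i : where) {
        std::printf("barrier before instruction %zu\n", i);
    }
}
```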
diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
new file mode 100644
index 000000000..25aaf257c
--- /dev/null
+++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
@@ -0,0 +1,117 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/profile.h"
+
+namespace Shader::Optimization {
+
+static bool IsSharedAccess(const IR::Inst& inst) {
+    const auto opcode = inst.GetOpcode();
+    switch (opcode) {
+    case IR::Opcode::LoadSharedU32:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::WriteSharedU64:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicXor32:
+        return true;
+    default:
+        return false;
+    }
+}
+
+void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
+                               const Profile& profile) {
+    if (program.info.stage != Stage::Compute) {
+        return;
+    }
+    // Only perform the transform if the host shared memory is insufficient.
+    const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
+    if (shared_memory_size <= profile.max_shared_memory_size) {
+        return;
+    }
+    // Add buffer binding for shared memory storage buffer.
+    const u32 binding = static_cast<u32>(program.info.buffers.size());
+    program.info.buffers.push_back({
+        .used_types = IR::Type::U32,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            const IR::U32 handle = ir.Imm32(binding);
+            // Replace shared atomics first
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::SharedAtomicAnd32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicIAdd32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicOr32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicOr(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMax(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMin(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicXor32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicXor(handle, inst.Arg(0), inst.Arg(1), {}));
+                continue;
+            default:
+                break;
+            }
+            // Replace shared operations.
+            const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
+                                           ir.Imm32(shared_memory_size));
+            const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU32:
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
+                break;
+            case IR::Opcode::LoadSharedU64:
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {}));
+                break;
+            case IR::Opcode::WriteSharedU32:
+                ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
+                inst.Invalidate();
+                break;
+            case IR::Opcode::WriteSharedU64:
+                ir.StoreBufferU32(2, handle, address, inst.Arg(1), {});
+                inst.Invalidate();
+                break;
+            default:
+                break;
+            }
+        }
+    }
+}
+
+} // namespace Shader::Optimization
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index 5a6d1d775..1c132ebbb 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -65,10 +65,6 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
 
     // Run optimization passes
     const auto stage = program.info.stage;
-    if (stage == Stage::Fragment) {
-        // Before SSA pass, as it will rewrite to VGPR load/store.
-        Shader::Optimization::LowerSharedMemToRegisters(program, runtime_info);
-    }
     Shader::Optimization::SsaRewritePass(program.post_order_blocks);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
     if (info.l_stage == LogicalStage::TessellationControl) {
@@ -90,11 +86,12 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);
     Shader::Optimization::LowerBufferFormatToRaw(program);
+    Shader::Optimization::SharedMemoryToStoragePass(program, runtime_info, profile);
+    Shader::Optimization::SharedMemoryBarrierPass(program, runtime_info, profile);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
     Shader::Optimization::DeadCodeEliminationPass(program);
     Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
     Shader::Optimization::CollectShaderInfoPass(program);
-    Shader::Optimization::SharedMemoryBarrierPass(program, profile);
 
     return program;
 }
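When SharedMemoryToStoragePass fires, every workgroup gets its own disjoint slice of the backing storage buffer, so a shared-memory offset is rebased as workgroup_index * shared_memory_size + offset. The arithmetic as a plain constexpr function (a sketch; sizes hypothetical):

```cpp
#include <cstdint>

// Each workgroup owns a disjoint slice of the SSBO that emulates LDS.
constexpr std::uint32_t LdsAddress(std::uint32_t workgroup_index, std::uint32_t shared_memory_size,
                                   std::uint32_t offset) {
    return workgroup_index * shared_memory_size + offset;
}

// Two workgroups with 64 KiB of emulated LDS each never alias:
static_assert(LdsAddress(0, 64 * 1024, 0) == 0);
static_assert(LdsAddress(1, 64 * 1024, 0) == 64 * 1024);
static_assert(LdsAddress(1, 64 * 1024, 16) == 65552);
```

The total allocation the rasterizer must map is therefore shared_memory_size times the number of dispatched workgroups, which is where the new Liverpool::NumWorkgroups() helper below comes in.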
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 78973c2d4..517392b98 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -167,6 +167,17 @@ enum class MrtSwizzle : u8 {
 };
 static constexpr u32 MaxColorBuffers = 8;
 
+struct PsColorBuffer {
+    AmdGpu::NumberFormat num_format : 4;
+    AmdGpu::NumberConversion num_conversion : 2;
+    AmdGpu::Liverpool::ShaderExportFormat export_format : 4;
+    u32 needs_unorm_fixup : 1;
+    u32 pad : 21;
+    AmdGpu::CompMapping swizzle;
+
+    auto operator<=>(const PsColorBuffer&) const noexcept = default;
+};
+
 struct FragmentRuntimeInfo {
     struct PsInput {
         u8 param_index;
@@ -184,15 +195,6 @@ struct FragmentRuntimeInfo {
     AmdGpu::Liverpool::PsInput addr_flags;
     u32 num_inputs;
     std::array<PsInput, 32> inputs;
-    struct PsColorBuffer {
-        AmdGpu::NumberFormat num_format;
-        AmdGpu::NumberConversion num_conversion;
-        AmdGpu::CompMapping swizzle;
-        AmdGpu::Liverpool::ShaderExportFormat export_format;
-        bool needs_unorm_fixup;
-
-        auto operator<=>(const PsColorBuffer&) const noexcept = default;
-    };
     std::array<PsColorBuffer, MaxColorBuffers> color_buffers;
 
     bool operator==(const FragmentRuntimeInfo& other) const noexcept {
@@ -264,3 +266,14 @@ struct RuntimeInfo {
 };
 
 } // namespace Shader
+
+template <>
+struct fmt::formatter<Shader::Stage> {
+    constexpr auto parse(format_parse_context& ctx) {
+        return ctx.begin();
+    }
+    auto format(const Shader::Stage stage, format_context& ctx) const {
+        constexpr static std::array names = {"fs", "vs", "gs", "es", "hs", "ls", "cs"};
+        return fmt::format_to(ctx.out(), "{}", names[static_cast<u32>(stage)]);
+    }
+};
diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h
index 9bf9e71e4..1c3bfc60a 100644
--- a/src/shader_recompiler/specialization.h
+++ b/src/shader_recompiler/specialization.h
@@ -98,12 +98,6 @@ struct StageSpecialization {
             });
         }
         u32 binding{};
-        if (info->has_emulated_shared_memory) {
-            binding++;
-        }
-        if (info->has_readconst) {
-            binding++;
-        }
         ForEachSharp(binding, buffers, info->buffers,
                      [profile_](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
                          spec.stride = sharp.GetStride();
@@ -195,18 +189,6 @@ struct StageSpecialization {
             }
         }
         u32 binding{};
-        if (info->has_emulated_shared_memory != other.info->has_emulated_shared_memory) {
-            return false;
-        }
-        if (info->has_readconst != other.info->has_readconst) {
-            return false;
-        }
-        if (info->has_emulated_shared_memory) {
-            binding++;
-        }
-        if (info->has_readconst) {
-            binding++;
-        }
         for (u32 i = 0; i < buffers.size(); i++) {
             if (other.bitset[binding++] && buffers[i] != other.buffers[i]) {
                 return false;
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index 525a0c9f1..5b9b647eb 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -197,6 +197,10 @@ struct Liverpool {
             return settings.lds_dwords.Value() * 128 * 4;
         }
 
+        u32 NumWorkgroups() const noexcept {
+            return dim_x * dim_y * dim_z;
+        }
+
         bool IsTgidEnabled(u32 i) const noexcept {
             return (settings.tgid_enable.Value() >> i) & 1;
         }
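The relocated PsColorBuffer above packs its format fields into a single 32-bit word (4 + 2 + 4 + 1 + 21 bits) followed by a 4-byte swizzle, keeping the pipeline key compact and trivially comparable via the defaulted three-way operator. A sketch of the same packing with stand-in enums; bit-field layout is formally implementation-defined, but mainstream ABIs pack this into 8 bytes:

```cpp
#include <compare>
#include <cstdint>

enum class NumberFormat : std::uint32_t { Unorm, Snorm, Uint, Sint, Float };
enum class NumberConversion : std::uint32_t { None, UintToUscaled, SintToSscaled, UnormToUbnorm };
enum class ExportFormat : std::uint32_t { Zero, Fp16Abgr, Fp32Abgr };
enum class CompSwizzle : std::uint8_t { Zero = 0, One = 1, Red = 4, Green = 5, Blue = 6, Alpha = 7 };

struct CompMapping {
    CompSwizzle r, g, b, a;
    auto operator<=>(const CompMapping&) const noexcept = default;
};

struct PsColorBuffer {
    NumberFormat num_format : 4;
    NumberConversion num_conversion : 2;
    ExportFormat export_format : 4;
    std::uint32_t needs_unorm_fixup : 1;
    std::uint32_t pad : 21; // 4 + 2 + 4 + 1 + 21 == 32 bits
    CompMapping swizzle;    // four u8 swizzles == 4 bytes

    auto operator<=>(const PsColorBuffer&) const noexcept = default;
};

static_assert(sizeof(PsColorBuffer) == 8);
```

Shrinking CompSwizzle to u8 in types.h below is what makes the 4-byte swizzle possible.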
diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h
index fa8edb3e2..64a85c812 100644
--- a/src/video_core/amdgpu/resource.h
+++ b/src/video_core/amdgpu/resource.h
@@ -31,6 +31,12 @@ struct Buffer {
     u32 _padding1 : 6;
     u32 type : 2; // overlaps with T# type, so should be 0 for buffer
 
+    static constexpr Buffer Null() {
+        Buffer buffer{};
+        buffer.base_address = 1;
+        return buffer;
+    }
+
     bool Valid() const {
         return type == 0u;
     }
diff --git a/src/video_core/amdgpu/types.h b/src/video_core/amdgpu/types.h
index ee2dda494..d991e0abd 100644
--- a/src/video_core/amdgpu/types.h
+++ b/src/video_core/amdgpu/types.h
@@ -183,7 +183,7 @@ enum class NumberFormat : u32 {
     Ubscaled = 13,
 };
 
-enum class CompSwizzle : u32 {
+enum class CompSwizzle : u8 {
     Zero = 0,
     One = 1,
     Red = 4,
@@ -193,10 +193,10 @@ enum class CompSwizzle : u32 {
 };
 
 enum class NumberConversion : u32 {
-    None,
-    UintToUscaled,
-    SintToSscaled,
-    UnormToUbnorm,
+    None = 0,
+    UintToUscaled = 1,
+    SintToSscaled = 2,
+    UnormToUbnorm = 3,
 };
 
 struct CompMapping {
diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h
index ec92a0ebf..188b4b2ca 100644
--- a/src/video_core/buffer_cache/buffer.h
+++ b/src/video_core/buffer_cache/buffer.h
@@ -168,7 +168,7 @@ public:
     void Commit();
 
     /// Maps and commits a memory region with user provided data
-    u64 Copy(VAddr src, size_t size, size_t alignment = 0) {
+    u64 Copy(auto src, size_t size, size_t alignment = 0) {
         const auto [data, offset] = Map(size, alignment);
         std::memcpy(data, reinterpret_cast<const void*>(src), size);
         Commit();
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 37af62f30..ccb45c095 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -5,11 +5,8 @@
 #include "common/alignment.h"
 #include "common/scope_exit.h"
 #include "common/types.h"
-#include "shader_recompiler/frontend/fetch_shader.h"
-#include "shader_recompiler/info.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/renderer_vulkan/liverpool_to_vk.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -18,8 +15,8 @@ namespace VideoCore {
 
 static constexpr size_t DataShareBufferSize = 64_KB;
-static constexpr size_t StagingBufferSize = 1_GB;
-static constexpr size_t UboStreamBufferSize = 64_MB;
+static constexpr size_t StagingBufferSize = 512_MB;
+static constexpr size_t UboStreamBufferSize = 128_MB;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
                          AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
@@ -29,10 +26,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
       gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
-      lds_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, DataShareBufferSize},
       memory_tracker{&tracker} {
     Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
-    Vulkan::SetObjectName(instance.GetDevice(), lds_buffer.Handle(), "LDS Buffer");
 
     // Ensure the first slot is used for the null buffer
     const auto null_id =
@@ -251,14 +246,6 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
     });
 }
 
-std::pair<Buffer*, u32> BufferCache::ObtainHostUBO(std::span<const u32> data) {
-    static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
-    ASSERT(data.size_bytes() <= StreamThreshold);
-    const u64 offset = stream_buffer.Copy(reinterpret_cast<VAddr>(data.data()), data.size_bytes(),
-                                          instance.UniformMinAlignment());
-    return {&stream_buffer, offset};
-}
-
 std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
                                                   bool is_texel_buffer, BufferId buffer_id) {
     // For small uniform buffers that have not been modified by gpu
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 088c22c12..71a6bed2a 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -68,9 +68,9 @@ public:
         return &gds_buffer;
     }
 
-    /// Returns a pointer to LDS device local buffer.
-    [[nodiscard]] const Buffer* GetLdsBuffer() const noexcept {
-        return &lds_buffer;
+    /// Retrieves the host visible device local stream buffer.
+    [[nodiscard]] StreamBuffer& GetStreamBuffer() noexcept {
+        return stream_buffer;
     }
 
     /// Retrieves the buffer with the specified id.
@@ -90,8 +90,6 @@ public:
     /// Writes a value to GPU buffer.
     void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
 
-    [[nodiscard]] std::pair<Buffer*, u32> ObtainHostUBO(std::span<const u32> data);
-
     /// Obtains a buffer for the specified region.
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
                                                        bool is_texel_buffer = false,
@@ -159,7 +157,6 @@ private:
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
     Buffer gds_buffer;
-    Buffer lds_buffer;
     std::shared_mutex mutex;
     Common::SlotVector<Buffer> slot_buffers;
     RangeSet gpu_modified_ranges;
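The Copy change in buffer.h swaps the VAddr parameter for an abbreviated auto template, so callers can pass either a raw address or a typed pointer without casting at every call site (the rasterizer below passes flattened_ud_buf.data() directly). A reduced sketch of the idea, with Map/Commit replaced by a plain byte vector:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

using VAddr = std::uintptr_t;

struct StreamBuffer {
    std::vector<std::byte> storage = std::vector<std::byte>(1024);
    std::size_t cursor = 0;

    // Accepts any pointer-like argument: a VAddr integer or a T*.
    std::uint64_t Copy(auto src, std::size_t size) {
        const std::uint64_t offset = cursor;
        std::memcpy(storage.data() + cursor, reinterpret_cast<const void*>(src), size);
        cursor += size;
        return offset;
    }
};

int main() {
    StreamBuffer buffer;
    const std::uint32_t words[4] = {1, 2, 3, 4};
    buffer.Copy(words, sizeof(words));                          // pointer argument
    buffer.Copy(reinterpret_cast<VAddr>(words), sizeof(words)); // integer address argument
}
```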
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index f0346559d..f6216f54f 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -3,11 +3,9 @@
 
 #include <boost/container/small_vector.hpp>
 
-#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_compute_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
-#include "video_core/texture_cache/texture_cache.h"
 
 namespace Vulkan {
 
@@ -29,23 +27,6 @@ ComputePipeline::ComputePipeline(const Instance& instance, Scheduler& scheduler,
 
     u32 binding{};
     boost::container::small_vector<vk::DescriptorSetLayoutBinding, 32> bindings;
-
-    if (info->has_emulated_shared_memory) {
-        bindings.push_back({
-            .binding = binding++,
-            .descriptorType = vk::DescriptorType::eStorageBuffer,
-            .descriptorCount = 1,
-            .stageFlags = vk::ShaderStageFlagBits::eCompute,
-        });
-    }
-    if (info->has_readconst) {
-        bindings.push_back({
-            .binding = binding++,
-            .descriptorType = vk::DescriptorType::eUniformBuffer,
-            .descriptorCount = 1,
-            .stageFlags = vk::ShaderStageFlagBits::eCompute,
-        });
-    }
     for (const auto& buffer : info->buffers) {
         const auto sharp = buffer.GetSharp(*info);
         bindings.push_back({
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 4eecd1edf..2c432e9bf 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -7,18 +7,13 @@
 #include
 
 #include "common/assert.h"
-#include "common/io_file.h"
 #include "shader_recompiler/backend/spirv/emit_spirv_quad_rect.h"
 #include "shader_recompiler/frontend/fetch_shader.h"
-#include "shader_recompiler/runtime_info.h"
 #include "video_core/amdgpu/resource.h"
-#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
-#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/renderer_vulkan/vk_shader_util.h"
-#include "video_core/texture_cache/texture_cache.h"
 
 namespace Vulkan {
 
@@ -357,14 +352,6 @@ void GraphicsPipeline::BuildDescSetLayout() {
         if (!stage) {
             continue;
         }
-        if (stage->has_readconst) {
-            bindings.push_back({
-                .binding = binding++,
-                .descriptorType = vk::DescriptorType::eUniformBuffer,
-                .descriptorCount = 1,
-                .stageFlags = gp_stage_flags,
-            });
-        }
         for (const auto& buffer : stage->buffers) {
             const auto sharp = buffer.GetSharp(*stage);
             bindings.push_back({
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
index 64cc761f4..e6596db2f 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@@ -35,8 +35,7 @@ struct GraphicsPipelineKey {
     std::array<size_t, MaxShaderStages> stage_hashes;
     u32 num_color_attachments;
     std::array<vk::Format, Liverpool::NumColorBuffers> color_formats;
-    std::array<Shader::FragmentRuntimeInfo::PsColorBuffer, Liverpool::NumColorBuffers>
-        color_buffers;
+    std::array<Shader::PsColorBuffer, Liverpool::NumColorBuffers> color_buffers;
     vk::Format depth_format;
     vk::Format stencil_format;
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index 780779c0b..a17f8c9c2 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -1,14 +1,11 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#include
-#include
 #include
 #include
 #include
 
 #include "common/assert.h"
-#include "common/config.h"
 #include "common/debug.h"
 #include "sdl_window.h"
 #include "video_core/renderer_vulkan/liverpool_to_vk.h"
@@ -206,13 +203,12 @@ std::string Instance::GetDriverVersionName() {
 }
 
 bool Instance::CreateDevice() {
-    const vk::StructureChain feature_chain =
-        physical_device
-            .getFeatures2<vk::PhysicalDeviceFeatures2, vk::PhysicalDeviceVulkan11Features,
-                          vk::PhysicalDeviceVulkan12Features,
-                          vk::PhysicalDeviceRobustness2FeaturesEXT,
-                          vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT,
-                          vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT,
-                          vk::PhysicalDevicePortabilitySubsetFeaturesKHR>();
+    const vk::StructureChain feature_chain = physical_device.getFeatures2<
+        vk::PhysicalDeviceFeatures2, vk::PhysicalDeviceVulkan11Features,
+        vk::PhysicalDeviceVulkan12Features, vk::PhysicalDeviceRobustness2FeaturesEXT,
+        vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT,
+        vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT,
+        vk::PhysicalDevicePortabilitySubsetFeaturesKHR>();
     features = feature_chain.get<vk::PhysicalDeviceFeatures2>().features;
 #ifdef __APPLE__
     portability_features = feature_chain.get<vk::PhysicalDevicePortabilitySubsetFeaturesKHR>();
@@ -319,6 +315,7 @@ bool Instance::CreateDevice() {
     const auto topology_list_restart_features =
         feature_chain.get<vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT>();
+    const auto vk11_features = feature_chain.get<vk::PhysicalDeviceVulkan11Features>();
     const auto vk12_features = feature_chain.get<vk::PhysicalDeviceVulkan12Features>();
     vk::StructureChain device_chain = {
         vk::DeviceCreateInfo{
@@ -351,12 +348,17 @@ bool Instance::CreateDevice() {
         },
         vk::PhysicalDeviceVulkan11Features{
-            .shaderDrawParameters = true,
+            .storageBuffer16BitAccess = vk11_features.storageBuffer16BitAccess,
+            .uniformAndStorageBuffer16BitAccess = vk11_features.uniformAndStorageBuffer16BitAccess,
+            .shaderDrawParameters = vk11_features.shaderDrawParameters,
         },
         vk::PhysicalDeviceVulkan12Features{
             .samplerMirrorClampToEdge = vk12_features.samplerMirrorClampToEdge,
             .drawIndirectCount = vk12_features.drawIndirectCount,
+            .storageBuffer8BitAccess = vk12_features.storageBuffer8BitAccess,
+            .uniformAndStorageBuffer8BitAccess = vk12_features.uniformAndStorageBuffer8BitAccess,
             .shaderFloat16 = vk12_features.shaderFloat16,
+            .shaderInt8 = vk12_features.shaderInt8,
             .scalarBlockLayout = vk12_features.scalarBlockLayout,
             .uniformBufferStandardLayout = vk12_features.uniformBufferStandardLayout,
             .separateDepthStencilLayouts = vk12_features.separateDepthStencilLayouts,
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index f7afd2e75..6ac7f7e43 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -345,12 +345,12 @@ bool PipelineCache::RefreshGraphicsKey() {
 
         key.color_formats[remapped_cb] =
             LiverpoolToVK::SurfaceFormat(col_buf.GetDataFmt(), col_buf.GetNumberFmt());
-        key.color_buffers[remapped_cb] = {
+        key.color_buffers[remapped_cb] = Shader::PsColorBuffer{
             .num_format = col_buf.GetNumberFmt(),
             .num_conversion = col_buf.GetNumberConversion(),
-            .swizzle = col_buf.Swizzle(),
             .export_format = regs.color_export_format.GetFormat(cb),
             .needs_unorm_fixup = needs_unorm_fixup,
+            .swizzle = col_buf.Swizzle(),
         };
     }
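The vk_pipeline_cache.cpp hunk moves .swizzle to the end of the initializer because C++20 designated initializers must appear in member declaration order, and the packed PsColorBuffer now declares swizzle last. A minimal illustration of the rule:

```cpp
struct PsColorBufferLike {
    unsigned num_format : 4;
    unsigned needs_unorm_fixup : 1;
    unsigned char swizzle; // declared last
};

// OK: designators appear in declaration order.
constexpr PsColorBufferLike good{
    .num_format = 1,
    .needs_unorm_fixup = 0,
    .swizzle = 4,
};

// Ill-formed in C++20: .swizzle designated before .needs_unorm_fixup.
// constexpr PsColorBufferLike bad{
//     .num_format = 1,
//     .swizzle = 4,
//     .needs_unorm_fixup = 0,
// };
```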
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index ac6aac7b3..816f149b0 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -19,6 +19,20 @@
 
 namespace Vulkan {
 
+static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
+    Shader::PushData push_data{};
+    push_data.step0 = regs.vgt_instance_step_rate_0;
+    push_data.step1 = regs.vgt_instance_step_rate_1;
+
+    // TODO(roamic): Add support for multiple viewports and geometry shaders when ViewportIndex
+    // is encountered and implemented in the recompiler.
+    push_data.xoffset = regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f;
+    push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f;
+    push_data.yoffset = regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f;
+    push_data.yscale = regs.viewport_control.yscale_enable ? regs.viewports[0].yscale : 1.f;
+    return push_data;
+}
+
 Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
                        AmdGpu::Liverpool* liverpool_)
     : instance{instance_}, scheduler{scheduler_}, page_manager{this},
@@ -426,95 +440,69 @@ void Rasterizer::Finish() {
 }
 
 bool Rasterizer::BindResources(const Pipeline* pipeline) {
-    buffer_infos.clear();
-    buffer_views.clear();
-    image_infos.clear();
-
-    const auto& regs = liverpool->regs;
-
-    if (pipeline->IsCompute()) {
-        const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute);
-
-        // Assume if a shader reads and writes metas at the same time, it is a copy shader.
-        bool meta_read = false;
-        for (const auto& desc : info.buffers) {
-            if (desc.is_gds_buffer) {
-                continue;
-            }
-            if (!desc.is_written) {
-                const VAddr address = desc.GetSharp(info).base_address;
-                meta_read = texture_cache.IsMeta(address);
-            }
-        }
-
-        // Most of the time when a metadata is updated with a shader it gets cleared. It means
-        // we can skip the whole dispatch and update the tracked state instead. Also, it is not
-        // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we
-        // will need its full emulation anyways. For cases of metadata read a warning will be
-        // logged.
-        if (!meta_read) {
-            for (const auto& desc : info.buffers) {
-                const auto sharp = desc.GetSharp(info);
-                const VAddr address = sharp.base_address;
-                if (desc.is_written) {
-                    // Assume all slices were updates
-                    if (texture_cache.ClearMeta(address)) {
-                        LOG_TRACE(Render_Vulkan, "Metadata update skipped");
-                        return false;
-                    }
-                } else {
-                    if (texture_cache.IsMeta(address)) {
-                        LOG_WARNING(Render_Vulkan,
-                                    "Unexpected metadata read by a CS shader (buffer)");
-                    }
-                }
-            }
-        }
+    if (IsComputeMetaClear(pipeline)) {
+        return false;
     }
 
     set_writes.clear();
     buffer_barriers.clear();
+    buffer_infos.clear();
+    buffer_views.clear();
+    image_infos.clear();
 
     // Bind resource buffers and textures.
-    Shader::PushData push_data{};
     Shader::Backend::Bindings binding{};
-
+    Shader::PushData push_data = MakeUserData(liverpool->regs);
     for (const auto* stage : pipeline->GetStages()) {
         if (!stage) {
             continue;
         }
-        push_data.step0 = regs.vgt_instance_step_rate_0;
-        push_data.step1 = regs.vgt_instance_step_rate_1;
-
-        // TODO(roamic): add support for multiple viewports and geometry shaders when ViewportIndex
-        // is encountered and implemented in the recompiler.
-        if (stage->stage == Shader::Stage::Vertex) {
-            push_data.xoffset =
-                regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f;
-            push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f;
-            push_data.yoffset =
-                regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f;
-            push_data.yscale = regs.viewport_control.yscale_enable ? regs.viewports[0].yscale : 1.f;
-        }
         stage->PushUd(binding, push_data);
-
-        BindBuffers(*stage, binding, push_data, set_writes, buffer_barriers);
-        BindTextures(*stage, binding, set_writes);
+        BindBuffers(*stage, binding, push_data);
+        BindTextures(*stage, binding);
     }
 
     pipeline->BindResources(set_writes, buffer_barriers, push_data);
-
     return true;
 }
 
+bool Rasterizer::IsComputeMetaClear(const Pipeline* pipeline) {
+    if (!pipeline->IsCompute()) {
+        return false;
+    }
+
+    const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute);
+
+    // Assume if a shader reads and writes metas at the same time, it is a copy shader.
+    for (const auto& desc : info.buffers) {
+        const VAddr address = desc.GetSharp(info).base_address;
+        if (!desc.IsSpecial() && !desc.is_written && texture_cache.IsMeta(address)) {
+            return false;
+        }
+    }
+
+    // Most of the time when a metadata is updated with a shader it gets cleared. It means
+    // we can skip the whole dispatch and update the tracked state instead. Also, it is not
+    // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we
+    // will need its full emulation anyways.
+    for (const auto& desc : info.buffers) {
+        const VAddr address = desc.GetSharp(info).base_address;
+        if (!desc.IsSpecial() && desc.is_written && texture_cache.ClearMeta(address)) {
+            // Assume all slices were updates
+            LOG_TRACE(Render_Vulkan, "Metadata update skipped");
+            return true;
+        }
+    }
+    return false;
+}
+
 void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding,
-                             Shader::PushData& push_data, Pipeline::DescriptorWrites& set_writes,
-                             Pipeline::BufferBarriers& buffer_barriers) {
+                             Shader::PushData& push_data) {
     buffer_bindings.clear();
 
     for (const auto& desc : stage.buffers) {
         const auto vsharp = desc.GetSharp(stage);
-        if (!desc.is_gds_buffer && vsharp.base_address != 0 && vsharp.GetSize() > 0) {
+        if (!desc.IsSpecial() && vsharp.base_address != 0 && vsharp.GetSize() > 0) {
            const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, vsharp.GetSize());
            buffer_bindings.emplace_back(buffer_id, vsharp);
        } else {
@@ -522,47 +510,30 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
         }
     }
 
-    // Bind a SSBO to act as shared memory in case of not being able to use a workgroup buffer
-    // (e.g. when the compute shared memory is bigger than the GPU's shared memory)
-    if (stage.has_emulated_shared_memory) {
-        const auto* lds_buf = buffer_cache.GetLdsBuffer();
-        buffer_infos.emplace_back(lds_buf->Handle(), 0, lds_buf->SizeBytes());
-        set_writes.push_back({
-            .dstSet = VK_NULL_HANDLE,
-            .dstBinding = binding.unified++,
-            .dstArrayElement = 0,
-            .descriptorCount = 1,
-            .descriptorType = vk::DescriptorType::eStorageBuffer,
-            .pBufferInfo = &buffer_infos.back(),
-        });
-        ++binding.buffer;
-    }
-
-    // Bind the flattened user data buffer as a UBO so it's accessible to the shader
-    if (stage.has_readconst) {
-        const auto [vk_buffer, offset] = buffer_cache.ObtainHostUBO(stage.flattened_ud_buf);
-        buffer_infos.emplace_back(vk_buffer->Handle(), offset,
-                                  stage.flattened_ud_buf.size() * sizeof(u32));
-        set_writes.push_back({
-            .dstSet = VK_NULL_HANDLE,
-            .dstBinding = binding.unified++,
-            .dstArrayElement = 0,
-            .descriptorCount = 1,
-            .descriptorType = vk::DescriptorType::eUniformBuffer,
-            .pBufferInfo = &buffer_infos.back(),
-        });
-        ++binding.buffer;
-    }
-
     // Second pass to re-bind buffers that were updated after binding
     for (u32 i = 0; i < buffer_bindings.size(); i++) {
         const auto& [buffer_id, vsharp] = buffer_bindings[i];
         const auto& desc = stage.buffers[i];
         const bool is_storage = desc.IsStorage(vsharp, pipeline_cache.GetProfile());
+        // Buffer is not from the cache, either a special buffer or unbound.
         if (!buffer_id) {
-            if (desc.is_gds_buffer) {
+            if (desc.buffer_type == Shader::BufferType::GdsBuffer) {
                 const auto* gds_buf = buffer_cache.GetGdsBuffer();
                 buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes());
+            } else if (desc.buffer_type == Shader::BufferType::ReadConstUbo) {
+                auto& vk_buffer = buffer_cache.GetStreamBuffer();
+                const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
+                const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size,
+                                                  instance.UniformMinAlignment());
+                buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size);
+            } else if (desc.buffer_type == Shader::BufferType::SharedMemory) {
+                auto& lds_buffer = buffer_cache.GetStreamBuffer();
+                const auto& cs_program = liverpool->GetCsRegs();
+                const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups();
+                const auto [data, offset] =
+                    lds_buffer.Map(lds_size, instance.StorageMinAlignment());
+                std::memset(data, 0, lds_size);
+                buffer_infos.emplace_back(lds_buffer.Handle(), offset, lds_size);
             } else if (instance.IsNullDescriptorSupported()) {
                 buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE);
             } else {
@@ -605,8 +576,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
         }
     }
 }
 
-void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding,
-                              Pipeline::DescriptorWrites& set_writes) {
+void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding) {
     image_bindings.clear();
 
     for (const auto& image_desc : stage.images) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index db458662c..292944a10 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -81,11 +81,9 @@ private:
     bool FilterDraw();
 
     void BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding,
-                     Shader::PushData& push_data, Pipeline::DescriptorWrites& set_writes,
-                     Pipeline::BufferBarriers& buffer_barriers);
+                     Shader::PushData& push_data);
 
-    void BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding,
-                      Pipeline::DescriptorWrites& set_writes);
+    void BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding);
 
     bool BindResources(const Pipeline* pipeline);
 
     void ResetBindings() {
@@ -95,6 +93,8 @@ private:
         bound_images.clear();
     }
 
+    bool IsComputeMetaClear(const Pipeline* pipeline);
+
 private:
     const Instance& instance;
     Scheduler& scheduler;
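IsComputeMetaClear, declared above and defined in vk_rasterizer.cpp, splits the old inline logic into two scans: bail out if the dispatch reads any tracked metadata surface (likely a copy shader that must actually run), then treat a metadata write whose clear succeeds as a skippable dispatch. The control flow with the texture cache mocked, as a sketch:

```cpp
#include <cstdint>
#include <vector>

using VAddr = std::uint64_t;

struct Desc {
    VAddr address;
    bool is_written;
    bool is_special;
};

struct MetaCacheMock {
    bool IsMeta(VAddr) const { return false; } // mock: no metadata tracked
    bool ClearMeta(VAddr) { return true; }     // mock: clear always succeeds
};

// Returns true when the whole dispatch can be skipped because it only
// clears metadata whose state the emulator already tracks.
bool IsComputeMetaClear(const std::vector<Desc>& buffers, MetaCacheMock& cache) {
    // A compute job that also reads metadata is likely a copy shader;
    // it must be executed for real.
    for (const auto& desc : buffers) {
        if (!desc.is_special && !desc.is_written && cache.IsMeta(desc.address)) {
            return false;
        }
    }
    // A successful clear means the tracked state was updated instead.
    for (const auto& desc : buffers) {
        if (!desc.is_special && desc.is_written && cache.ClearMeta(desc.address)) {
            return true;
        }
    }
    return false;
}
```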