diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index f2e6279f4..37d7eea35 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -303,6 +303,11 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses); ctx.AddExtension("SPV_KHR_physical_storage_buffer"); } + if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) { + ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout"); + ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR); + ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR); + } } void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index a342b47b6..13fd8e180 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/div_ceil.h" +#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" @@ -15,42 +17,40 @@ std::pair AtomicArgs(EmitContext& ctx) { Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; const auto [scope, semantics]{AtomicArgs(ctx)}; - return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value); + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value); + }); +} + +Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { + const Id shift_id{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)}; + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics, value); + }); } Id SharedAtomicU32_IncDec(EmitContext& ctx, Id offset, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + const 
Id pointer{ + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; const auto [scope, semantics]{AtomicArgs(ctx)}; - return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); -} - -Id BufferAtomicU32BoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a conditional branch to make sure that - // the atomic is not mistakenly executed when the index is out of bounds. - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size); - const Id ib_label = ctx.OpLabel(); - const Id oob_label = ctx.OpLabel(); - const Id end_label = ctx.OpLabel(); - ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); - ctx.OpBranchConditional(in_bounds, ib_label, oob_label); - ctx.AddLabel(ib_label); - const Id ib_result = emit_func(); - ctx.OpBranch(end_label); - ctx.AddLabel(oob_label); - const Id oob_result = ctx.u32_zero_value; - ctx.OpBranch(end_label); - ctx.AddLabel(end_label); - return ctx.OpPhi(ctx.U32[1], ib_result, ib_label, oob_result, oob_label); - } - // Bounds checking not enabled, just perform the atomic operation. - return emit_func(); + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); + }); } Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, @@ -63,7 +63,7 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); const auto [scope, semantics]{AtomicArgs(ctx)}; - return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] { + return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, value); }); } @@ -79,11 +79,26 @@ Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); const auto [scope, semantics]{AtomicArgs(ctx)}; - return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] { + return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, semantics, value, cmp_value); }); } +Id BufferAtomicU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { + const auto& buffer = ctx.buffers[handle]; + if (Sirit::ValidId(buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + } + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const auto [id, pointer_type] = buffer[EmitContext::PointerType::U64]; + const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<64>(ctx, index, buffer.size_qwords, [&] { + return (ctx.*atomic_func)(ctx.U64, ptr, scope, semantics, value); + }); +} + Id ImageAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& texture = ctx.images[handle & 0xFFFF]; @@ -105,6 +120,10 @@ Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, 
&Sirit::Module::OpAtomicIAdd); } +Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicIAdd); +} + Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax); } @@ -149,6 +168,10 @@ Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); } +Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); +} + Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h new file mode 100644 index 000000000..41e70c8c3 --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { + +template <u32 bit_size> +auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { + Id zero_value{}; + Id result_type{}; + if constexpr (bit_size == 64) { + zero_value = ctx.u64_zero_value; + result_type = ctx.U64; + } else if constexpr (bit_size == 32) { + zero_value = ctx.u32_zero_value; + result_type = ctx.U32[1]; + } else if constexpr (bit_size == 16) { + zero_value = ctx.u16_zero_value; + result_type = ctx.U16; + } else { + static_assert(false, "type not supported"); + } + if (Sirit::ValidId(buffer_size)) { + // Bounds checking enabled, wrap in a conditional branch to make sure that + // the atomic is not mistakenly executed when the index is out of bounds. + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size); + const Id ib_label = ctx.OpLabel(); + const Id end_label = ctx.OpLabel(); + ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); + ctx.OpBranchConditional(in_bounds, ib_label, end_label); + const auto last_label = ctx.last_label; + ctx.AddLabel(ib_label); + const auto ib_result = emit_func(); + ctx.OpBranch(end_label); + ctx.AddLabel(end_label); + if (Sirit::ValidId(ib_result)) { + return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, last_label); + } else { + return Id{0}; + } + } + // Bounds checking not enabled, just perform the atomic operation. 
+ return emit_func(); +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index b9707224c..3441c5a23 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -86,6 +86,7 @@ void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -120,11 +121,14 @@ Id EmitUndefU8(EmitContext& ctx); Id EmitUndefU16(EmitContext& ctx); Id EmitUndefU32(EmitContext& ctx); Id EmitUndefU64(EmitContext& ctx); +Id EmitLoadSharedU16(EmitContext& ctx, Id offset); Id EmitLoadSharedU32(EmitContext& ctx, Id offset); Id EmitLoadSharedU64(EmitContext& ctx, Id offset); +void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp index 8b1610d61..c59406499 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp @@ -1,43 +1,86 @@ // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/div_ceil.h" +#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { +Id EmitLoadSharedU16(EmitContext& ctx, Id offset) { + const Id shift_id{ctx.ConstU32(1U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)}; + + return AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index); + return ctx.OpLoad(ctx.U16, pointer); + }); +} + Id EmitLoadSharedU32(EmitContext& ctx, Id offset) { const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index); - return ctx.OpLoad(ctx.U32[1], pointer); + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + 
const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index); + return ctx.OpLoad(ctx.U32[1], pointer); + }); } Id EmitLoadSharedU64(EmitContext& ctx, Id offset) { - const Id shift_id{ctx.ConstU32(2U)}; - const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))}; - const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)}; - const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)}; - return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer), - ctx.OpLoad(ctx.U32[1], rhs_pointer)); + const Id shift_id{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + + return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)}; + return ctx.OpLoad(ctx.U64, pointer); + }); +} + +void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) { + const Id shift{ctx.ConstU32(1U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)}; + + AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index); + ctx.OpStore(pointer, value); + return Id{0}; + }); } void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) { const Id shift{ctx.ConstU32(2U)}; - const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; - const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset); - ctx.OpStore(pointer, value); + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + + AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index); + ctx.OpStore(pointer, value); + return Id{0}; + }); } void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { - const Id shift{ctx.ConstU32(2U)}; - const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; - const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))}; - const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)}; - const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)}; - ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U)); - ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U)); + const Id shift{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + + AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)}; + ctx.OpStore(pointer, value); + return Id{0}; + }); 
} } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 9e51f8e60..672856397 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -146,6 +146,7 @@ void EmitContext::DefineArithmeticTypes() { false_value = ConstantFalse(U1[1]); u8_one_value = Constant(U8, 1U); u8_zero_value = Constant(U8, 0U); + u16_zero_value = Constant(U16, 0U); u32_one_value = ConstU32(1U); u32_zero_value = ConstU32(0U); f32_zero_value = ConstF32(0.0f); @@ -285,6 +286,8 @@ void EmitContext::DefineBufferProperties() { Name(buffer.size_shorts, fmt::format("buf{}_short_size", binding)); buffer.size_dwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(2U)); Name(buffer.size_dwords, fmt::format("buf{}_dword_size", binding)); + buffer.size_qwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(3U)); + Name(buffer.size_qwords, fmt::format("buf{}_qword_size", binding)); } } } @@ -979,13 +982,27 @@ void EmitContext::DefineSharedMemory() { } ASSERT(info.stage == Stage::Compute); const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size; - const u32 num_elements{Common::DivCeil(shared_memory_size, 4U)}; - const Id type{TypeArray(U32[1], ConstU32(num_elements))}; - shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type); - shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]); - shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup); - Name(shared_memory_u32, "shared_mem"); - interfaces.push_back(shared_memory_u32); + + const auto make_type = [&](Id element_type, u32 element_size) { + const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)}; + const Id array_type{TypeArray(element_type, ConstU32(num_elements))}; + Decorate(array_type, spv::Decoration::ArrayStride, element_size); + + const Id struct_type{TypeStruct(array_type)}; + MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u); + Decorate(struct_type, spv::Decoration::Block); + + const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type); + const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type); + const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup); + Decorate(variable, spv::Decoration::Aliased); + interfaces.push_back(variable); + + return std::make_tuple(variable, element_pointer, pointer); + }; + std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u); + std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u); + std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u); } Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) { diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 20d936cf0..93c4ed265 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -235,17 +235,16 @@ public: Id false_value{}; Id u8_one_value{}; Id u8_zero_value{}; + Id u16_zero_value{}; Id u32_one_value{}; Id u32_zero_value{}; Id f32_zero_value{}; Id u64_one_value{}; Id u64_zero_value{}; - Id shared_u8{}; Id shared_u16{}; Id shared_u32{}; - Id shared_u32x2{}; - Id shared_u32x4{}; + Id shared_u64{}; Id input_u32{}; Id input_f32{}; @@ -285,13 
+284,13 @@ public: Id image_u32{}; Id image_f32{}; - Id shared_memory_u8{}; Id shared_memory_u16{}; Id shared_memory_u32{}; - Id shared_memory_u32x2{}; - Id shared_memory_u32x4{}; + Id shared_memory_u64{}; + Id shared_memory_u16_type{}; Id shared_memory_u32_type{}; + Id shared_memory_u64_type{}; Id bary_coord_persp_id{}; Id bary_coord_linear_id{}; @@ -320,6 +319,7 @@ public: Id size; Id size_shorts; Id size_dwords; + Id size_qwords; std::array aliases; const BufferSpv& operator[](PointerType alias) const { diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index c29497ada..4b6a58fd0 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -13,6 +13,8 @@ void Translator::EmitDataShare(const GcnInst& inst) { // DS case Opcode::DS_ADD_U32: return DS_ADD_U32(inst, false); + case Opcode::DS_ADD_U64: + return DS_ADD_U64(inst, false); case Opcode::DS_SUB_U32: return DS_SUB_U32(inst, false); case Opcode::DS_INC_U32: @@ -61,10 +63,14 @@ void Translator::EmitDataShare(const GcnInst& inst) { return DS_READ(32, false, true, false, inst); case Opcode::DS_READ2ST64_B32: return DS_READ(32, false, true, true, inst); + case Opcode::DS_READ_U16: + return DS_READ(16, false, false, false, inst); case Opcode::DS_CONSUME: return DS_CONSUME(inst); case Opcode::DS_APPEND: return DS_APPEND(inst); + case Opcode::DS_WRITE_B16: + return DS_WRITE(16, false, false, false, inst); case Opcode::DS_WRITE_B64: return DS_WRITE(64, false, false, false, inst); case Opcode::DS_WRITE2_B64: @@ -123,6 +129,18 @@ void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) { } } +void Translator::DS_ADD_U64(const GcnInst& inst, bool rtn) { + const IR::U32 addr{GetSrc(inst.src[0])}; + const IR::U64 data{GetSrc64(inst.src[1])}; + const IR::U32 offset = + ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); + const IR::U32 addr_offset = ir.IAdd(addr, offset); + const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data); + if (rtn) { + SetDst64(inst.dst[0], IR::U64{original_val}); + } +} + void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) { const IR::U32 addr{GetSrc(inst.src[0])}; const IR::U32 data{GetSrc(inst.src[1])}; @@ -201,23 +219,28 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data0), addr0); } else { - ir.WriteShared( - 64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)), - addr0); + ir.WriteShared(64, + ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), + ir.GetVectorReg(data0 + 1))), + addr0); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data1), addr1); } else { - ir.WriteShared( - 64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)), - addr1); + ir.WriteShared(64, + ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), + ir.GetVectorReg(data1 + 1))), + addr1); } } else if (bit_size == 64) { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, data, addr0); + ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); + } else if (bit_size == 16) { + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); + ir.WriteShared(bit_size, 
ir.GetVectorReg(data0), addr0); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); @@ -289,22 +312,29 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data0}); } else { - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)}); + const auto vector = ir.UnpackUint2x32(IR::U64{data0}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data1}); } else { - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)}); + const auto vector = ir.UnpackUint2x32(IR::U64{data1}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); } } else if (bit_size == 64) { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)}); + const auto vector = ir.UnpackUint2x32(IR::U64{data}); + ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 16) { + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); + const IR::U16 data = IR::U16{ir.LoadShared(bit_size, is_signed, addr0)}; + ir.SetVectorReg(dst_reg, ir.UConvert(32, data)); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)}; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 96ca924a3..086b325aa 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -271,6 +271,7 @@ public: // Data share // DS void DS_ADD_U32(const GcnInst& inst, bool rtn); + void DS_ADD_U64(const GcnInst& inst, bool rtn); void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn); void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index e6cc32829..2c37c8099 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -293,10 +293,12 @@ void IREmitter::SetPatch(Patch patch, const F32& value) { Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { switch (bit_size) { + case 16: + return Inst(Opcode::LoadSharedU16, offset); case 32: return Inst(Opcode::LoadSharedU32, offset); case 64: - return Inst(Opcode::LoadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, offset); default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } @@ -304,6 +306,9 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { void IREmitter::WriteShared(int bit_size, const Value& value, 
const U32& offset) { switch (bit_size) { + case 16: + Inst(Opcode::WriteSharedU16, offset, value); + break; case 32: Inst(Opcode::WriteSharedU32, offset, value); break; @@ -315,10 +320,12 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) } } -U32F32 IREmitter::SharedAtomicIAdd(const U32& address, const U32F32& data) { +U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) { switch (data.Type()) { case Type::U32: return Inst(Opcode::SharedAtomicIAdd32, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicIAdd64, address, data); default: ThrowInvalidType(data.Type()); } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 0e41f4b2d..eae44ed04 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -99,7 +99,7 @@ public: [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset); - [[nodiscard]] U32F32 SharedAtomicIAdd(const U32& address, const U32F32& data); + [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data); [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed); [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed); [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 5b3216be6..e96e32297 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -30,13 +30,16 @@ OPCODE(EmitVertex, Void, OPCODE(EmitPrimitive, Void, ) // Shared memory operations +OPCODE(LoadSharedU16, U16, U32, ) OPCODE(LoadSharedU32, U32, U32, ) -OPCODE(LoadSharedU64, U32x2, U32, ) +OPCODE(LoadSharedU64, U64, U32, ) +OPCODE(WriteSharedU16, Void, U32, U16, ) OPCODE(WriteSharedU32, Void, U32, U32, ) -OPCODE(WriteSharedU64, Void, U32, U32x2, ) +OPCODE(WriteSharedU64, Void, U32, U64, ) // Shared atomic operations OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) +OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) @@ -116,6 +119,7 @@ OPCODE(StoreBufferFormatF32, Void, Opaq // Buffer atomic operations OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 ) +OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 ) OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 ) diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index 02745bf9a..b292b41b9 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -39,11 +39,13 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim ASSERT(addr->Arg(1).IsImmediate()); offset = addr->Arg(1).U32(); } - IR::Value data = inst.Arg(1).Resolve(); + IR::Value data = is_composite ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) + : inst.Arg(1).Resolve(); for (s32 i = 0; i < num_components; i++) { const auto attrib = IR::Attribute::Param0 + (offset / 16); const auto comp = (offset / 4) % 4; - const IR::U32 value = IR::U32{is_composite ? 
data.Inst()->Arg(i) : data}; + const IR::U32 value = + IR::U32{is_composite ? ir.CompositeExtract(data, i) : data}; ir.SetAttribute(attrib, ir.BitCast(value), comp); offset += 4; } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index d4759b32e..ba8d1cca6 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -34,8 +34,10 @@ void Visit(Info& info, const IR::Inst& inst) { info.uses_patches |= 1U << IR::GenericPatchIndex(patch); break; } + case IR::Opcode::LoadSharedU16: case IR::Opcode::LoadSharedU32: case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: info.uses_shared = true; diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index 25aaf257c..409c05940 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -16,6 +16,7 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicSMax32: case IR::Opcode::SharedAtomicUMax32: @@ -33,9 +34,11 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ if (program.info.stage != Stage::Compute) { return; } - // Only perform the transform if the host shared memory is insufficient. + // Only perform the transform if the host shared memory is insufficient + // or the device does not support VK_KHR_workgroup_memory_explicit_layout const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size; - if (shared_memory_size <= profile.max_shared_memory_size) { + if (shared_memory_size <= profile.max_shared_memory_size && + profile.supports_workgroup_explicit_memory_layout) { return; } // Add buffer binding for shared memory storage buffer. 
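For orientation, a minimal sketch of the decision this pass now makes (the free-standing helper name UsesStorageBufferForLds is an assumption for illustration; in the patch the check is inlined at the top of SharedMemoryToStoragePass):

    // Illustrative sketch only. LDS stays in true Workgroup storage when it both
    // fits the device limit and the aliased u16/u32/u64 views can be declared via
    // VK_KHR_workgroup_memory_explicit_layout; otherwise every LoadShared*/
    // WriteShared*/SharedAtomic* opcode is rewritten against the storage buffer
    // binding added below.
    bool UsesStorageBufferForLds(const RuntimeInfo& runtime_info, const Profile& profile) {
        const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
        const bool fits_in_lds = shared_memory_size <= profile.max_shared_memory_size;
        const bool has_explicit_layout = profile.supports_workgroup_explicit_memory_layout;
        return !(fits_in_lds && has_explicit_layout);
    }
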
@@ -60,6 +63,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {})); continue; @@ -93,12 +97,19 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.Imm32(shared_memory_size)); const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset); switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU16: + inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {})); + break; case IR::Opcode::LoadSharedU32: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {})); break; case IR::Opcode::LoadSharedU64: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {})); break; + case IR::Opcode::WriteSharedU16: + ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {}); + inst.Invalidate(); + break; case IR::Opcode::WriteSharedU32: ir.StoreBufferU32(1, handle, address, inst.Arg(1), {}); inst.Invalidate(); diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 853e4854d..7d313180f 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -23,13 +23,13 @@ struct Profile { bool support_fp32_denorm_preserve{}; bool support_fp32_denorm_flush{}; bool support_fp32_round_to_zero{}; - bool support_explicit_workgroup_layout{}; bool support_legacy_vertex_attributes{}; bool supports_image_load_store_lod{}; bool supports_native_cube_calc{}; bool supports_trinary_minmax{}; bool supports_robust_buffer_access{}; bool supports_image_fp32_atomic_min_max{}; + bool supports_workgroup_explicit_memory_layout{}; bool has_broken_spirv_clamp{}; bool lower_left_origin_mode{}; bool needs_manual_interpolation{}; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 9584329f0..0591e06ce 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -212,7 +212,8 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT, vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT, vk::PhysicalDevicePortabilitySubsetFeaturesKHR, - vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>(); + vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT, + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>(); features = feature_chain.get().features; const vk::StructureChain properties_chain = physical_device.getProperties2< @@ -283,6 +284,20 @@ bool Instance::CreateDevice() { LOG_INFO(Render_Vulkan, "- shaderImageFloat32AtomicMinMax: {}", shader_atomic_float2_features.shaderImageFloat32AtomicMinMax); } + workgroup_memory_explicit_layout = + add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); + if (workgroup_memory_explicit_layout) { + workgroup_memory_explicit_layout_features = + feature_chain.get<vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>(); + LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayout: {}", + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout); + LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayoutScalarBlockLayout: {}", + workgroup_memory_explicit_layout_features + .workgroupMemoryExplicitLayoutScalarBlockLayout); + LOG_INFO( + Render_Vulkan, "- workgroupMemoryExplicitLayout16BitAccess: {}", + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess); + } const bool 
calibrated_timestamps = TRACY_GPU_ENABLED ? add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) : false; @@ -420,6 +435,15 @@ bool Instance::CreateDevice() { .shaderImageFloat32AtomicMinMax = shader_atomic_float2_features.shaderImageFloat32AtomicMinMax, }, + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR{ + .workgroupMemoryExplicitLayout = + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout, + .workgroupMemoryExplicitLayoutScalarBlockLayout = + workgroup_memory_explicit_layout_features + .workgroupMemoryExplicitLayoutScalarBlockLayout, + .workgroupMemoryExplicitLayout16BitAccess = + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess, + }, #ifdef __APPLE__ portability_features, #endif @@ -452,6 +476,9 @@ bool Instance::CreateDevice() { if (!shader_atomic_float2) { device_chain.unlink<vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>(); } + if (!workgroup_memory_explicit_layout) { + device_chain.unlink<vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>(); + } auto [device_result, dev] = physical_device.createDeviceUnique(device_chain.get()); if (device_result != vk::Result::eSuccess) { diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 30848e8b7..c687e6f67 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -171,6 +171,12 @@ public: return shader_atomic_float2 && shader_atomic_float2_features.shaderImageFloat32AtomicMinMax; } + /// Returns true when VK_KHR_workgroup_memory_explicit_layout is supported. + bool IsWorkgroupMemoryExplicitLayoutSupported() const { + return workgroup_memory_explicit_layout && + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess; + } + /// Returns true when geometry shaders are supported by the device bool IsGeometryStageSupported() const { return features.geometryShader; @@ -349,6 +355,8 @@ private: vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT dynamic_state_3_features; vk::PhysicalDeviceRobustness2FeaturesEXT robustness2_features; vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT shader_atomic_float2_features; + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR + workgroup_memory_explicit_layout_features; vk::DriverIdKHR driver_id; vk::UniqueDebugUtilsMessengerEXT debug_callback{}; std::string vendor_name; @@ -374,6 +382,7 @@ private: bool amd_gcn_shader{}; bool amd_shader_trinary_minmax{}; bool shader_atomic_float2{}; + bool workgroup_memory_explicit_layout{}; bool portability_subset{}; }; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index cd8552515..2c3f4ba2f 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -210,7 +210,6 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, .support_fp32_denorm_preserve = bool(vk12_props.shaderDenormPreserveFloat32), .support_fp32_denorm_flush = bool(vk12_props.shaderDenormFlushToZeroFloat32), .support_fp32_round_to_zero = bool(vk12_props.shaderRoundingModeRTZFloat32), - .support_explicit_workgroup_layout = true, .support_legacy_vertex_attributes = instance_.IsLegacyVertexAttributesSupported(), .supports_image_load_store_lod = instance_.IsImageLoadStoreLodSupported(), .supports_native_cube_calc = instance_.IsAmdGcnShaderSupported(), @@ -218,6 +217,8 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, // TODO: Emitted bounds checks cause problems with phi control flow; needs to be 
fixed. .supports_robust_buffer_access = true, // instance_.IsRobustBufferAccess2Supported(), .supports_image_fp32_atomic_min_max = instance_.IsShaderAtomicFloatImage32MinMaxSupported(), + .supports_workgroup_explicit_memory_layout = + instance_.IsWorkgroupMemoryExplicitLayoutSupported(), .needs_manual_interpolation = instance.IsFragmentShaderBarycentricSupported() && instance.GetDriverID() == vk::DriverId::eNvidiaProprietary, .needs_lds_barriers = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary ||