Handle DS_READ_U16, DS_WRITE_B16, DS_ADD_U64 (#3007)

* Handle DS_READ_U16 & DS_WRITE_B16

* Refactor DS translation

* Translate DS_ADD_U64

* format

* Fix RingAccessElimination after changing WriteShared64 type

* Simplify bounds checking in generated SPIR-V
Authored by Marcin Mikołajczyk on 2025-06-09 21:03:38 +02:00, committed by GitHub
parent a71bfb30a2
commit 217d32b502
19 changed files with 323 additions and 89 deletions

@@ -303,6 +303,11 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ctx
         ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
         ctx.AddExtension("SPV_KHR_physical_storage_buffer");
     }
+    if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) {
+        ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout");
+        ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR);
+        ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR);
+    }
 }
 
 void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) {
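
Each Sirit call above emits one module-level SPIR-V instruction. A sketch of what the new branch contributes to the preamble of an LDS-using compute shader when the profile flag is set (resulting SPIR-V text shown as comments; illustrative, not tool output):

    ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout");
    // -> OpExtension "SPV_KHR_workgroup_memory_explicit_layout"
    ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR);
    // -> OpCapability WorkgroupMemoryExplicitLayoutKHR
    ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR);
    // -> OpCapability WorkgroupMemoryExplicitLayout16BitAccessKHR

The 16-bit capability is needed because the u16 view of shared memory introduced later in this change aliases the same workgroup allocation as the 32- and 64-bit views.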

@@ -1,6 +1,8 @@
 // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include "common/div_ceil.h"
+#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h"
 #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"
@@ -15,42 +17,40 @@ std::pair<Id, Id> AtomicArgs(EmitContext& ctx) {
 Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value,
                    Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const Id shift_id{ctx.ConstU32(2U)};
-    const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+    const Id pointer{
+        ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)};
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value);
+    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value);
+    });
+}
+
+Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value,
+                   Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
+    const Id shift_id{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+    const Id pointer{
+        ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)};
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics, value);
+    });
 }
 
 Id SharedAtomicU32_IncDec(EmitContext& ctx, Id offset,
                           Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) {
     const Id shift_id{ctx.ConstU32(2U)};
-    const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+    const Id pointer{
+        ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)};
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics);
-}
-
-Id BufferAtomicU32BoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) {
-    if (Sirit::ValidId(buffer_size)) {
-        // Bounds checking enabled, wrap in a conditional branch to make sure that
-        // the atomic is not mistakenly executed when the index is out of bounds.
-        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size);
-        const Id ib_label = ctx.OpLabel();
-        const Id oob_label = ctx.OpLabel();
-        const Id end_label = ctx.OpLabel();
-        ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone);
-        ctx.OpBranchConditional(in_bounds, ib_label, oob_label);
-        ctx.AddLabel(ib_label);
-        const Id ib_result = emit_func();
-        ctx.OpBranch(end_label);
-        ctx.AddLabel(oob_label);
-        const Id oob_result = ctx.u32_zero_value;
-        ctx.OpBranch(end_label);
-        ctx.AddLabel(end_label);
-        return ctx.OpPhi(ctx.U32[1], ib_result, ib_label, oob_result, oob_label);
-    }
-    // Bounds checking not enabled, just perform the atomic operation.
-    return emit_func();
+    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics);
+    });
 }
 
@@ -63,7 +63,7 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id
     const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
     const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] {
+    return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] {
         return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, value);
     });
 }
@@ -79,11 +79,26 @@ Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
     const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] {
+    return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] {
         return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, semantics, value, cmp_value);
     });
 }
 
+Id BufferAtomicU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
+                   Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
+    const auto& buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
+    }
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u));
+    const auto [id, pointer_type] = buffer[EmitContext::PointerType::U64];
+    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<64>(ctx, index, buffer.size_qwords, [&] {
+        return (ctx.*atomic_func)(ctx.U64, ptr, scope, semantics, value);
+    });
+}
+
 Id ImageAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value,
                   Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const auto& texture = ctx.images[handle & 0xFFFF];
@@ -105,6 +120,10 @@ Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicIAdd);
 }
 
+Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicIAdd);
+}
+
 Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax);
 }
@@ -149,6 +168,10 @@ Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd);
 }
 
+Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd);
+}
+
 Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin);
 }
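
The 64-bit paths mirror the 32-bit ones: the byte address is shifted right by 3 instead of 2, and the bound is measured in qwords. A worked example of the index math (hypothetical values, plain C++):

    constexpr std::uint32_t address = 40;            // byte address from the IR
    constexpr std::uint32_t index = address >> 3;    // qword index 5
    constexpr std::uint32_t size_qwords = 100 >> 3;  // a 100-byte buffer holds 12 qwords
    static_assert(index < size_qwords);              // in bounds, so the atomic executes

An out-of-bounds index instead produces the typed zero value from the bounds-check phi, so the atomic is skipped rather than faulting.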

@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
+#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
+
+namespace Shader::Backend::SPIRV {
+
+template <u32 bit_size>
+auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) {
+    Id zero_value{};
+    Id result_type{};
+    if constexpr (bit_size == 64) {
+        zero_value = ctx.u64_zero_value;
+        result_type = ctx.U64;
+    } else if constexpr (bit_size == 32) {
+        zero_value = ctx.u32_zero_value;
+        result_type = ctx.U32[1];
+    } else if constexpr (bit_size == 16) {
+        zero_value = ctx.u16_zero_value;
+        result_type = ctx.U16;
+    } else {
+        static_assert(false, "type not supported");
+    }
+    if (Sirit::ValidId(buffer_size)) {
+        // Bounds checking enabled, wrap in a conditional branch to make sure that
+        // the atomic is not mistakenly executed when the index is out of bounds.
+        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size);
+        const Id ib_label = ctx.OpLabel();
+        const Id end_label = ctx.OpLabel();
+        ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone);
+        ctx.OpBranchConditional(in_bounds, ib_label, end_label);
+        const auto last_label = ctx.last_label;
+        ctx.AddLabel(ib_label);
+        const auto ib_result = emit_func();
+        ctx.OpBranch(end_label);
+        ctx.AddLabel(end_label);
+        if (Sirit::ValidId(ib_result)) {
+            return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, last_label);
+        } else {
+            return Id{0};
+        }
+    }
+    // Bounds checking not enabled, just perform the atomic operation.
+    return emit_func();
+}
+
+} // namespace Shader::Backend::SPIRV
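
One subtlety in the helper above: when the index is out of bounds, control flows straight from the block containing the conditional branch to the merge block, so the OpPhi's zero operand must name that block as its predecessor. That is why ctx.last_label is captured before AddLabel(ib_label) switches the current block. The emitted control flow, sketched as comments (labels illustrative):

    // %pred: %in_bounds = OpULessThan %bool %index %size
    //        OpSelectionMerge %end None
    //        OpBranchConditional %in_bounds %ib %end
    // %ib:   %r = <guarded access>
    //        OpBranch %end
    // %end:  %result = OpPhi %type %r %ib %zero %pred

Callers that only store, such as the EmitWriteShared* helpers later in this change, return Id{0} from the lambda, which fails ValidId and skips the phi entirely.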

@@ -86,6 +86,7 @@ void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
 void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
@@ -120,11 +121,14 @@ Id EmitUndefU8(EmitContext& ctx);
 Id EmitUndefU16(EmitContext& ctx);
 Id EmitUndefU32(EmitContext& ctx);
 Id EmitUndefU64(EmitContext& ctx);
+Id EmitLoadSharedU16(EmitContext& ctx, Id offset);
 Id EmitLoadSharedU32(EmitContext& ctx, Id offset);
 Id EmitLoadSharedU64(EmitContext& ctx, Id offset);
+void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value);
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value);
 void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value);

@@ -1,43 +1,86 @@
 // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include "common/div_ceil.h"
+#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h"
 #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"
 
 namespace Shader::Backend::SPIRV {
 
+Id EmitLoadSharedU16(EmitContext& ctx, Id offset) {
+    const Id shift_id{ctx.ConstU32(1U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)};
+
+    return AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer =
+            ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index);
+        return ctx.OpLoad(ctx.U16, pointer);
+    });
+}
+
 Id EmitLoadSharedU32(EmitContext& ctx, Id offset) {
     const Id shift_id{ctx.ConstU32(2U)};
-    const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index);
-    return ctx.OpLoad(ctx.U32[1], pointer);
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+
+    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer =
+            ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index);
+        return ctx.OpLoad(ctx.U32[1], pointer);
+    });
 }
 
 Id EmitLoadSharedU64(EmitContext& ctx, Id offset) {
-    const Id shift_id{ctx.ConstU32(2U)};
-    const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))};
-    const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)};
-    const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)};
-    return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer),
-                                    ctx.OpLoad(ctx.U32[1], rhs_pointer));
+    const Id shift_id{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+
+    return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer{
+            ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)};
+        return ctx.OpLoad(ctx.U64, pointer);
+    });
+}
+
+void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) {
+    const Id shift{ctx.ConstU32(1U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)};
+
+    AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer =
+            ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index);
+        ctx.OpStore(pointer, value);
+        return Id{0};
+    });
 }
 
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) {
     const Id shift{ctx.ConstU32(2U)};
-    const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset);
-    ctx.OpStore(pointer, value);
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+
+    AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer =
+            ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index);
+        ctx.OpStore(pointer, value);
+        return Id{0};
+    });
 }
 
 void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) {
-    const Id shift{ctx.ConstU32(2U)};
-    const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))};
-    const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)};
-    const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)};
-    ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U));
-    ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U));
+    const Id shift{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+
+    AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer{
+            ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)};
+        ctx.OpStore(pointer, value);
+        return Id{0};
+    });
 }
 
 } // namespace Shader::Backend::SPIRV
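
All six helpers share one addressing scheme: the IR supplies a byte offset, a logical right shift converts it to an element index, and the bound is the LDS size in elements, rounded up. A worked example for the 16-bit path (hypothetical numbers, plain C++):

    constexpr std::uint32_t offset = 10;                  // byte offset from the IR
    constexpr std::uint32_t index = offset >> 1;          // u16 element index 5
    constexpr std::uint32_t num_elements = (48 + 1) / 2;  // DivCeil(48, 2) = 24
    static_assert(index < num_elements);                  // access proceeds

The move from OpShiftRightArithmetic to OpShiftRightLogical also guarantees that a large unsigned offset cannot be sign-extended into a negative index.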

@@ -146,6 +146,7 @@ void EmitContext::DefineArithmeticTypes() {
     false_value = ConstantFalse(U1[1]);
     u8_one_value = Constant(U8, 1U);
     u8_zero_value = Constant(U8, 0U);
+    u16_zero_value = Constant(U16, 0U);
     u32_one_value = ConstU32(1U);
     u32_zero_value = ConstU32(0U);
     f32_zero_value = ConstF32(0.0f);
@@ -285,6 +286,8 @@ void EmitContext::DefineBufferProperties() {
             Name(buffer.size_shorts, fmt::format("buf{}_short_size", binding));
             buffer.size_dwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(2U));
             Name(buffer.size_dwords, fmt::format("buf{}_dword_size", binding));
+            buffer.size_qwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(3U));
+            Name(buffer.size_qwords, fmt::format("buf{}_qword_size", binding));
         }
     }
 }
@@ -979,13 +982,27 @@ void EmitContext::DefineSharedMemory() {
     }
     ASSERT(info.stage == Stage::Compute);
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    const u32 num_elements{Common::DivCeil(shared_memory_size, 4U)};
-    const Id type{TypeArray(U32[1], ConstU32(num_elements))};
-    shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type);
-    shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]);
-    shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup);
-    Name(shared_memory_u32, "shared_mem");
-    interfaces.push_back(shared_memory_u32);
+
+    const auto make_type = [&](Id element_type, u32 element_size) {
+        const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)};
+        const Id array_type{TypeArray(element_type, ConstU32(num_elements))};
+        Decorate(array_type, spv::Decoration::ArrayStride, element_size);
+
+        const Id struct_type{TypeStruct(array_type)};
+        MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u);
+        Decorate(struct_type, spv::Decoration::Block);
+
+        const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type);
+        const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type);
+        const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup);
+        Decorate(variable, spv::Decoration::Aliased);
+        interfaces.push_back(variable);
+
+        return std::make_tuple(variable, element_pointer, pointer);
+    };
+
+    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u);
+    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u);
+    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u);
 }
 
 Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {
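
make_type builds three aliased views of a single LDS allocation: u16, u32, and u64 arrays, each wrapped in a Block-decorated struct with explicit ArrayStride and Offset decorations, and each marked Aliased as SPV_KHR_workgroup_memory_explicit_layout requires for workgroup variables that share storage. The element counts for a hypothetical 68-byte allocation (DivCeil reimplemented inline for illustration):

    constexpr std::uint32_t DivCeil(std::uint32_t n, std::uint32_t d) {
        return (n + d - 1) / d;
    }
    static_assert(DivCeil(68, 2) == 34);  // u16 view
    static_assert(DivCeil(68, 4) == 17);  // u32 view
    static_assert(DivCeil(68, 8) == 9);   // u64 view rounds the half qword up

Because the three views cover the same bytes, mixed-width sequences such as DS_WRITE_B16 followed by a 32-bit read of the same address behave as they would on real LDS.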

@@ -235,17 +235,16 @@ public:
     Id false_value{};
     Id u8_one_value{};
     Id u8_zero_value{};
+    Id u16_zero_value{};
     Id u32_one_value{};
     Id u32_zero_value{};
     Id f32_zero_value{};
     Id u64_one_value{};
     Id u64_zero_value{};
 
-    Id shared_u8{};
     Id shared_u16{};
     Id shared_u32{};
-    Id shared_u32x2{};
-    Id shared_u32x4{};
+    Id shared_u64{};
 
     Id input_u32{};
     Id input_f32{};
@@ -285,13 +284,13 @@ public:
     Id image_u32{};
     Id image_f32{};
 
-    Id shared_memory_u8{};
     Id shared_memory_u16{};
     Id shared_memory_u32{};
-    Id shared_memory_u32x2{};
-    Id shared_memory_u32x4{};
+    Id shared_memory_u64{};
 
+    Id shared_memory_u16_type{};
     Id shared_memory_u32_type{};
+    Id shared_memory_u64_type{};
 
     Id bary_coord_persp_id{};
     Id bary_coord_linear_id{};
@@ -320,6 +319,7 @@ public:
         Id size;
         Id size_shorts;
         Id size_dwords;
+        Id size_qwords;
         std::array<BufferSpv, u32(PointerType::NumAlias)> aliases;
 
         const BufferSpv& operator[](PointerType alias) const {

@@ -13,6 +13,8 @@ void Translator::EmitDataShare(const GcnInst& inst) {
     // DS
     case Opcode::DS_ADD_U32:
         return DS_ADD_U32(inst, false);
+    case Opcode::DS_ADD_U64:
+        return DS_ADD_U64(inst, false);
     case Opcode::DS_SUB_U32:
         return DS_SUB_U32(inst, false);
     case Opcode::DS_INC_U32:
@@ -61,10 +63,14 @@ void Translator::EmitDataShare(const GcnInst& inst) {
         return DS_READ(32, false, true, false, inst);
     case Opcode::DS_READ2ST64_B32:
         return DS_READ(32, false, true, true, inst);
+    case Opcode::DS_READ_U16:
+        return DS_READ(16, false, false, false, inst);
     case Opcode::DS_CONSUME:
         return DS_CONSUME(inst);
     case Opcode::DS_APPEND:
         return DS_APPEND(inst);
+    case Opcode::DS_WRITE_B16:
+        return DS_WRITE(16, false, false, false, inst);
     case Opcode::DS_WRITE_B64:
         return DS_WRITE(64, false, false, false, inst);
     case Opcode::DS_WRITE2_B64:
@@ -123,6 +129,18 @@ void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) {
     }
 }
 
+void Translator::DS_ADD_U64(const GcnInst& inst, bool rtn) {
+    const IR::U32 addr{GetSrc(inst.src[0])};
+    const IR::U64 data{GetSrc64(inst.src[1])};
+    const IR::U32 offset =
+        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
+    const IR::U32 addr_offset = ir.IAdd(addr, offset);
+    const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data);
+    if (rtn) {
+        SetDst64(inst.dst[0], IR::U64{original_val});
+    }
+}
+
 void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) {
     const IR::U32 addr{GetSrc(inst.src[0])};
     const IR::U32 data{GetSrc(inst.src[1])};
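
As with the other DS helpers, offset1 and offset0 are two 8-bit immediates that combine into a single 16-bit byte offset for the non-pair forms, with offset1 supplying the high byte. For example (values hypothetical):

    constexpr std::uint32_t offset1 = 0x01, offset0 = 0x10;
    static_assert(((offset1 << 8) + offset0) == 0x110);  // 272-byte displacement

When rtn is set, the atomic's pre-op value comes back as a single U64; SetDst64 then writes it across a pair of 32-bit destination registers.
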
@@ -201,23 +219,28 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
         if (bit_size == 32) {
             ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
         } else {
-            ir.WriteShared(
-                64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)),
-                addr0);
+            ir.WriteShared(64,
+                           ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0),
+                                                                 ir.GetVectorReg(data0 + 1))),
+                           addr0);
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
         if (bit_size == 32) {
             ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
         } else {
-            ir.WriteShared(
-                64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)),
-                addr1);
+            ir.WriteShared(64,
+                           ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1),
+                                                                 ir.GetVectorReg(data1 + 1))),
+                           addr1);
         }
     } else if (bit_size == 64) {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::Value data =
             ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
-        ir.WriteShared(bit_size, data, addr0);
+        ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0);
+    } else if (bit_size == 16) {
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
+        ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
     } else {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
@@ -289,22 +312,29 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride
         if (bit_size == 32) {
             ir.SetVectorReg(dst_reg++, IR::U32{data0});
         } else {
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)});
+            const auto vector = ir.UnpackUint2x32(IR::U64{data0});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
         const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1);
         if (bit_size == 32) {
             ir.SetVectorReg(dst_reg++, IR::U32{data1});
         } else {
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)});
+            const auto vector = ir.UnpackUint2x32(IR::U64{data1});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
         }
     } else if (bit_size == 64) {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
-        ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)});
-        ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)});
+        const auto vector = ir.UnpackUint2x32(IR::U64{data});
+        ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
+        ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
+    } else if (bit_size == 16) {
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
+        const IR::U16 data = IR::U16{ir.LoadShared(bit_size, is_signed, addr0)};
+        ir.SetVectorReg(dst_reg, ir.UConvert(32, data));
     } else {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)};
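
Since LoadSharedU64 and WriteSharedU64 now carry a scalar U64 rather than a U32x2, the translator packs and unpacks VGPR pairs at the boundary. PackUint2x32/UnpackUint2x32 follow the usual packUint2x32 convention in which component 0 becomes the least significant dword; a plain C++ analogue of the packing (pack and v0/v1 are illustrative names):

    constexpr std::uint64_t pack(std::uint32_t v0, std::uint32_t v1) {
        return std::uint64_t{v0} | (std::uint64_t{v1} << 32);  // v0 = low dword
    }
    static_assert(pack(0xDEADBEEFu, 0x00C0FFEEu) == 0x00C0FFEE'DEADBEEFull);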

@@ -271,6 +271,7 @@ public:
     // Data share
     // DS
     void DS_ADD_U32(const GcnInst& inst, bool rtn);
+    void DS_ADD_U64(const GcnInst& inst, bool rtn);
     void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn);
     void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn);
     void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst);

@@ -293,10 +293,12 @@ void IREmitter::SetPatch(Patch patch, const F32& value) {
 
 Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
     switch (bit_size) {
+    case 16:
+        return Inst<U16>(Opcode::LoadSharedU16, offset);
     case 32:
         return Inst<U32>(Opcode::LoadSharedU32, offset);
     case 64:
-        return Inst(Opcode::LoadSharedU64, offset);
+        return Inst<U64>(Opcode::LoadSharedU64, offset);
     default:
         UNREACHABLE_MSG("Invalid bit size {}", bit_size);
     }
@@ -304,6 +306,9 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
 
 void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) {
     switch (bit_size) {
+    case 16:
+        Inst(Opcode::WriteSharedU16, offset, value);
+        break;
     case 32:
         Inst(Opcode::WriteSharedU32, offset, value);
         break;
@@ -315,10 +320,12 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset)
     }
 }
 
-U32F32 IREmitter::SharedAtomicIAdd(const U32& address, const U32F32& data) {
+U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) {
     switch (data.Type()) {
     case Type::U32:
         return Inst<U32>(Opcode::SharedAtomicIAdd32, address, data);
+    case Type::U64:
+        return Inst<U64>(Opcode::SharedAtomicIAdd64, address, data);
     default:
         ThrowInvalidType(data.Type());
     }

@@ -99,7 +99,7 @@ public:
     [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset);
     void WriteShared(int bit_size, const Value& value, const U32& offset);
 
-    [[nodiscard]] U32F32 SharedAtomicIAdd(const U32& address, const U32F32& data);
+    [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data);
     [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed);
     [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed);
     [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data);

@@ -30,13 +30,16 @@ OPCODE(EmitVertex, Void, )
 OPCODE(EmitPrimitive, Void, )
 
 // Shared memory operations
+OPCODE(LoadSharedU16, U16, U32, )
 OPCODE(LoadSharedU32, U32, U32, )
-OPCODE(LoadSharedU64, U32x2, U32, )
+OPCODE(LoadSharedU64, U64, U32, )
+OPCODE(WriteSharedU16, Void, U32, U16, )
 OPCODE(WriteSharedU32, Void, U32, U32, )
-OPCODE(WriteSharedU64, Void, U32, U32x2, )
+OPCODE(WriteSharedU64, Void, U32, U64, )
 
 // Shared atomic operations
 OPCODE(SharedAtomicIAdd32, U32, U32, U32, )
+OPCODE(SharedAtomicIAdd64, U64, U32, U64, )
 OPCODE(SharedAtomicSMin32, U32, U32, U32, )
 OPCODE(SharedAtomicUMin32, U32, U32, U32, )
 OPCODE(SharedAtomicSMax32, U32, U32, U32, )
@@ -116,6 +119,7 @@ OPCODE(StoreBufferFormatF32, Void, Opaq
 // Buffer atomic operations
 OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 )
+OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 )
 OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 )
 OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 )
 OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 )

@@ -39,11 +39,13 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim
                 ASSERT(addr->Arg(1).IsImmediate());
                 offset = addr->Arg(1).U32();
             }
-            IR::Value data = inst.Arg(1).Resolve();
+            IR::Value data = is_composite ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()})
+                                          : inst.Arg(1).Resolve();
             for (s32 i = 0; i < num_components; i++) {
                 const auto attrib = IR::Attribute::Param0 + (offset / 16);
                 const auto comp = (offset / 4) % 4;
-                const IR::U32 value = IR::U32{is_composite ? data.Inst()->Arg(i) : data};
+                const IR::U32 value =
+                    IR::U32{is_composite ? ir.CompositeExtract(data, i) : data};
                 ir.SetAttribute(attrib, ir.BitCast<IR::F32, IR::U32>(value), comp);
                 offset += 4;
             }

@@ -34,8 +34,10 @@ void Visit(Info& info, const IR::Inst& inst) {
         info.uses_patches |= 1U << IR::GenericPatchIndex(patch);
         break;
     }
+    case IR::Opcode::LoadSharedU16:
     case IR::Opcode::LoadSharedU32:
     case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU16:
    case IR::Opcode::WriteSharedU32:
     case IR::Opcode::WriteSharedU64:
         info.uses_shared = true;

@@ -16,6 +16,7 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     case IR::Opcode::WriteSharedU64:
     case IR::Opcode::SharedAtomicAnd32:
     case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicIAdd64:
     case IR::Opcode::SharedAtomicOr32:
     case IR::Opcode::SharedAtomicSMax32:
     case IR::Opcode::SharedAtomicUMax32:
@@ -33,9 +34,11 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
     if (program.info.stage != Stage::Compute) {
         return;
     }
-    // Only perform the transform if the host shared memory is insufficient.
+    // Only perform the transform if the host shared memory is insufficient
+    // or the device does not support VK_KHR_workgroup_memory_explicit_layout
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    if (shared_memory_size <= profile.max_shared_memory_size) {
+    if (shared_memory_size <= profile.max_shared_memory_size &&
+        profile.supports_workgroup_explicit_memory_layout) {
         return;
     }
     // Add buffer binding for shared memory storage buffer.
@@ -60,6 +63,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                 ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {}));
             continue;
         case IR::Opcode::SharedAtomicIAdd32:
+        case IR::Opcode::SharedAtomicIAdd64:
             inst.ReplaceUsesWithAndRemove(
                 ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {}));
             continue;
@@ -93,12 +97,19 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                                           ir.Imm32(shared_memory_size));
         const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
         switch (inst.GetOpcode()) {
+        case IR::Opcode::LoadSharedU16:
+            inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
+            break;
         case IR::Opcode::LoadSharedU32:
             inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
             break;
         case IR::Opcode::LoadSharedU64:
             inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {}));
             break;
+        case IR::Opcode::WriteSharedU16:
+            ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {});
+            inst.Invalidate();
+            break;
         case IR::Opcode::WriteSharedU32:
             ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
             inst.Invalidate();
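
The early-out now requires both that the LDS allocation fits the host limit and that the device supports the explicit-layout extension; failing either routes every shared access through the storage buffer, including the new 16-bit and 64-bit forms. A decision sketch:

    // fits in host LDS | explicit layout supported | action
    // -----------------+---------------------------+----------------------------
    //       yes        |            yes            | keep real workgroup memory
    //       yes        |            no             | lower to storage buffer
    //       no         |          either           | lower to storage buffer

Note that SharedAtomicIAdd64 falls through to the same BufferAtomicIAdd lowering as the 32-bit case; presumably the IR emitter selects BufferAtomicIAdd64 from the value type, mirroring how SharedAtomicIAdd dispatches.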

@ -23,13 +23,13 @@ struct Profile {
bool support_fp32_denorm_preserve{}; bool support_fp32_denorm_preserve{};
bool support_fp32_denorm_flush{}; bool support_fp32_denorm_flush{};
bool support_fp32_round_to_zero{}; bool support_fp32_round_to_zero{};
bool support_explicit_workgroup_layout{};
bool support_legacy_vertex_attributes{}; bool support_legacy_vertex_attributes{};
bool supports_image_load_store_lod{}; bool supports_image_load_store_lod{};
bool supports_native_cube_calc{}; bool supports_native_cube_calc{};
bool supports_trinary_minmax{}; bool supports_trinary_minmax{};
bool supports_robust_buffer_access{}; bool supports_robust_buffer_access{};
bool supports_image_fp32_atomic_min_max{}; bool supports_image_fp32_atomic_min_max{};
bool supports_workgroup_explicit_memory_layout{};
bool has_broken_spirv_clamp{}; bool has_broken_spirv_clamp{};
bool lower_left_origin_mode{}; bool lower_left_origin_mode{};
bool needs_manual_interpolation{}; bool needs_manual_interpolation{};

@@ -212,7 +212,8 @@ bool Instance::CreateDevice() {
                        vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT,
                        vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT,
                        vk::PhysicalDevicePortabilitySubsetFeaturesKHR,
-                       vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>();
+                       vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT,
+                       vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>();
     features = feature_chain.get().features;
 
     const vk::StructureChain properties_chain = physical_device.getProperties2<
@@ -283,6 +284,20 @@ bool Instance::CreateDevice() {
         LOG_INFO(Render_Vulkan, "- shaderImageFloat32AtomicMinMax: {}",
                  shader_atomic_float2_features.shaderImageFloat32AtomicMinMax);
     }
+    workgroup_memory_explicit_layout =
+        add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME);
+    if (workgroup_memory_explicit_layout) {
+        workgroup_memory_explicit_layout_features =
+            feature_chain.get<vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>();
+        LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayout: {}",
+                 workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout);
+        LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayoutScalarBlockLayout: {}",
+                 workgroup_memory_explicit_layout_features
+                     .workgroupMemoryExplicitLayoutScalarBlockLayout);
+        LOG_INFO(
+            Render_Vulkan, "- workgroupMemoryExplicitLayout16BitAccess: {}",
+            workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess);
+    }
 
     const bool calibrated_timestamps =
         TRACY_GPU_ENABLED ? add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) : false;
@@ -420,6 +435,15 @@ bool Instance::CreateDevice() {
             .shaderImageFloat32AtomicMinMax =
                 shader_atomic_float2_features.shaderImageFloat32AtomicMinMax,
         },
+        vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR{
+            .workgroupMemoryExplicitLayout =
+                workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout,
+            .workgroupMemoryExplicitLayoutScalarBlockLayout =
+                workgroup_memory_explicit_layout_features
+                    .workgroupMemoryExplicitLayoutScalarBlockLayout,
+            .workgroupMemoryExplicitLayout16BitAccess =
+                workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess,
+        },
 #ifdef __APPLE__
         portability_features,
 #endif
@@ -452,6 +476,9 @@ bool Instance::CreateDevice() {
     if (!shader_atomic_float2) {
         device_chain.unlink<vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>();
     }
+    if (!workgroup_memory_explicit_layout) {
+        device_chain.unlink<vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>();
+    }
 
     auto [device_result, dev] = physical_device.createDeviceUnique(device_chain.get());
     if (device_result != vk::Result::eSuccess) {

@@ -171,6 +171,12 @@ public:
         return shader_atomic_float2 && shader_atomic_float2_features.shaderImageFloat32AtomicMinMax;
     }
 
+    /// Returns true when VK_KHR_workgroup_memory_explicit_layout is supported.
+    bool IsWorkgroupMemoryExplicitLayoutSupported() const {
+        return workgroup_memory_explicit_layout &&
+               workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess;
+    }
+
     /// Returns true when geometry shaders are supported by the device
     bool IsGeometryStageSupported() const {
         return features.geometryShader;
@@ -349,6 +355,8 @@ private:
     vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT dynamic_state_3_features;
     vk::PhysicalDeviceRobustness2FeaturesEXT robustness2_features;
     vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT shader_atomic_float2_features;
+    vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR
+        workgroup_memory_explicit_layout_features;
     vk::DriverIdKHR driver_id;
     vk::UniqueDebugUtilsMessengerEXT debug_callback{};
     std::string vendor_name;
@@ -374,6 +382,7 @@ private:
     bool amd_gcn_shader{};
     bool amd_shader_trinary_minmax{};
     bool shader_atomic_float2{};
+    bool workgroup_memory_explicit_layout{};
     bool portability_subset{};
 };

@@ -210,7 +210,6 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
         .support_fp32_denorm_preserve = bool(vk12_props.shaderDenormPreserveFloat32),
         .support_fp32_denorm_flush = bool(vk12_props.shaderDenormFlushToZeroFloat32),
         .support_fp32_round_to_zero = bool(vk12_props.shaderRoundingModeRTZFloat32),
-        .support_explicit_workgroup_layout = true,
         .support_legacy_vertex_attributes = instance_.IsLegacyVertexAttributesSupported(),
         .supports_image_load_store_lod = instance_.IsImageLoadStoreLodSupported(),
         .supports_native_cube_calc = instance_.IsAmdGcnShaderSupported(),
@@ -218,6 +217,8 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
         // TODO: Emitted bounds checks cause problems with phi control flow; needs to be fixed.
        .supports_robust_buffer_access = true, // instance_.IsRobustBufferAccess2Supported(),
         .supports_image_fp32_atomic_min_max = instance_.IsShaderAtomicFloatImage32MinMaxSupported(),
+        .supports_workgroup_explicit_memory_layout =
+            instance_.IsWorkgroupMemoryExplicitLayoutSupported(),
         .needs_manual_interpolation = instance.IsFragmentShaderBarycentricSupported() &&
                                       instance.GetDriverID() == vk::DriverId::eNvidiaProprietary,
         .needs_lds_barriers = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary ||