Mirror of https://github.com/shadps4-emu/shadPS4.git (synced 2025-06-26 12:26:18 +00:00)
shader_recompiler: Reduce cases where shared memory to buffer pass is needed. (#3082)
parent 69a50fa713
commit c71dc740e2

9 changed files with 232 additions and 49 deletions
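A rough summary, as read from the diff below: a new SharedMemorySimplifyPass lowers 16-bit and 64-bit shared-memory loads and stores to 32-bit equivalents when VK_KHR_workgroup_memory_explicit_layout is unavailable and no atomics of those widths are used, and SharedMemoryToStoragePass now runs only when the requested shared memory size exceeds the host limit or multiple shared types would require aliasing the host cannot provide.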
```diff
@@ -28,6 +28,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
                              const Profile& profile);
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile);
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile);
 
```
```diff
@@ -35,12 +35,28 @@ void Visit(Info& info, const IR::Inst& inst) {
         break;
     }
     case IR::Opcode::LoadSharedU16:
-    case IR::Opcode::LoadSharedU32:
-    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU16:
+        info.shared_types |= IR::Type::U16;
+        break;
+    case IR::Opcode::LoadSharedU32:
     case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicISub32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicInc32:
+    case IR::Opcode::SharedAtomicDec32:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicXor32:
+        info.shared_types |= IR::Type::U32;
+        break;
+    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU64:
-        info.uses_shared = true;
+    case IR::Opcode::SharedAtomicIAdd64:
+        info.shared_types |= IR::Type::U64;
         break;
     case IR::Opcode::ConvertF16F32:
     case IR::Opcode::ConvertF32F16:
```
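The old boolean `uses_shared` is replaced by a per-width `shared_types` bit mask, so later passes can tell which widths actually occur. A minimal, self-contained sketch of the flag-accumulation pattern (the enum here is a stand-in, not the real `IR::Type` definition):

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in flag set; the real IR::Type is an enum class with many more members.
enum TypeMask : uint32_t { Void = 0, U16 = 1u << 0, U32 = 1u << 1, U64 = 1u << 2 };

int main() {
    uint32_t shared_types = Void;
    shared_types |= U16; // e.g. after visiting LoadSharedU16
    shared_types |= U64; // e.g. after visiting SharedAtomicIAdd64
    std::printf("U16: %d, U32: %d, U64: %d\n", (shared_types & U16) != 0,
                (shared_types & U32) != 0, (shared_types & U64) != 0);
}
```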
src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp (new file, 127 lines)

```cpp
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "shader_recompiler/ir/ir_emitter.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/profile.h"

namespace Shader::Optimization {

static bool Requires16BitSharedAtomic(const IR::Inst& inst) {
    // Nothing yet
    return false;
}

static bool Requires64BitSharedAtomic(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::SharedAtomicIAdd64:
        return true;
    default:
        return false;
    }
}

static bool IsNon32BitSharedLoadStore(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::LoadSharedU16:
    case IR::Opcode::LoadSharedU64:
    case IR::Opcode::WriteSharedU16:
    case IR::Opcode::WriteSharedU64:
        return true;
    default:
        return false;
    }
}

IR::Type CalculateSpecialSharedAtomicTypes(IR::Program& program) {
    IR::Type extra_atomic_types{IR::Type::Void};
    for (IR::Block* const block : program.blocks) {
        for (IR::Inst& inst : block->Instructions()) {
            if (Requires16BitSharedAtomic(inst)) {
                extra_atomic_types |= IR::Type::U16;
            }
            if (Requires64BitSharedAtomic(inst)) {
                extra_atomic_types |= IR::Type::U64;
            }
        }
    }
    return extra_atomic_types;
}

// Simplifies down U16 and U64 shared memory operations to U32 when aliasing is not supported and
// atomics of the same type are not used.
void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile) {
    if (program.info.stage != Stage::Compute || profile.supports_workgroup_explicit_memory_layout) {
        return;
    }

    const auto atomic_types = CalculateSpecialSharedAtomicTypes(program);
    if (True(atomic_types & IR::Type::U16) && True(atomic_types & IR::Type::U64)) {
        // If both other atomic types are used, there is nothing to do.
        return;
    }

    // Iterate through shared load/store U16/U64 instructions, replacing with
    // equivalent U32 ops when the types are not needed for atomics.
    for (IR::Block* const block : program.blocks) {
        for (IR::Inst& inst : block->Instructions()) {
            if (!IsNon32BitSharedLoadStore(inst)) {
                continue;
            }
            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
            const IR::U32 offset{inst.Arg(0)};
            if (False(atomic_types & IR::Type::U16)) {
                switch (inst.GetOpcode()) {
                case IR::Opcode::LoadSharedU16: {
                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
                    const IR::U32 dword_value{ir.LoadShared(32, false, dword_offset)};
                    const IR::U32 bit_offset{
                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
                    const IR::U32 value{ir.BitFieldExtract(dword_value, bit_offset, ir.Imm32(16U))};
                    inst.ReplaceUsesWithAndRemove(ir.UConvert(16, value));
                    continue;
                }
                case IR::Opcode::WriteSharedU16: {
                    const IR::U32 value{ir.UConvert(32, IR::U16{inst.Arg(1)})};
                    const IR::U32 bit_offset{
                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
                    const IR::U32 dword_value{
                        ir.LoadShared(32, false, ir.BitwiseAnd(offset, dword_offset))};
                    const IR::U32 new_dword_value{
                        ir.BitFieldInsert(dword_value, value, bit_offset, ir.Imm32(16U))};
                    ir.WriteShared(32, new_dword_value, dword_offset);
                    inst.Invalidate();
                    continue;
                }
                default:
                    break;
                }
            }
            if (False(atomic_types & IR::Type::U64)) {
                switch (inst.GetOpcode()) {
                case IR::Opcode::LoadSharedU64: {
                    const IR::U32 value0{ir.LoadShared(32, false, offset)};
                    const IR::U32 value1{ir.LoadShared(32, false, ir.IAdd(offset, ir.Imm32(4U)))};
                    const IR::Value value{ir.PackUint2x32(ir.CompositeConstruct(value0, value1))};
                    inst.ReplaceUsesWithAndRemove(value);
                    continue;
                }
                case IR::Opcode::WriteSharedU64: {
                    const IR::Value value{ir.UnpackUint2x32(IR::U64{inst.Arg(1)})};
                    const IR::U32 value0{ir.CompositeExtract(value, 0)};
                    const IR::U32 value1{ir.CompositeExtract(value, 1)};
                    ir.WriteShared(32, value0, offset);
                    ir.WriteShared(32, value1, ir.IAdd(offset, ir.Imm32(4U)));
                    inst.Invalidate();
                    continue;
                }
                default:
                    break;
                }
            }
        }
    }
}

} // namespace Shader::Optimization
```
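To make the 16-bit lowering concrete, here is a hypothetical host-side rendering of the same arithmetic the emitted IR performs for `LoadSharedU16` (the function and the dword-array view of LDS are illustrative, not part of the pass):

```cpp
#include <cstdint>

// What the emitted IR computes for a 16-bit shared load at byte `offset`:
// load the containing dword, then extract the upper or lower half.
uint16_t load_shared_u16(const uint32_t* lds_dwords, uint32_t offset) {
    const uint32_t dword_offset = offset & ~3u;                // BitwiseAnd(offset, ~3)
    const uint32_t dword_value = lds_dwords[dword_offset / 4]; // LoadShared(32, ...)
    const uint32_t bit_offset = (offset & 2u) * 8u;            // 0 or 16
    return static_cast<uint16_t>(dword_value >> bit_offset);   // BitFieldExtract(..., 16)
}
```

`WriteSharedU16` is the mirror image: load the containing dword, `BitFieldInsert` the new half, and write the dword back.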
```diff
@@ -34,20 +34,74 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     }
 }
 
+IR::Type CalculateSharedMemoryTypes(IR::Program& program) {
+    IR::Type used_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU16:
+            case IR::Opcode::WriteSharedU16:
+                used_types |= IR::Type::U16;
+                break;
+            case IR::Opcode::LoadSharedU32:
+            case IR::Opcode::WriteSharedU32:
+            case IR::Opcode::SharedAtomicIAdd32:
+            case IR::Opcode::SharedAtomicISub32:
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32:
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32:
+            case IR::Opcode::SharedAtomicInc32:
+            case IR::Opcode::SharedAtomicDec32:
+            case IR::Opcode::SharedAtomicAnd32:
+            case IR::Opcode::SharedAtomicOr32:
+            case IR::Opcode::SharedAtomicXor32:
+                used_types |= IR::Type::U32;
+                break;
+            case IR::Opcode::LoadSharedU64:
+            case IR::Opcode::WriteSharedU64:
+            case IR::Opcode::SharedAtomicIAdd64:
+                used_types |= IR::Type::U64;
+                break;
+            default:
+                break;
+            }
+        }
+    }
+    return used_types;
+}
+
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile) {
     if (program.info.stage != Stage::Compute) {
         return;
     }
-    // Only perform the transform if there is shared memory and either host shared memory is
-    // insufficient or the device does not support VK_KHR_workgroup_memory_explicit_layout
+    // Run this pass if:
+    // * There are shared memory instructions.
+    // * One of the following is true:
+    //   * Requested shared memory size is too large for the host shared memory.
+    //   * Workgroup explicit memory is not supported and multiple shared memory types are used.
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    if (shared_memory_size == 0 || (shared_memory_size <= profile.max_shared_memory_size &&
-                                    profile.supports_workgroup_explicit_memory_layout)) {
+    const auto used_types = CalculateSharedMemoryTypes(program);
+    if (used_types == IR::Type::Void || (shared_memory_size <= profile.max_shared_memory_size &&
+                                         (profile.supports_workgroup_explicit_memory_layout ||
+                                          std::popcount(static_cast<u32>(used_types)) == 1))) {
         return;
     }
 
-    IR::Type used_types{};
+    // Add a buffer binding for shared memory storage buffer.
+    const u32 binding = static_cast<u32>(program.info.buffers.size());
+    program.info.buffers.push_back({
+        .used_types = used_types,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
+
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (!IsSharedAccess(inst)) {
```
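The new early-out folds the collected type information in: even without explicit-layout support, the storage-buffer transform is unnecessary when only a single shared type is used. A small sketch of the same predicate over a plain bit mask (the function and parameter names are illustrative):

```cpp
#include <bit>
#include <cstdint>

// Mirrors the pass's early-out: skip when nothing uses shared memory, or when
// the size fits on the host and either explicit layout is supported or only a
// single shared type is used (popcount == 1 means no aliasing is needed).
bool needs_buffer_pass(uint32_t used_types_mask, uint32_t shared_memory_size,
                       uint32_t max_shared_memory_size, bool explicit_layout) {
    if (used_types_mask == 0) {
        return false;
    }
    const bool fits = shared_memory_size <= max_shared_memory_size;
    const bool single_type = std::popcount(used_types_mask) == 1;
    return !(fits && (explicit_layout || single_type));
}
```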
```diff
@@ -58,29 +112,21 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
             const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
                                            ir.Imm32(shared_memory_size));
             const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
             // Replace shared atomics first
             switch (inst.GetOpcode()) {
             case IR::Opcode::SharedAtomicIAdd32:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicIAdd64:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U64;
                 continue;
             case IR::Opcode::SharedAtomicISub32:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicSMin32:
             case IR::Opcode::SharedAtomicUMin32: {
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicSMax32:
```
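Each workgroup addresses a disjoint `shared_memory_size`-byte slice of the storage buffer; the `WorkgroupIndex * shared_memory_size + offset` computation above is, in plain scalar form (illustrative only):

```cpp
#include <cstdint>

// Byte address of a shared-memory access once redirected to the storage
// buffer: workgroups are laid out back to back, each owning one slice.
uint32_t buffer_address(uint32_t workgroup_index, uint32_t shared_memory_size,
                        uint32_t offset) {
    return workgroup_index * shared_memory_size + offset;
}
```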
```diff
@@ -88,73 +134,49 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicInc32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicDec32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicAnd32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicOr32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicXor32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             default:
                 break;
             }
             // Replace shared operations.
             switch (inst.GetOpcode()) {
             case IR::Opcode::LoadSharedU16:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::LoadSharedU32:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::LoadSharedU64:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {}));
-                used_types |= IR::Type::U64;
                 break;
             case IR::Opcode::WriteSharedU16:
                 ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::WriteSharedU32:
                 ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
                 inst.Invalidate();
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::WriteSharedU64:
                 ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U64;
                 break;
             default:
                 break;
             }
         }
     }
-    // Add buffer binding for shared memory storage buffer.
-    program.info.buffers.push_back({
-        .used_types = used_types,
-        .inline_cbuf = AmdGpu::Buffer::Null(),
-        .buffer_type = BufferType::SharedMemory,
-        .is_written = true,
-    });
 }
 
 } // namespace Shader::Optimization
```
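Design note: the shared-memory buffer binding is now reserved before the rewrite loop, with `used_types` precomputed by CalculateSharedMemoryTypes, instead of being appended afterwards from types accumulated during the rewrite; the per-case `used_types |=` bookkeeping removed above is exactly that old accumulation.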