shader_recompiler: Various fixes to shared memory and atomics. (#3075)

* shader_recompiler: Various fixes to shared memory and atomics.

* shader_recompiler: Re-type non-32bit load/stores.
This commit is contained in:
squidbus 2025-06-10 15:41:58 -07:00 committed by GitHub
parent b49340dff8
commit ca92e72efe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 391 additions and 227 deletions

View file

@ -353,12 +353,12 @@ U32 IREmitter::SharedAtomicXor(const U32& address, const U32& data) {
return Inst<U32>(Opcode::SharedAtomicXor32, address, data);
}
U32 IREmitter::SharedAtomicIIncrement(const U32& address) {
return Inst<U32>(Opcode::SharedAtomicIIncrement32, address);
U32 IREmitter::SharedAtomicInc(const U32& address) {
return Inst<U32>(Opcode::SharedAtomicInc32, address);
}
U32 IREmitter::SharedAtomicIDecrement(const U32& address) {
return Inst<U32>(Opcode::SharedAtomicIDecrement32, address);
U32 IREmitter::SharedAtomicDec(const U32& address) {
return Inst<U32>(Opcode::SharedAtomicDec32, address);
}
U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) {
@ -373,12 +373,12 @@ U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {
return Inst<U32>(Opcode::ReadConstBuffer, handle, index);
}
U32 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) {
return Inst<U32>(Opcode::LoadBufferU8, Flags{info}, handle, address);
U8 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) {
return Inst<U8>(Opcode::LoadBufferU8, Flags{info}, handle, address);
}
U32 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) {
return Inst<U32>(Opcode::LoadBufferU16, Flags{info}, handle, address);
U16 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) {
return Inst<U16>(Opcode::LoadBufferU16, Flags{info}, handle, address);
}
Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& address,
@ -397,6 +397,10 @@ Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value&
}
}
/// Emits a LoadBufferU64 instruction for `handle` at `address`.
/// `info` is attached to the instruction as its flags; the result is typed as U64.
U64 IREmitter::LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info) {
    U64 loaded = Inst<U64>(Opcode::LoadBufferU64, Flags{info}, handle, address);
    return loaded;
}
Value IREmitter::LoadBufferF32(int num_dwords, const Value& handle, const Value& address,
BufferInstInfo info) {
switch (num_dwords) {
@ -417,12 +421,12 @@ Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, Buf
return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
}
void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U32& data,
void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U8& data,
BufferInstInfo info) {
Inst(Opcode::StoreBufferU8, Flags{info}, handle, address, data);
}
void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U32& data,
void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U16& data,
BufferInstInfo info) {
Inst(Opcode::StoreBufferU16, Flags{info}, handle, address, data);
}
@ -447,6 +451,11 @@ void IREmitter::StoreBufferU32(int num_dwords, const Value& handle, const Value&
}
}
/// Emits a StoreBufferU64 instruction writing `data` through `handle` at `address`.
/// `info` is attached to the instruction as its flags; nothing is returned to the caller.
void IREmitter::StoreBufferU64(const Value& handle, const Value& address, const U64& data,
                               BufferInstInfo info) {
    auto inst_flags = Flags{info};
    Inst(Opcode::StoreBufferU64, inst_flags, handle, address, data);
}
void IREmitter::StoreBufferF32(int num_dwords, const Value& handle, const Value& address,
const Value& data, BufferInstInfo info) {
switch (num_dwords) {
@ -474,7 +483,19 @@ void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, con
/// Emits a buffer atomic integer add, selecting the opcode from the operand type:
/// U32 operands emit BufferAtomicIAdd32 and U64 operands emit BufferAtomicIAdd64.
/// Any other operand type is rejected through ThrowInvalidType.
/// `info` is attached to the emitted instruction as its flags.
Value IREmitter::BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value,
                                  BufferInstInfo info) {
    // Dispatch on the operand width so 64-bit adds get their own opcode.
    switch (value.Type()) {
    case Type::U32:
        return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value);
    case Type::U64:
        return Inst(Opcode::BufferAtomicIAdd64, Flags{info}, handle, address, value);
    default:
        ThrowInvalidType(value.Type());
    }
}
/// Emits a BufferAtomicISub32 instruction on `handle` at `address` with operand `value`.
/// Only the 32-bit opcode is emitted here; `info` is attached as the instruction flags.
Value IREmitter::BufferAtomicISub(const Value& handle, const Value& address, const Value& value,
                                  BufferInstInfo info) {
    Value result = Inst(Opcode::BufferAtomicISub32, Flags{info}, handle, address, value);
    return result;
}
Value IREmitter::BufferAtomicIMin(const Value& handle, const Value& address, const Value& value,
@ -489,14 +510,12 @@ Value IREmitter::BufferAtomicIMax(const Value& handle, const Value& address, con
: Inst(Opcode::BufferAtomicUMax32, Flags{info}, handle, address, value);
}
Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, const Value& value,
BufferInstInfo info) {
return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address, value);
Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, BufferInstInfo info) {
return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address);
}
Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, const Value& value,
BufferInstInfo info) {
return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address, value);
Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, BufferInstInfo info) {
return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address);
}
Value IREmitter::BufferAtomicAnd(const Value& handle, const Value& address, const Value& value,
@ -1804,8 +1823,15 @@ F32F64 IREmitter::ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_s
: ConvertUToF(dest_bitsize, src_bitsize, value);
}
U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) {
U8U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U8U16U32U64& value) {
switch (result_bitsize) {
case 8:
switch (value.Type()) {
case Type::U32:
return Inst<U8>(Opcode::ConvertU8U32, value);
default:
break;
}
case 16:
switch (value.Type()) {
case Type::U32:
@ -1815,6 +1841,8 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) {
}
case 32:
switch (value.Type()) {
case Type::U8:
return Inst<U32>(Opcode::ConvertU32U8, value);
case Type::U16:
return Inst<U32>(Opcode::ConvertU32U16, value);
default:

View file

@ -100,33 +100,35 @@ public:
void WriteShared(int bit_size, const Value& value, const U32& offset);
[[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data);
[[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data);
[[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed);
[[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed);
[[nodiscard]] U32 SharedAtomicInc(const U32& address);
[[nodiscard]] U32 SharedAtomicDec(const U32& address);
[[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data);
[[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data);
[[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data);
[[nodiscard]] U32 SharedAtomicIIncrement(const U32& address);
[[nodiscard]] U32 SharedAtomicIDecrement(const U32& address);
[[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data);
[[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
[[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index);
[[nodiscard]] U32 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info);
[[nodiscard]] U32 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info);
[[nodiscard]] U8 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info);
[[nodiscard]] U16 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info);
[[nodiscard]] Value LoadBufferU32(int num_dwords, const Value& handle, const Value& address,
BufferInstInfo info);
[[nodiscard]] U64 LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info);
[[nodiscard]] Value LoadBufferF32(int num_dwords, const Value& handle, const Value& address,
BufferInstInfo info);
[[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address,
BufferInstInfo info);
void StoreBufferU8(const Value& handle, const Value& address, const U32& data,
void StoreBufferU8(const Value& handle, const Value& address, const U8& data,
BufferInstInfo info);
void StoreBufferU16(const Value& handle, const Value& address, const U32& data,
void StoreBufferU16(const Value& handle, const Value& address, const U16& data,
BufferInstInfo info);
void StoreBufferU32(int num_dwords, const Value& handle, const Value& address,
const Value& data, BufferInstInfo info);
void StoreBufferU64(const Value& handle, const Value& address, const U64& data,
BufferInstInfo info);
void StoreBufferF32(int num_dwords, const Value& handle, const Value& address,
const Value& data, BufferInstInfo info);
void StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
@ -134,14 +136,16 @@ public:
[[nodiscard]] Value BufferAtomicIAdd(const Value& handle, const Value& address,
const Value& value, BufferInstInfo info);
[[nodiscard]] Value BufferAtomicISub(const Value& handle, const Value& address,
const Value& value, BufferInstInfo info);
[[nodiscard]] Value BufferAtomicIMin(const Value& handle, const Value& address,
const Value& value, bool is_signed, BufferInstInfo info);
[[nodiscard]] Value BufferAtomicIMax(const Value& handle, const Value& address,
const Value& value, bool is_signed, BufferInstInfo info);
[[nodiscard]] Value BufferAtomicInc(const Value& handle, const Value& address,
const Value& value, BufferInstInfo info);
BufferInstInfo info);
[[nodiscard]] Value BufferAtomicDec(const Value& handle, const Value& address,
const Value& value, BufferInstInfo info);
BufferInstInfo info);
[[nodiscard]] Value BufferAtomicAnd(const Value& handle, const Value& address,
const Value& value, BufferInstInfo info);
[[nodiscard]] Value BufferAtomicOr(const Value& handle, const Value& address,
@ -309,7 +313,7 @@ public:
[[nodiscard]] F32F64 ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_signed,
const Value& value);
[[nodiscard]] U16U32U64 UConvert(size_t result_bitsize, const U16U32U64& value);
[[nodiscard]] U8U16U32U64 UConvert(size_t result_bitsize, const U8U16U32U64& value);
[[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value);
[[nodiscard]] Value ImageAtomicIAdd(const Value& handle, const Value& coords,

View file

@ -60,12 +60,15 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::StoreBufferU32x2:
case Opcode::StoreBufferU32x3:
case Opcode::StoreBufferU32x4:
case Opcode::StoreBufferU64:
case Opcode::StoreBufferF32:
case Opcode::StoreBufferF32x2:
case Opcode::StoreBufferF32x3:
case Opcode::StoreBufferF32x4:
case Opcode::StoreBufferFormatF32:
case Opcode::BufferAtomicIAdd32:
case Opcode::BufferAtomicIAdd64:
case Opcode::BufferAtomicISub32:
case Opcode::BufferAtomicSMin32:
case Opcode::BufferAtomicUMin32:
case Opcode::BufferAtomicSMax32:
@ -76,15 +79,21 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::BufferAtomicOr32:
case Opcode::BufferAtomicXor32:
case Opcode::BufferAtomicSwap32:
case Opcode::BufferAtomicCmpSwap32:
case Opcode::DataAppend:
case Opcode::DataConsume:
case Opcode::WriteSharedU64:
case Opcode::WriteSharedU16:
case Opcode::WriteSharedU32:
case Opcode::WriteSharedU64:
case Opcode::SharedAtomicIAdd32:
case Opcode::SharedAtomicIAdd64:
case Opcode::SharedAtomicISub32:
case Opcode::SharedAtomicSMin32:
case Opcode::SharedAtomicUMin32:
case Opcode::SharedAtomicSMax32:
case Opcode::SharedAtomicUMax32:
case Opcode::SharedAtomicInc32:
case Opcode::SharedAtomicDec32:
case Opcode::SharedAtomicAnd32:
case Opcode::SharedAtomicOr32:
case Opcode::SharedAtomicXor32:

View file

@ -35,21 +35,21 @@ OPCODE(LoadSharedU32, U32, U32,
OPCODE(LoadSharedU64, U64, U32, )
OPCODE(WriteSharedU16, Void, U32, U16, )
OPCODE(WriteSharedU32, Void, U32, U32, )
OPCODE(WriteSharedU64, Void, U32, U64, )
OPCODE(WriteSharedU64, Void, U32, U64, )
// Shared atomic operations
OPCODE(SharedAtomicIAdd32, U32, U32, U32, )
OPCODE(SharedAtomicIAdd64, U64, U32, U64, )
OPCODE(SharedAtomicISub32, U32, U32, U32, )
OPCODE(SharedAtomicSMin32, U32, U32, U32, )
OPCODE(SharedAtomicUMin32, U32, U32, U32, )
OPCODE(SharedAtomicSMax32, U32, U32, U32, )
OPCODE(SharedAtomicUMax32, U32, U32, U32, )
OPCODE(SharedAtomicInc32, U32, U32, )
OPCODE(SharedAtomicDec32, U32, U32, )
OPCODE(SharedAtomicAnd32, U32, U32, U32, )
OPCODE(SharedAtomicOr32, U32, U32, U32, )
OPCODE(SharedAtomicXor32, U32, U32, U32, )
OPCODE(SharedAtomicISub32, U32, U32, U32, )
OPCODE(SharedAtomicIIncrement32, U32, U32, )
OPCODE(SharedAtomicIDecrement32, U32, U32, )
// Context getters/setters
OPCODE(GetUserData, U32, ScalarReg, )
@ -94,23 +94,25 @@ OPCODE(UndefU32, U32,
OPCODE(UndefU64, U64, )
// Buffer operations
OPCODE(LoadBufferU8, U32, Opaque, Opaque, )
OPCODE(LoadBufferU16, U32, Opaque, Opaque, )
OPCODE(LoadBufferU8, U8, Opaque, Opaque, )
OPCODE(LoadBufferU16, U16, Opaque, Opaque, )
OPCODE(LoadBufferU32, U32, Opaque, Opaque, )
OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, )
OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, )
OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, )
OPCODE(LoadBufferU64, U64, Opaque, Opaque, )
OPCODE(LoadBufferF32, F32, Opaque, Opaque, )
OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, )
OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, )
OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, )
OPCODE(StoreBufferU8, Void, Opaque, Opaque, U32, )
OPCODE(StoreBufferU16, Void, Opaque, Opaque, U32, )
OPCODE(StoreBufferU8, Void, Opaque, Opaque, U8, )
OPCODE(StoreBufferU16, Void, Opaque, Opaque, U16, )
OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, )
OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, )
OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, )
OPCODE(StoreBufferU32x4, Void, Opaque, Opaque, U32x4, )
OPCODE(StoreBufferU64, Void, Opaque, Opaque, U64, )
OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, )
OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, )
OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, )
@ -120,12 +122,13 @@ OPCODE(StoreBufferFormatF32, Void, Opaq
// Buffer atomic operations
OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 )
OPCODE(BufferAtomicISub32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicUMax32, U32, Opaque, Opaque, U32 )
OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, )
OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, )
OPCODE(BufferAtomicAnd32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicOr32, U32, Opaque, Opaque, U32, )
OPCODE(BufferAtomicXor32, U32, Opaque, Opaque, U32, )
@ -405,6 +408,8 @@ OPCODE(ConvertF64U32, F64, U32,
OPCODE(ConvertF32U16, F32, U16, )
OPCODE(ConvertU16U32, U16, U32, )
OPCODE(ConvertU32U16, U32, U16, )
OPCODE(ConvertU8U32, U8, U32, )
OPCODE(ConvertU32U8, U32, U8, )
// Image operations
OPCODE(ImageSampleRaw, F32x4, Opaque, F32x4, F32x4, F32x4, F32, )

View file

@ -438,7 +438,9 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2;
const IR::U32 addr{inst.Arg(0)};
const IR::U32 data{inst.Arg(1).Resolve()};
const IR::Value data = num_dwords == 2
? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()})
: inst.Arg(1).Resolve();
const auto SetOutput = [&](IR::U32 addr, IR::U32 value, AttributeRegion output_kind,
u32 off_dw) {
@ -466,10 +468,10 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info);
if (num_dwords == 1) {
SetOutput(addr, data, region, 0);
SetOutput(addr, IR::U32{data}, region, 0);
} else {
for (auto i = 0; i < num_dwords; i++) {
SetOutput(addr, IR::U32{data.Inst()->Arg(i)}, region, i);
SetOutput(addr, IR::U32{ir.CompositeExtract(data, i)}, region, i);
}
}
inst.Invalidate();
@ -499,7 +501,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
ReadTessControlPointAttribute(addr, stride, ir, i, is_tcs_output_read);
read_components.push_back(ir.BitCast<IR::U32>(component));
}
attr_read = ir.CompositeConstruct(read_components);
attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components));
}
inst.ReplaceUsesWithAndRemove(attr_read);
break;
@ -578,7 +580,7 @@ void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
const IR::F32 component = GetInput(addr, i);
read_components.push_back(ir.BitCast<IR::U32>(component));
}
attr_read = ir.CompositeConstruct(read_components);
attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components));
}
inst.ReplaceUsesWithAndRemove(attr_read);
break;

View file

@ -34,13 +34,13 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con
interpreted = ir.Imm32(0.f);
break;
case AmdGpu::DataFormat::Format8: {
const auto unpacked =
ir.Unpack4x8(format_info.num_format, ir.LoadBufferU8(handle, address, info));
const auto raw = ir.UConvert(32, ir.LoadBufferU8(handle, address, info));
const auto unpacked = ir.Unpack4x8(format_info.num_format, raw);
interpreted = ir.CompositeExtract(unpacked, 0);
break;
}
case AmdGpu::DataFormat::Format8_8: {
const auto raw = ir.LoadBufferU16(handle, address, info);
const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info));
const auto unpacked = ir.Unpack4x8(format_info.num_format, raw);
interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0),
ir.CompositeExtract(unpacked, 1));
@ -51,8 +51,8 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con
IR::U32{ir.LoadBufferU32(1, handle, address, info)});
break;
case AmdGpu::DataFormat::Format16: {
const auto unpacked =
ir.Unpack2x16(format_info.num_format, ir.LoadBufferU16(handle, address, info));
const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info));
const auto unpacked = ir.Unpack2x16(format_info.num_format, raw);
interpreted = ir.CompositeExtract(unpacked, 0);
break;
}
@ -126,7 +126,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I
const auto packed =
ir.Pack4x8(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f),
ir.Imm32(0.f), ir.Imm32(0.f)));
ir.StoreBufferU8(handle, address, packed, info);
ir.StoreBufferU8(handle, address, ir.UConvert(8, packed), info);
break;
}
case AmdGpu::DataFormat::Format8_8: {
@ -134,7 +134,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I
ir.CompositeConstruct(ir.CompositeExtract(real_value, 0),
ir.CompositeExtract(real_value, 1),
ir.Imm32(0.f), ir.Imm32(0.f)));
ir.StoreBufferU16(handle, address, packed, info);
ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info);
break;
}
case AmdGpu::DataFormat::Format8_8_8_8: {
@ -145,7 +145,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I
case AmdGpu::DataFormat::Format16: {
const auto packed =
ir.Pack2x16(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f)));
ir.StoreBufferU16(handle, address, packed, info);
ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info);
break;
}
case AmdGpu::DataFormat::Format16_16: {

View file

@ -17,6 +17,8 @@ using SharpLocation = u32;
bool IsBufferAtomic(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::BufferAtomicIAdd32:
case IR::Opcode::BufferAtomicIAdd64:
case IR::Opcode::BufferAtomicISub32:
case IR::Opcode::BufferAtomicSMin32:
case IR::Opcode::BufferAtomicUMin32:
case IR::Opcode::BufferAtomicSMax32:
@ -27,6 +29,7 @@ bool IsBufferAtomic(const IR::Inst& inst) {
case IR::Opcode::BufferAtomicOr32:
case IR::Opcode::BufferAtomicXor32:
case IR::Opcode::BufferAtomicSwap32:
case IR::Opcode::BufferAtomicCmpSwap32:
return true;
default:
return false;
@ -41,6 +44,7 @@ bool IsBufferStore(const IR::Inst& inst) {
case IR::Opcode::StoreBufferU32x2:
case IR::Opcode::StoreBufferU32x3:
case IR::Opcode::StoreBufferU32x4:
case IR::Opcode::StoreBufferU64:
case IR::Opcode::StoreBufferF32:
case IR::Opcode::StoreBufferF32x2:
case IR::Opcode::StoreBufferF32x3:
@ -60,6 +64,7 @@ bool IsBufferInstruction(const IR::Inst& inst) {
case IR::Opcode::LoadBufferU32x2:
case IR::Opcode::LoadBufferU32x3:
case IR::Opcode::LoadBufferU32x4:
case IR::Opcode::LoadBufferU64:
case IR::Opcode::LoadBufferF32:
case IR::Opcode::LoadBufferF32x2:
case IR::Opcode::LoadBufferF32x3:
@ -85,6 +90,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
case IR::Opcode::LoadBufferU16:
case IR::Opcode::StoreBufferU16:
return IR::Type::U16;
case IR::Opcode::LoadBufferU64:
case IR::Opcode::StoreBufferU64:
case IR::Opcode::BufferAtomicIAdd64:
return IR::Type::U64;
case IR::Opcode::LoadBufferFormatF32:
case IR::Opcode::StoreBufferFormatF32:
// Formatted buffer loads can use a variety of types.

View file

@ -9,12 +9,14 @@
namespace Shader::Optimization {
/// Returns true when `inst` is a shared-memory load of any supported width
/// (LoadSharedU16, LoadSharedU32 or LoadSharedU64).
static bool IsLoadShared(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::LoadSharedU16:
    case IR::Opcode::LoadSharedU32:
    case IR::Opcode::LoadSharedU64:
        return true;
    default:
        return false;
    }
}
/// Returns true when `inst` is a shared-memory store of any supported width
/// (WriteSharedU16, WriteSharedU32 or WriteSharedU64).
static bool IsWriteShared(const IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::WriteSharedU16:
    case IR::Opcode::WriteSharedU32:
    case IR::Opcode::WriteSharedU64:
        return true;
    default:
        return false;
    }
}

View file

@ -10,18 +10,23 @@ namespace Shader::Optimization {
static bool IsSharedAccess(const IR::Inst& inst) {
const auto opcode = inst.GetOpcode();
switch (opcode) {
case IR::Opcode::LoadSharedU16:
case IR::Opcode::LoadSharedU32:
case IR::Opcode::LoadSharedU64:
case IR::Opcode::WriteSharedU16:
case IR::Opcode::WriteSharedU32:
case IR::Opcode::WriteSharedU64:
case IR::Opcode::SharedAtomicAnd32:
case IR::Opcode::SharedAtomicIAdd32:
case IR::Opcode::SharedAtomicIAdd64:
case IR::Opcode::SharedAtomicOr32:
case IR::Opcode::SharedAtomicSMax32:
case IR::Opcode::SharedAtomicUMax32:
case IR::Opcode::SharedAtomicISub32:
case IR::Opcode::SharedAtomicSMin32:
case IR::Opcode::SharedAtomicUMin32:
case IR::Opcode::SharedAtomicSMax32:
case IR::Opcode::SharedAtomicUMax32:
case IR::Opcode::SharedAtomicInc32:
case IR::Opcode::SharedAtomicDec32:
case IR::Opcode::SharedAtomicAnd32:
case IR::Opcode::SharedAtomicOr32:
case IR::Opcode::SharedAtomicXor32:
return true;
default:
@ -41,14 +46,8 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
profile.supports_workgroup_explicit_memory_layout)) {
return;
}
// Add buffer binding for shared memory storage buffer.
const u32 binding = static_cast<u32>(program.info.buffers.size());
program.info.buffers.push_back({
.used_types = IR::Type::U32,
.inline_cbuf = AmdGpu::Buffer::Null(),
.buffer_type = BufferType::SharedMemory,
.is_written = true,
});
IR::Type used_types{};
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
if (!IsSharedAccess(inst)) {
@ -56,73 +55,106 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
}
IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
const IR::U32 handle = ir.Imm32(binding);
const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
ir.Imm32(shared_memory_size));
const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
// Replace shared atomics first
switch (inst.GetOpcode()) {
case IR::Opcode::SharedAtomicAnd32:
inst.ReplaceUsesWithAndRemove(
ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {}));
continue;
case IR::Opcode::SharedAtomicIAdd32:
inst.ReplaceUsesWithAndRemove(
ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
used_types |= IR::Type::U32;
continue;
case IR::Opcode::SharedAtomicIAdd64:
inst.ReplaceUsesWithAndRemove(
ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {}));
ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
used_types |= IR::Type::U64;
continue;
case IR::Opcode::SharedAtomicOr32:
case IR::Opcode::SharedAtomicISub32:
inst.ReplaceUsesWithAndRemove(
ir.BufferAtomicOr(handle, inst.Arg(0), inst.Arg(1), {}));
ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
used_types |= IR::Type::U32;
continue;
case IR::Opcode::SharedAtomicSMax32:
case IR::Opcode::SharedAtomicUMax32: {
const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
inst.ReplaceUsesWithAndRemove(
ir.BufferAtomicIMax(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
continue;
}
case IR::Opcode::SharedAtomicSMin32:
case IR::Opcode::SharedAtomicUMin32: {
const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
inst.ReplaceUsesWithAndRemove(
ir.BufferAtomicIMin(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
used_types |= IR::Type::U32;
continue;
}
case IR::Opcode::SharedAtomicXor32:
case IR::Opcode::SharedAtomicSMax32:
case IR::Opcode::SharedAtomicUMax32: {
const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
inst.ReplaceUsesWithAndRemove(
ir.BufferAtomicXor(handle, inst.Arg(0), inst.Arg(1), {}));
ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
used_types |= IR::Type::U32;
continue;
}
case IR::Opcode::SharedAtomicInc32:
inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {}));
used_types |= IR::Type::U32;
continue;
case IR::Opcode::SharedAtomicDec32:
inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
used_types |= IR::Type::U32;
continue;
case IR::Opcode::SharedAtomicAnd32:
inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
used_types |= IR::Type::U32;
continue;
case IR::Opcode::SharedAtomicOr32:
inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
used_types |= IR::Type::U32;
continue;
case IR::Opcode::SharedAtomicXor32:
inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
used_types |= IR::Type::U32;
continue;
default:
break;
}
// Replace shared operations.
const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
ir.Imm32(shared_memory_size));
const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
switch (inst.GetOpcode()) {
case IR::Opcode::LoadSharedU16:
inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
used_types |= IR::Type::U16;
break;
case IR::Opcode::LoadSharedU32:
inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
used_types |= IR::Type::U32;
break;
case IR::Opcode::LoadSharedU64:
inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {}));
inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {}));
used_types |= IR::Type::U64;
break;
case IR::Opcode::WriteSharedU16:
ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {});
ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {});
inst.Invalidate();
used_types |= IR::Type::U16;
break;
case IR::Opcode::WriteSharedU32:
ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
inst.Invalidate();
used_types |= IR::Type::U32;
break;
case IR::Opcode::WriteSharedU64:
ir.StoreBufferU32(2, handle, address, inst.Arg(1), {});
ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {});
inst.Invalidate();
used_types |= IR::Type::U64;
break;
default:
break;
}
}
}
// Add buffer binding for shared memory storage buffer.
program.info.buffers.push_back({
.used_types = used_types,
.inline_cbuf = AmdGpu::Buffer::Null(),
.buffer_type = BufferType::SharedMemory,
.is_written = true,
});
}
} // namespace Shader::Optimization

View file

@ -265,6 +265,7 @@ using U32F32 = TypedValue<Type::U32 | Type::F32>;
using U64F64 = TypedValue<Type::U64 | Type::F64>;
using U32U64 = TypedValue<Type::U32 | Type::U64>;
using U16U32U64 = TypedValue<Type::U16 | Type::U32 | Type::U64>;
using U8U16U32U64 = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>;
using F32F64 = TypedValue<Type::F32 | Type::F64>;
using F16F32F64 = TypedValue<Type::F16 | Type::F32 | Type::F64>;
using UAny = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>;