diff --git a/CMakeLists.txt b/CMakeLists.txt index 8837a6584..c1ec7b7b9 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -753,6 +753,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/ir/passes/hull_shader_transform.cpp src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h + src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp src/shader_recompiler/ir/passes/resource_tracking_pass.cpp src/shader_recompiler/ir/passes/ring_access_elimination.cpp diff --git a/externals/sirit b/externals/sirit index d6f3c0d99..8b9b12c20 160000 --- a/externals/sirit +++ b/externals/sirit @@ -1 +1 @@ -Subproject commit d6f3c0d99862ab2ff8f95e9ac221560f1f97e29a +Subproject commit 8b9b12c2089505ac8b10fa56bf56b3ed49d9d7b0 diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index f0cf15af0..3712380f5 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -250,7 +250,7 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct ctx.AddCapability(spv::Capability::Float64); } ctx.AddCapability(spv::Capability::Int64); - if (info.has_storage_images || info.has_image_buffers) { + if (info.has_storage_images) { ctx.AddCapability(spv::Capability::StorageImageExtendedFormats); ctx.AddCapability(spv::Capability::StorageImageReadWithoutFormat); ctx.AddCapability(spv::Capability::StorageImageWriteWithoutFormat); @@ -259,12 +259,6 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct ctx.AddCapability(spv::Capability::ImageReadWriteLodAMD); } } - if (info.has_texel_buffers) { - ctx.AddCapability(spv::Capability::SampledBuffer); - } - if (info.has_image_buffers) { - ctx.AddCapability(spv::Capability::ImageBuffer); - } if (info.has_image_gather) { ctx.AddCapability(spv::Capability::ImageGatherExtended); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp index 539c6cb81..56a6abc05 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bitwise_conversion.cpp @@ -6,6 +6,56 @@ namespace Shader::Backend::SPIRV { +struct R { + R(u32 a, u32 b) : offset(a), size(b) {} + u32 offset; + u32 size; +}; +template +static std::array ExtractBitFields(EmitContext& ctx, const Id value, + const Args... args) { + const auto op_func = + is_signed ? &EmitContext::OpBitFieldSExtract : &EmitContext::OpBitFieldUExtract; + std::array result{}; + u32 i = 0; + ( + [&] { + result[i++] = (ctx.*op_func)(ctx.U32[1], value, ctx.ConstU32(args.offset), + ctx.ConstU32(args.size)); + }(), + ...); + return result; +} + +template +static Id InsertBitFields(EmitContext& ctx, const std::initializer_list values, + const Args... 
args) { + Id result{}; + auto it = values.begin(); + ( + [&] { + if (it == values.begin()) { + result = *it; + } else { + result = ctx.OpBitFieldInsert(ctx.U32[1], result, *it, ctx.ConstU32(args.offset), + ctx.ConstU32(args.size)); + } + ++it; + }(), + ...); + return result; +} + +template +static std::array ExtractComposite(EmitContext& ctx, const VectorIds type, + const Id value) { + std::array result{}; + for (u32 i = 0; i < num_components; i++) { + result[i] = ctx.OpCompositeExtract(type[1], value, i); + } + return result; +} + Id EmitBitCastU16F16(EmitContext& ctx, Id value) { return ctx.OpBitcast(ctx.U16, value); } @@ -42,22 +92,6 @@ Id EmitPackFloat2x32(EmitContext& ctx, Id value) { return ctx.OpBitcast(ctx.F64[1], value); } -Id EmitPackFloat2x16(EmitContext& ctx, Id value) { - return ctx.OpBitcast(ctx.U32[1], value); -} - -Id EmitUnpackFloat2x16(EmitContext& ctx, Id value) { - return ctx.OpBitcast(ctx.F16[2], value); -} - -Id EmitPackHalf2x16(EmitContext& ctx, Id value) { - return ctx.OpPackHalf2x16(ctx.U32[1], value); -} - -Id EmitUnpackHalf2x16(EmitContext& ctx, Id value) { - return ctx.OpUnpackHalf2x16(ctx.F32[2], value); -} - Id EmitPackUnorm2x16(EmitContext& ctx, Id value) { return ctx.OpPackUnorm2x16(ctx.U32[1], value); } @@ -75,31 +109,157 @@ Id EmitUnpackSnorm2x16(EmitContext& ctx, Id value) { } Id EmitPackUint2x16(EmitContext& ctx, Id value) { - // No SPIR-V instruction for this, do it manually. - const auto x{ctx.OpCompositeExtract(ctx.U32[1], value, 0)}; - const auto y{ctx.OpCompositeExtract(ctx.U32[1], value, 1)}; - return ctx.OpBitFieldInsert(ctx.U32[1], x, y, ctx.ConstU32(16U), ctx.ConstU32(16U)); + const auto unpacked{ctx.OpBitcast(ctx.U32[2], value)}; + const auto [x, y] = ExtractComposite<2>(ctx, ctx.U32, unpacked); + return InsertBitFields(ctx, {x, y}, R(0, 16), R(16, 16)); } Id EmitUnpackUint2x16(EmitContext& ctx, Id value) { - // No SPIR-V instruction for this, do it manually. - const auto x{ctx.OpBitFieldUExtract(ctx.U32[1], value, ctx.ConstU32(0U), ctx.ConstU32(16U))}; - const auto y{ctx.OpBitFieldUExtract(ctx.U32[1], value, ctx.ConstU32(16U), ctx.ConstU32(16U))}; - return ctx.OpCompositeConstruct(ctx.U32[2], x, y); + const auto [x, y] = ExtractBitFields(ctx, value, R(0, 16), R(16, 16)); + const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[2], x, y)}; + return ctx.OpBitcast(ctx.F32[2], unpacked); } Id EmitPackSint2x16(EmitContext& ctx, Id value) { - // No SPIR-V instruction for this, do it manually. - const auto x{ctx.OpCompositeExtract(ctx.U32[1], value, 0)}; - const auto y{ctx.OpCompositeExtract(ctx.U32[1], value, 1)}; - return ctx.OpBitFieldInsert(ctx.U32[1], x, y, ctx.ConstU32(16U), ctx.ConstU32(16U)); + return EmitPackUint2x16(ctx, value); } Id EmitUnpackSint2x16(EmitContext& ctx, Id value) { - // No SPIR-V instruction for this, do it manually. 
- const auto x{ctx.OpBitFieldSExtract(ctx.U32[1], value, ctx.ConstU32(0U), ctx.ConstU32(16U))}; - const auto y{ctx.OpBitFieldSExtract(ctx.U32[1], value, ctx.ConstU32(16U), ctx.ConstU32(16U))}; - return ctx.OpCompositeConstruct(ctx.U32[2], x, y); + const auto [x, y] = ExtractBitFields(ctx, value, R(0, 16), R(16, 16)); + const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[2], x, y)}; + return ctx.OpBitcast(ctx.F32[2], unpacked); +} + +Id EmitPackHalf2x16(EmitContext& ctx, Id value) { + return ctx.OpPackHalf2x16(ctx.U32[1], value); +} + +Id EmitUnpackHalf2x16(EmitContext& ctx, Id value) { + return ctx.OpUnpackHalf2x16(ctx.F32[2], value); +} + +Id EmitPackUnorm4x8(EmitContext& ctx, Id value) { + return ctx.OpPackUnorm4x8(ctx.U32[1], value); +} + +Id EmitUnpackUnorm4x8(EmitContext& ctx, Id value) { + return ctx.OpUnpackUnorm4x8(ctx.F32[4], value); +} + +Id EmitPackSnorm4x8(EmitContext& ctx, Id value) { + return ctx.OpPackSnorm4x8(ctx.U32[1], value); +} + +Id EmitUnpackSnorm4x8(EmitContext& ctx, Id value) { + return ctx.OpUnpackSnorm4x8(ctx.F32[4], value); +} + +Id EmitPackUint4x8(EmitContext& ctx, Id value) { + const auto unpacked{ctx.OpBitcast(ctx.U32[4], value)}; + const auto [x, y, z, w] = ExtractComposite<4>(ctx, ctx.U32, unpacked); + return InsertBitFields(ctx, {x, y, z, w}, R(0, 8), R(8, 8), R(16, 8), R(24, 8)); +} + +Id EmitUnpackUint4x8(EmitContext& ctx, Id value) { + const auto [x, y, z, w] = + ExtractBitFields(ctx, value, R(0, 8), R(8, 8), R(16, 8), R(24, 8)); + const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)}; + return ctx.OpBitcast(ctx.F32[4], unpacked); +} + +Id EmitPackSint4x8(EmitContext& ctx, Id value) { + return EmitPackUint4x8(ctx, value); +} + +Id EmitUnpackSint4x8(EmitContext& ctx, Id value) { + const auto [x, y, z, w] = + ExtractBitFields(ctx, value, R(0, 8), R(8, 8), R(16, 8), R(24, 8)); + const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)}; + return ctx.OpBitcast(ctx.F32[4], unpacked); +} + +Id EmitPackUfloat10_11_11(EmitContext& ctx, Id value) { + const auto [x, y, z] = ExtractComposite<3>(ctx, ctx.F32, value); + const auto cvt_x{ctx.OpFunctionCall(ctx.U32[1], ctx.f32_to_uf11, x)}; + const auto cvt_y{ctx.OpFunctionCall(ctx.U32[1], ctx.f32_to_uf11, y)}; + const auto cvt_z{ctx.OpFunctionCall(ctx.U32[1], ctx.f32_to_uf10, z)}; + return InsertBitFields(ctx, {cvt_x, cvt_y, cvt_z}, R(0, 11), R(11, 11), R(22, 10)); +} + +Id EmitUnpackUfloat10_11_11(EmitContext& ctx, Id value) { + const auto [x, y, z] = ExtractBitFields(ctx, value, R(0, 11), R(11, 11), R(22, 10)); + const auto cvt_x{ctx.OpFunctionCall(ctx.F32[1], ctx.uf11_to_f32, x)}; + const auto cvt_y{ctx.OpFunctionCall(ctx.F32[1], ctx.uf11_to_f32, y)}; + const auto cvt_z{ctx.OpFunctionCall(ctx.F32[1], ctx.uf10_to_f32, z)}; + return ctx.OpCompositeConstruct(ctx.F32[3], cvt_x, cvt_y, cvt_z); +} + +Id EmitPackUnorm2_10_10_10(EmitContext& ctx, Id value) { + const auto unorm_min{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(0.f), ctx.ConstF32(0.f), + ctx.ConstF32(0.f), ctx.ConstF32(0.f))}; + const auto unorm_max{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1.f), ctx.ConstF32(1.f), + ctx.ConstF32(1.f), ctx.ConstF32(1.f))}; + const auto clamped{ctx.OpFClamp(ctx.F32[4], value, unorm_min, unorm_max)}; + const auto unorm_mul{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1023.f), + ctx.ConstF32(1023.f), ctx.ConstF32(1023.f), + ctx.ConstF32(3.f))}; + const auto as_float{ctx.OpFMul(ctx.F32[4], clamped, unorm_mul)}; + const auto as_uint{ctx.OpConvertFToU(ctx.U32[4], ctx.OpRoundEven(ctx.F32[4], 
as_float))}; + return EmitPackUint2_10_10_10(ctx, ctx.OpBitcast(ctx.F32[4], as_uint)); +} + +Id EmitUnpackUnorm2_10_10_10(EmitContext& ctx, Id value) { + const auto unpacked{ctx.OpBitcast(ctx.U32[4], EmitUnpackUint2_10_10_10(ctx, value))}; + const auto as_float{ctx.OpConvertUToF(ctx.F32[4], unpacked)}; + const auto unorm_div{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1023.f), + ctx.ConstF32(1023.f), ctx.ConstF32(1023.f), + ctx.ConstF32(3.f))}; + return ctx.OpFDiv(ctx.F32[4], as_float, unorm_div); +} + +Id EmitPackSnorm2_10_10_10(EmitContext& ctx, Id value) { + const auto snorm_min{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(-1.f), ctx.ConstF32(-1.f), + ctx.ConstF32(-1.f), ctx.ConstF32(-1.f))}; + const auto snorm_max{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(1.f), ctx.ConstF32(1.f), + ctx.ConstF32(1.f), ctx.ConstF32(1.f))}; + const auto clamped{ctx.OpFClamp(ctx.F32[4], value, snorm_min, snorm_max)}; + const auto snorm_mul{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(511.f), ctx.ConstF32(511.f), + ctx.ConstF32(511.f), ctx.ConstF32(1.f))}; + const auto as_float{ctx.OpFMul(ctx.F32[4], clamped, snorm_mul)}; + const auto as_sint{ctx.OpConvertFToS(ctx.U32[4], ctx.OpRoundEven(ctx.F32[4], as_float))}; + return EmitPackSint2_10_10_10(ctx, ctx.OpBitcast(ctx.F32[4], as_sint)); +} + +Id EmitUnpackSnorm2_10_10_10(EmitContext& ctx, Id value) { + const auto unpacked{ctx.OpBitcast(ctx.U32[4], EmitUnpackSint2_10_10_10(ctx, value))}; + const auto as_float{ctx.OpConvertSToF(ctx.F32[4], unpacked)}; + const auto snorm_div{ctx.ConstantComposite(ctx.F32[4], ctx.ConstF32(511.f), ctx.ConstF32(511.f), + ctx.ConstF32(511.f), ctx.ConstF32(1.f))}; + return ctx.OpFDiv(ctx.F32[4], as_float, snorm_div); +} + +Id EmitPackUint2_10_10_10(EmitContext& ctx, Id value) { + const auto unpacked{ctx.OpBitcast(ctx.U32[4], value)}; + const auto [x, y, z, w] = ExtractComposite<4>(ctx, ctx.U32, unpacked); + return InsertBitFields(ctx, {x, y, z, w}, R(0, 10), R(10, 10), R(20, 10), R(30, 2)); +} + +Id EmitUnpackUint2_10_10_10(EmitContext& ctx, Id value) { + const auto [x, y, z, w] = + ExtractBitFields(ctx, value, R(0, 10), R(10, 10), R(20, 10), R(30, 2)); + const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)}; + return ctx.OpBitcast(ctx.F32[4], unpacked); +} + +Id EmitPackSint2_10_10_10(EmitContext& ctx, Id value) { + return EmitPackUint2_10_10_10(ctx, value); +} + +Id EmitUnpackSint2_10_10_10(EmitContext& ctx, Id value) { + const auto [x, y, z, w] = + ExtractBitFields(ctx, value, R(0, 10), R(10, 10), R(20, 10), R(30, 2)); + const auto unpacked{ctx.OpCompositeConstruct(ctx.U32[4], x, y, z, w)}; + return ctx.OpBitcast(ctx.F32[4], unpacked); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp index d064b5d05..4f9e6040e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp @@ -24,6 +24,10 @@ Id EmitCompositeConstructU32x4(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, I return EmitCompositeConstruct(ctx, inst, ctx.U32[4], e1, e2, e3, e4); } +Id EmitCompositeConstructU32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2) { + return EmitCompositeConstruct(ctx, inst, ctx.U32[4], e1, e2); +} + Id EmitCompositeExtractU32x2(EmitContext& ctx, Id composite, u32 index) { return ctx.OpCompositeExtract(ctx.U32[1], composite, index); } @@ -124,6 +128,10 @@ Id EmitCompositeConstructF32x4(EmitContext& ctx, 
IR::Inst* inst, Id e1, Id e2, I return EmitCompositeConstruct(ctx, inst, ctx.F32[4], e1, e2, e3, e4); } +Id EmitCompositeConstructF32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2) { + return EmitCompositeConstruct(ctx, inst, ctx.F32[4], e1, e2); +} + Id EmitCompositeExtractF32x2(EmitContext& ctx, Id composite, u32 index) { return ctx.OpCompositeExtract(ctx.F32[1], composite, index); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 4550440bb..ae77ed413 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -416,6 +416,20 @@ static Id EmitLoadBufferU32xN(EmitContext& ctx, u32 handle, Id address) { } } +Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { + const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(3u))}; + const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))}; + const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)}; + return ctx.OpBitFieldUExtract(ctx.U32[1], dword, bit_offset, ctx.ConstU32(8u)); +} + +Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { + const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(2u))}; + const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))}; + const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)}; + return ctx.OpBitFieldUExtract(ctx.U32[1], dword, bit_offset, ctx.ConstU32(16u)); +} + Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { return EmitLoadBufferU32xN<1>(ctx, handle, address); } @@ -432,18 +446,24 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { return EmitLoadBufferU32xN<4>(ctx, handle, address); } +Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return ctx.OpBitcast(ctx.F32[1], EmitLoadBufferU32(ctx, inst, handle, address)); +} + +Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return ctx.OpBitcast(ctx.F32[2], EmitLoadBufferU32x2(ctx, inst, handle, address)); +} + +Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return ctx.OpBitcast(ctx.F32[3], EmitLoadBufferU32x3(ctx, inst, handle, address)); +} + +Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return ctx.OpBitcast(ctx.F32[4], EmitLoadBufferU32x4(ctx, inst, handle, address)); +} + Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - const auto& buffer = ctx.texture_buffers[handle]; - const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id); - const Id coord = - ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift), - buffer.coord_offset); - Id texel = buffer.is_storage ? 
ctx.OpImageRead(buffer.result_type, tex_buffer, coord) - : ctx.OpImageFetch(buffer.result_type, tex_buffer, coord); - if (buffer.is_integer) { - texel = ctx.OpBitcast(ctx.F32[4], texel); - } - return texel; + UNREACHABLE_MSG("SPIR-V instruction"); } template @@ -464,32 +484,56 @@ static void EmitStoreBufferU32xN(EmitContext& ctx, u32 handle, Id address, Id va } } -void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { +void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { + const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(3u))}; + const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))}; + const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)}; + const Id new_val{ctx.OpBitFieldInsert(ctx.U32[1], dword, value, bit_offset, ctx.ConstU32(8u))}; + EmitStoreBufferU32xN<1>(ctx, handle, address, new_val); +} + +void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { + const Id byte_index{ctx.OpBitwiseAnd(ctx.U32[1], address, ctx.ConstU32(2u))}; + const Id bit_offset{ctx.OpShiftLeftLogical(ctx.U32[1], byte_index, ctx.ConstU32(3u))}; + const Id dword{EmitLoadBufferU32xN<1>(ctx, handle, address)}; + const Id new_val{ctx.OpBitFieldInsert(ctx.U32[1], dword, value, bit_offset, ctx.ConstU32(16u))}; + EmitStoreBufferU32xN<1>(ctx, handle, address, new_val); +} + +void EmitStoreBufferU32(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { EmitStoreBufferU32xN<1>(ctx, handle, address, value); } -void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { +void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { EmitStoreBufferU32xN<2>(ctx, handle, address, value); } -void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { +void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { EmitStoreBufferU32xN<3>(ctx, handle, address, value); } -void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { +void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { EmitStoreBufferU32xN<4>(ctx, handle, address, value); } +void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + EmitStoreBufferU32(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[1], value)); +} + +void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + EmitStoreBufferU32x2(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[2], value)); +} + +void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + EmitStoreBufferU32x3(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[3], value)); +} + +void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + EmitStoreBufferU32x4(ctx, inst, handle, address, ctx.OpBitcast(ctx.U32[4], value)); +} + void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { - const auto& buffer = ctx.texture_buffers[handle]; - const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id); - const Id coord = - ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift), - buffer.coord_offset); - if (buffer.is_integer) { - value = ctx.OpBitcast(buffer.result_type, value); - } - ctx.OpImageWrite(tex_buffer, coord, value); + 
UNREACHABLE_MSG("SPIR-V instruction"); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 842b13207..3e2cea9e5 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -63,15 +63,27 @@ void EmitGetGotoVariable(EmitContext& ctx); void EmitSetScc(EmitContext& ctx); Id EmitReadConst(EmitContext& ctx, IR::Inst* inst); Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index); +Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +void EmitStoreBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -123,6 +135,7 @@ Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value); Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); Id EmitCompositeConstructU32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3); Id EmitCompositeConstructU32x4(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3, Id e4); +Id EmitCompositeConstructU32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); Id EmitCompositeExtractU32x2(EmitContext& ctx, Id composite, u32 index); Id EmitCompositeExtractU32x3(EmitContext& ctx, Id composite, u32 index); Id EmitCompositeExtractU32x4(EmitContext& ctx, Id composite, u32 index); @@ -151,6 +164,7 @@ Id EmitCompositeShuffleF16x4(EmitContext& ctx, Id composite1, Id composite2, u32 Id EmitCompositeConstructF32x2(EmitContext& ctx, IR::Inst* 
inst, Id e1, Id e2); Id EmitCompositeConstructF32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3); Id EmitCompositeConstructF32x4(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3, Id e4); +Id EmitCompositeConstructF32x2x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); Id EmitCompositeExtractF32x2(EmitContext& ctx, Id composite, u32 index); Id EmitCompositeExtractF32x3(EmitContext& ctx, Id composite, u32 index); Id EmitCompositeExtractF32x4(EmitContext& ctx, Id composite, u32 index); @@ -193,10 +207,6 @@ void EmitBitCastF64U64(EmitContext& ctx); Id EmitPackUint2x32(EmitContext& ctx, Id value); Id EmitUnpackUint2x32(EmitContext& ctx, Id value); Id EmitPackFloat2x32(EmitContext& ctx, Id value); -Id EmitPackFloat2x16(EmitContext& ctx, Id value); -Id EmitUnpackFloat2x16(EmitContext& ctx, Id value); -Id EmitPackHalf2x16(EmitContext& ctx, Id value); -Id EmitUnpackHalf2x16(EmitContext& ctx, Id value); Id EmitPackUnorm2x16(EmitContext& ctx, Id value); Id EmitUnpackUnorm2x16(EmitContext& ctx, Id value); Id EmitPackSnorm2x16(EmitContext& ctx, Id value); @@ -205,6 +215,26 @@ Id EmitPackUint2x16(EmitContext& ctx, Id value); Id EmitUnpackUint2x16(EmitContext& ctx, Id value); Id EmitPackSint2x16(EmitContext& ctx, Id value); Id EmitUnpackSint2x16(EmitContext& ctx, Id value); +Id EmitPackHalf2x16(EmitContext& ctx, Id value); +Id EmitUnpackHalf2x16(EmitContext& ctx, Id value); +Id EmitPackUnorm4x8(EmitContext& ctx, Id value); +Id EmitUnpackUnorm4x8(EmitContext& ctx, Id value); +Id EmitPackSnorm4x8(EmitContext& ctx, Id value); +Id EmitUnpackSnorm4x8(EmitContext& ctx, Id value); +Id EmitPackUint4x8(EmitContext& ctx, Id value); +Id EmitUnpackUint4x8(EmitContext& ctx, Id value); +Id EmitPackSint4x8(EmitContext& ctx, Id value); +Id EmitUnpackSint4x8(EmitContext& ctx, Id value); +Id EmitPackUfloat10_11_11(EmitContext& ctx, Id value); +Id EmitUnpackUfloat10_11_11(EmitContext& ctx, Id value); +Id EmitPackUnorm2_10_10_10(EmitContext& ctx, Id value); +Id EmitUnpackUnorm2_10_10_10(EmitContext& ctx, Id value); +Id EmitPackSnorm2_10_10_10(EmitContext& ctx, Id value); +Id EmitUnpackSnorm2_10_10_10(EmitContext& ctx, Id value); +Id EmitPackUint2_10_10_10(EmitContext& ctx, Id value); +Id EmitUnpackUint2_10_10_10(EmitContext& ctx, Id value); +Id EmitPackSint2_10_10_10(EmitContext& ctx, Id value); +Id EmitUnpackSint2_10_10_10(EmitContext& ctx, Id value); Id EmitFPAbs16(EmitContext& ctx, Id value); Id EmitFPAbs32(EmitContext& ctx, Id value); Id EmitFPAbs64(EmitContext& ctx, Id value); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 2a0c28563..13d727c72 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -74,8 +74,8 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf DefineInterfaces(); DefineSharedMemory(); DefineBuffers(); - DefineTextureBuffers(); DefineImagesAndSamplers(); + DefineFunctions(); } EmitContext::~EmitContext() = default; @@ -205,19 +205,6 @@ void EmitContext::DefineBufferOffsets() { buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U)); Name(buffer.offset_dwords, fmt::format("buf{}_dword_off", binding)); } - for (TextureBufferDefinition& tex_buffer : texture_buffers) { - const u32 binding = tex_buffer.binding; - const u32 half = PushData::BufOffsetIndex + (binding >> 4); - const u32 comp = (binding & 0xf) >> 2; - const u32 offset = (binding & 0x3) << 3; - const 
Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), - push_data_block, ConstU32(half), ConstU32(comp))}; - const Id value{OpLoad(U32[1], ptr)}; - tex_buffer.coord_offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(6U)); - tex_buffer.coord_shift = - OpBitFieldUExtract(U32[1], value, ConstU32(offset + 6U), ConstU32(2U)); - Name(tex_buffer.coord_offset, fmt::format("texbuf{}_off", binding)); - } } void EmitContext::DefineInterpolatedAttribs() { @@ -676,32 +663,6 @@ void EmitContext::DefineBuffers() { } } -void EmitContext::DefineTextureBuffers() { - for (const auto& desc : info.texture_buffers) { - const auto sharp = desc.GetSharp(info); - const auto nfmt = sharp.GetNumberFmt(); - const bool is_integer = AmdGpu::IsInteger(nfmt); - const VectorIds& sampled_type{GetAttributeType(*this, nfmt)}; - const u32 sampled = desc.is_written ? 2 : 1; - const Id image_type{TypeImage(sampled_type[1], spv::Dim::Buffer, false, false, false, - sampled, spv::ImageFormat::Unknown)}; - const Id pointer_type{TypePointer(spv::StorageClass::UniformConstant, image_type)}; - const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; - Decorate(id, spv::Decoration::Binding, binding.unified++); - Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sharp_idx)); - texture_buffers.push_back({ - .id = id, - .binding = binding.buffer++, - .image_type = image_type, - .result_type = sampled_type[4], - .is_integer = is_integer, - .is_storage = desc.is_written, - }); - interfaces.push_back(id); - } -} - spv::ImageFormat GetFormat(const AmdGpu::Image& image) { if (image.GetDataFmt() == AmdGpu::DataFormat::Format32 && image.GetNumberFmt() == AmdGpu::NumberFormat::Uint) { @@ -893,4 +854,117 @@ void EmitContext::DefineSharedMemory() { } } +Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) { + // https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/util/format_r11g11b10f.h + const auto func_type{TypeFunction(U32[1], F32[1])}; + const auto func{OpFunction(U32[1], spv::FunctionControlMask::MaskNone, func_type)}; + const auto value{OpFunctionParameter(F32[1])}; + Name(func, name); + AddLabel(); + + const auto raw_value{OpBitcast(U32[1], value)}; + const auto exponent{ + OpBitcast(S32[1], OpBitFieldSExtract(U32[1], raw_value, ConstU32(23U), ConstU32(8U)))}; + const auto sign{OpBitFieldUExtract(U32[1], raw_value, ConstU32(31U), ConstU32(1U))}; + + const auto is_zero{OpLogicalOr(U1[1], OpIEqual(U1[1], raw_value, ConstU32(0U)), + OpIEqual(U1[1], sign, ConstU32(1U)))}; + const auto is_nan{OpIsNan(U1[1], value)}; + const auto is_inf{OpIsInf(U1[1], value)}; + const auto is_denorm{OpSLessThanEqual(U1[1], exponent, ConstS32(-15))}; + + const auto denorm_mantissa{OpConvertFToU( + U32[1], + OpRoundEven(F32[1], OpFMul(F32[1], value, + ConstF32(static_cast(1 << (mantissa_bits + 14))))))}; + const auto denorm_overflow{ + OpINotEqual(U1[1], OpShiftRightLogical(U32[1], denorm_mantissa, ConstU32(mantissa_bits)), + ConstU32(0U))}; + const auto denorm{ + OpSelect(U32[1], denorm_overflow, ConstU32(1U << mantissa_bits), denorm_mantissa)}; + + const auto norm_mantissa{OpConvertFToU( + U32[1], + OpRoundEven(F32[1], + OpLdexp(F32[1], value, + OpISub(S32[1], ConstS32(static_cast(mantissa_bits)), exponent))))}; + const auto norm_overflow{ + OpUGreaterThanEqual(U1[1], norm_mantissa, ConstU32(2U << mantissa_bits))}; + const auto norm_final_mantissa{OpBitwiseAnd( + U32[1], + 
OpSelect(U32[1], norm_overflow, OpShiftRightLogical(U32[1], norm_mantissa, ConstU32(1U)), + norm_mantissa), + ConstU32((1U << mantissa_bits) - 1))}; + const auto norm_final_exponent{OpBitcast( + U32[1], + OpIAdd(S32[1], + OpSelect(S32[1], norm_overflow, OpIAdd(S32[1], exponent, ConstS32(1)), exponent), + ConstS32(15)))}; + const auto norm{OpBitFieldInsert(U32[1], norm_final_mantissa, norm_final_exponent, + ConstU32(mantissa_bits), ConstU32(5U))}; + + const auto result{OpSelect(U32[1], is_zero, ConstU32(0U), + OpSelect(U32[1], is_nan, ConstU32(31u << mantissa_bits | 1U), + OpSelect(U32[1], is_inf, ConstU32(31U << mantissa_bits), + OpSelect(U32[1], is_denorm, denorm, norm))))}; + + OpReturnValue(result); + OpFunctionEnd(); + return func; +} + +Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_view name) { + // https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/util/format_r11g11b10f.h + const auto func_type{TypeFunction(F32[1], U32[1])}; + const auto func{OpFunction(F32[1], spv::FunctionControlMask::MaskNone, func_type)}; + const auto value{OpFunctionParameter(U32[1])}; + Name(func, name); + AddLabel(); + + const auto raw_mantissa{ + OpBitFieldUExtract(U32[1], value, ConstU32(0U), ConstU32(mantissa_bits))}; + const auto mantissa{OpConvertUToF(F32[1], raw_mantissa)}; + const auto exponent{OpBitcast( + S32[1], OpBitFieldSExtract(U32[1], value, ConstU32(mantissa_bits), ConstU32(5U)))}; + + const auto is_exp_neg_one{OpIEqual(U1[1], exponent, ConstS32(-1))}; + const auto is_exp_zero{OpIEqual(U1[1], exponent, ConstS32(0))}; + + const auto is_zero{OpIEqual(U1[1], value, ConstU32(0u))}; + const auto is_nan{ + OpLogicalAnd(U1[1], is_exp_neg_one, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))}; + const auto is_inf{ + OpLogicalAnd(U1[1], is_exp_neg_one, OpIEqual(U1[1], raw_mantissa, ConstU32(0u)))}; + const auto is_denorm{ + OpLogicalAnd(U1[1], is_exp_zero, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))}; + + const auto denorm{OpFMul(F32[1], mantissa, ConstF32(1.f / (1 << 20)))}; + const auto norm{OpLdexp( + F32[1], + OpFAdd(F32[1], + OpFMul(F32[1], mantissa, ConstF32(1.f / static_cast(1 << mantissa_bits))), + ConstF32(1.f)), + exponent)}; + + const auto result{OpSelect(F32[1], is_zero, ConstF32(0.f), + OpSelect(F32[1], is_nan, ConstF32(NAN), + OpSelect(F32[1], is_inf, ConstF32(INFINITY), + OpSelect(F32[1], is_denorm, denorm, norm))))}; + + OpReturnValue(result); + OpFunctionEnd(); + return func; +} + +void EmitContext::DefineFunctions() { + if (info.uses_pack_10_11_11) { + f32_to_uf11 = DefineFloat32ToUfloatM5(6, "f32_to_uf11"); + f32_to_uf10 = DefineFloat32ToUfloatM5(5, "f32_to_uf10"); + } + if (info.uses_unpack_10_11_11) { + uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32"); + uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32"); + } +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index ab42ecc5b..23fca4212 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -235,20 +235,9 @@ public: const VectorIds* data_types; Id pointer_type; }; - struct TextureBufferDefinition { - Id id; - Id coord_offset; - Id coord_shift; - u32 binding; - Id image_type; - Id result_type; - bool is_integer = false; - bool is_storage = false; - }; Bindings& binding; boost::container::small_vector buffers; - boost::container::small_vector texture_buffers; BufferDefinition 
srt_flatbuf; boost::container::small_vector images; boost::container::small_vector samplers; @@ -271,6 +260,11 @@ public: std::array output_params{}; std::array frag_outputs{}; + Id uf11_to_f32{}; + Id f32_to_uf11{}; + Id uf10_to_f32{}; + Id f32_to_uf10{}; + private: void DefineArithmeticTypes(); void DefineInterfaces(); @@ -278,12 +272,15 @@ private: void DefineOutputs(); void DefinePushDataBlock(); void DefineBuffers(); - void DefineTextureBuffers(); void DefineImagesAndSamplers(); void DefineSharedMemory(); + void DefineFunctions(); SpirvAttribute GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id, u32 num_components, bool output); + + Id DefineFloat32ToUfloatM5(u32 mantissa_bits, std::string_view name); + Id DefineUfloatM5ToFloat32(u32 mantissa_bits, std::string_view name); }; } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp index 28c4685db..ece35093a 100644 --- a/src/shader_recompiler/frontend/translate/export.cpp +++ b/src/shader_recompiler/frontend/translate/export.cpp @@ -30,28 +30,25 @@ void Translator::ExportMrtCompressed(IR::Attribute attribute, u32 idx, const IR: static_cast(attribute) - static_cast(IR::Attribute::RenderTarget0); const auto color_buffer = runtime_info.fs_info.color_buffers[color_buffer_idx]; - IR::Value unpacked_value; - bool is_integer = false; + AmdGpu::NumberFormat num_format; switch (color_buffer.export_format) { case AmdGpu::Liverpool::ShaderExportFormat::Zero: // No export return; case AmdGpu::Liverpool::ShaderExportFormat::ABGR_FP16: - unpacked_value = ir.UnpackHalf2x16(value); + num_format = AmdGpu::NumberFormat::Float; break; case AmdGpu::Liverpool::ShaderExportFormat::ABGR_UNORM16: - unpacked_value = ir.UnpackUnorm2x16(value); + num_format = AmdGpu::NumberFormat::Unorm; break; case AmdGpu::Liverpool::ShaderExportFormat::ABGR_SNORM16: - unpacked_value = ir.UnpackSnorm2x16(value); + num_format = AmdGpu::NumberFormat::Snorm; break; case AmdGpu::Liverpool::ShaderExportFormat::ABGR_UINT16: - unpacked_value = ir.UnpackUint2x16(value); - is_integer = true; + num_format = AmdGpu::NumberFormat::Uint; break; case AmdGpu::Liverpool::ShaderExportFormat::ABGR_SINT16: - unpacked_value = ir.UnpackSint2x16(value); - is_integer = true; + num_format = AmdGpu::NumberFormat::Sint; break; default: UNREACHABLE_MSG("Unimplemented compressed MRT export format {}", @@ -59,16 +56,15 @@ void Translator::ExportMrtCompressed(IR::Attribute attribute, u32 idx, const IR: break; } - const auto r = ir.CompositeExtract(unpacked_value, 0); - const auto g = ir.CompositeExtract(unpacked_value, 1); - const IR::F32 float_r = is_integer ? ir.BitCast(IR::U32{r}) : IR::F32{r}; - const IR::F32 float_g = is_integer ? 
ir.BitCast(IR::U32{g}) : IR::F32{g}; + const auto unpacked_value = ir.Unpack2x16(num_format, value); + const IR::F32 r = IR::F32{ir.CompositeExtract(unpacked_value, 0)}; + const IR::F32 g = IR::F32{ir.CompositeExtract(unpacked_value, 1)}; const auto swizzled_r = SwizzleMrtComponent(color_buffer, idx * 2); const auto swizzled_g = SwizzleMrtComponent(color_buffer, idx * 2 + 1); - ExportMrtValue(attribute, swizzled_r, float_r, color_buffer); - ExportMrtValue(attribute, swizzled_g, float_g, color_buffer); + ExportMrtValue(attribute, swizzled_r, r, color_buffer); + ExportMrtValue(attribute, swizzled_g, g, color_buffer); } void Translator::ExportMrtUncompressed(IR::Attribute attribute, u32 comp, const IR::F32& value) { @@ -115,7 +111,7 @@ void Translator::ExportCompressed(IR::Attribute attribute, u32 idx, const IR::U3 ExportMrtCompressed(attribute, idx, value); return; } - const IR::Value unpacked_value = ir.UnpackHalf2x16(value); + const IR::Value unpacked_value = ir.Unpack2x16(AmdGpu::NumberFormat::Float, value); const IR::F32 r = IR::F32{ir.CompositeExtract(unpacked_value, 0)}; const IR::F32 g = IR::F32{ir.CompositeExtract(unpacked_value, 1)}; ir.SetAttribute(attribute, r, idx * 2); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index f73618dbe..56e903052 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -651,19 +651,19 @@ void Translator::V_LDEXP_F32(const GcnInst& inst) { void Translator::V_CVT_PKNORM_U16_F32(const GcnInst& inst) { const IR::Value vec_f32 = ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); - SetDst(inst.dst[0], ir.PackUnorm2x16(vec_f32)); + SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Unorm, vec_f32)); } void Translator::V_CVT_PKNORM_I16_F32(const GcnInst& inst) { const IR::Value vec_f32 = ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); - SetDst(inst.dst[0], ir.PackSnorm2x16(vec_f32)); + SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Snorm, vec_f32)); } void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) { const IR::Value vec_f32 = ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); - SetDst(inst.dst[0], ir.PackHalf2x16(vec_f32)); + SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Float, vec_f32)); } // VOP1 @@ -1245,14 +1245,16 @@ void Translator::V_SAD_U32(const GcnInst& inst) { void Translator::V_CVT_PK_U16_U32(const GcnInst& inst) { const IR::Value vec_u32 = - ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); - SetDst(inst.dst[0], ir.PackUint2x16(vec_u32)); + ir.CompositeConstruct(ir.BitCast(GetSrc(inst.src[0])), + ir.BitCast(GetSrc(inst.src[1]))); + SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Uint, vec_u32)); } void Translator::V_CVT_PK_I16_I32(const GcnInst& inst) { const IR::Value vec_u32 = - ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); - SetDst(inst.dst[0], ir.PackSint2x16(vec_u32)); + ir.CompositeConstruct(ir.BitCast(GetSrc(inst.src[0])), + ir.BitCast(GetSrc(inst.src[1]))); + SetDst(inst.dst[0], ir.Pack2x16(AmdGpu::NumberFormat::Sint, vec_u32)); } void Translator::V_CVT_PK_U8_F32(const GcnInst& inst) { diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 685785af1..0b911eb57 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ 
b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -208,7 +208,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, buffer_info); + const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info); const IR::VectorReg dst_reg{inst.src[1].code}; if (num_dwords == 1) { ir.SetVectorReg(dst_reg, IR::U32{value}); @@ -314,16 +314,18 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst const IR::Value handle = ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1), ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3)); - ir.StoreBuffer(num_dwords, handle, address, value, buffer_info); + ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info); } void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) { const auto& mubuf = inst.control.mubuf; const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; - ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported"); const IR::Value address = [&] -> IR::Value { - if (mubuf.idxen) { + if (mubuf.idxen && mubuf.offen) { + return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1)); + } + if (mubuf.idxen || mubuf.offen) { return ir.GetVectorReg(vaddr); } return {}; diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 9469eaad7..498752607 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -48,6 +48,7 @@ struct BufferResource { bool is_instance_data{}; u8 instance_attrib{}; bool is_written{}; + bool is_formatted{}; [[nodiscard]] bool IsStorage(const AmdGpu::Buffer& buffer) const noexcept { return buffer.GetSize() > MaxUboSize || is_written || is_gds_buffer; @@ -57,14 +58,6 @@ struct BufferResource { }; using BufferResourceList = boost::container::small_vector; -struct TextureBufferResource { - u32 sharp_idx; - bool is_written{}; - - [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Info& info) const noexcept; -}; -using TextureBufferResourceList = boost::container::small_vector; - struct ImageResource { u32 sharp_idx; bool is_depth{}; @@ -114,11 +107,6 @@ struct PushData { ASSERT(offset < 256 && binding < buf_offsets.size()); buf_offsets[binding] = offset; } - - void AddTexelOffset(u32 binding, u32 multiplier, u32 texel_offset) { - ASSERT(texel_offset < 64 && multiplier < 16); - buf_offsets[binding] = texel_offset | ((std::bit_width(multiplier) - 1) << 6); - } }; static_assert(sizeof(PushData) <= 128, "PushData size is greater than minimum size guaranteed by Vulkan spec"); @@ -175,7 +163,6 @@ struct Info { u32 uses_patches{}; BufferResourceList buffers; - TextureBufferResourceList texture_buffers; ImageResourceList images; SamplerResourceList samplers; FMaskResourceList fmasks; @@ -193,8 +180,6 @@ struct Info { u64 pgm_hash{}; VAddr pgm_base; bool has_storage_images{}; - bool has_image_buffers{}; - bool has_texel_buffers{}; bool has_discard{}; bool has_image_gather{}; bool has_image_query{}; @@ -204,6 +189,8 @@ struct Info { bool uses_shared{}; bool uses_fp16{}; bool uses_fp64{}; + bool uses_pack_10_11_11{}; + bool uses_unpack_10_11_11{}; bool stores_tess_level_outer{}; bool stores_tess_level_inner{}; bool translation_failed{}; // indicates that shader 
has unsupported instructions @@ -246,8 +233,7 @@ struct Info { } void AddBindings(Backend::Bindings& bnd) const { - const auto total_buffers = - buffers.size() + texture_buffers.size() + (has_readconst ? 1 : 0); + const auto total_buffers = buffers.size() + (has_readconst ? 1 : 0); bnd.buffer += total_buffers; bnd.unified += total_buffers + images.size() + samplers.size(); bnd.user_data += ud_mask.NumRegs(); @@ -278,10 +264,6 @@ constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexce return inline_cbuf ? inline_cbuf : info.ReadUdSharp(sharp_idx); } -constexpr AmdGpu::Buffer TextureBufferResource::GetSharp(const Info& info) const noexcept { - return info.ReadUdSharp(sharp_idx); -} - constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept { const auto image = info.ReadUdSharp(sharp_idx); if (!image.Valid()) { diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index ecbe1f838..7e3d0f937 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -370,8 +370,16 @@ U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) { return Inst(Opcode::ReadConstBuffer, handle, index); } -Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& address, - BufferInstInfo info) { +U32 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU8, Flags{info}, handle, address); +} + +U32 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU16, Flags{info}, handle, address); +} + +Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& address, + BufferInstInfo info) { switch (num_dwords) { case 1: return Inst(Opcode::LoadBufferU32, Flags{info}, handle, address); @@ -386,12 +394,38 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad } } +Value IREmitter::LoadBufferF32(int num_dwords, const Value& handle, const Value& address, + BufferInstInfo info) { + switch (num_dwords) { + case 1: + return Inst(Opcode::LoadBufferF32, Flags{info}, handle, address); + case 2: + return Inst(Opcode::LoadBufferF32x2, Flags{info}, handle, address); + case 3: + return Inst(Opcode::LoadBufferF32x3, Flags{info}, handle, address); + case 4: + return Inst(Opcode::LoadBufferF32x4, Flags{info}, handle, address); + default: + UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords); + } +} + Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info) { return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address); } -void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address, - const Value& data, BufferInstInfo info) { +void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U32& data, + BufferInstInfo info) { + Inst(Opcode::StoreBufferU8, Flags{info}, handle, address, data); +} + +void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U32& data, + BufferInstInfo info) { + Inst(Opcode::StoreBufferU16, Flags{info}, handle, address, data); +} + +void IREmitter::StoreBufferU32(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info) { switch (num_dwords) { case 1: Inst(Opcode::StoreBufferU32, Flags{info}, handle, address, data); @@ -410,6 +444,31 @@ void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const 
Value& ad } } +void IREmitter::StoreBufferF32(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info) { + switch (num_dwords) { + case 1: + Inst(Opcode::StoreBufferF32, Flags{info}, handle, address, data); + break; + case 2: + Inst(Opcode::StoreBufferF32x2, Flags{info}, handle, address, data); + break; + case 3: + Inst(Opcode::StoreBufferF32x3, Flags{info}, handle, address, data); + break; + case 4: + Inst(Opcode::StoreBufferF32x4, Flags{info}, handle, address, data); + break; + default: + UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords); + } +} + +void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, const Value& data, + BufferInstInfo info) { + Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data); +} + Value IREmitter::BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info) { return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value); @@ -457,11 +516,6 @@ Value IREmitter::BufferAtomicSwap(const Value& handle, const Value& address, con return Inst(Opcode::BufferAtomicSwap32, Flags{info}, handle, address, value); } -void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, const Value& data, - BufferInstInfo info) { - Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data); -} - U32 IREmitter::DataAppend(const U32& counter) { return Inst(Opcode::DataAppend, counter, Imm32(0)); } @@ -527,10 +581,14 @@ Value IREmitter::CompositeConstruct(const Value& e1, const Value& e2) { switch (e1.Type()) { case Type::U32: return Inst(Opcode::CompositeConstructU32x2, e1, e2); + case Type::U32x2: + return Inst(Opcode::CompositeConstructU32x2x2, e1, e2); case Type::F16: return Inst(Opcode::CompositeConstructF16x2, e1, e2); case Type::F32: return Inst(Opcode::CompositeConstructF32x2, e1, e2); + case Type::F32x2: + return Inst(Opcode::CompositeConstructF32x2x2, e1, e2); case Type::F64: return Inst(Opcode::CompositeConstructF64x2, e1, e2); default: @@ -779,52 +837,116 @@ F64 IREmitter::PackFloat2x32(const Value& vector) { return Inst(Opcode::PackFloat2x32, vector); } -U32 IREmitter::PackFloat2x16(const Value& vector) { - return Inst(Opcode::PackFloat2x16, vector); +U32 IREmitter::Pack2x16(const AmdGpu::NumberFormat number_format, const Value& vector) { + switch (number_format) { + case AmdGpu::NumberFormat::Unorm: + return Inst(Opcode::PackUnorm2x16, vector); + case AmdGpu::NumberFormat::Snorm: + return Inst(Opcode::PackSnorm2x16, vector); + case AmdGpu::NumberFormat::Uint: + return Inst(Opcode::PackUint2x16, vector); + case AmdGpu::NumberFormat::Sint: + return Inst(Opcode::PackSint2x16, vector); + case AmdGpu::NumberFormat::Float: + return Inst(Opcode::PackHalf2x16, vector); + default: + UNREACHABLE_MSG("Unsupported 2x16 number format: {}", number_format); + } } -Value IREmitter::UnpackFloat2x16(const U32& value) { - return Inst(Opcode::UnpackFloat2x16, value); +Value IREmitter::Unpack2x16(const AmdGpu::NumberFormat number_format, const U32& value) { + switch (number_format) { + case AmdGpu::NumberFormat::Unorm: + return Inst(Opcode::UnpackUnorm2x16, value); + case AmdGpu::NumberFormat::Snorm: + return Inst(Opcode::UnpackSnorm2x16, value); + case AmdGpu::NumberFormat::Uint: + return Inst(Opcode::UnpackUint2x16, value); + case AmdGpu::NumberFormat::Sint: + return Inst(Opcode::UnpackSint2x16, value); + case AmdGpu::NumberFormat::Float: + return Inst(Opcode::UnpackHalf2x16, value); + default: + UNREACHABLE_MSG("Unsupported 
2x16 number format: {}", number_format); + } } -U32 IREmitter::PackHalf2x16(const Value& vector) { - return Inst(Opcode::PackHalf2x16, vector); +U32 IREmitter::Pack4x8(const AmdGpu::NumberFormat number_format, const Value& vector) { + switch (number_format) { + case AmdGpu::NumberFormat::Unorm: + return Inst(Opcode::PackUnorm4x8, vector); + case AmdGpu::NumberFormat::Snorm: + return Inst(Opcode::PackSnorm4x8, vector); + case AmdGpu::NumberFormat::Uint: + return Inst(Opcode::PackUint4x8, vector); + case AmdGpu::NumberFormat::Sint: + return Inst(Opcode::PackSint4x8, vector); + default: + UNREACHABLE_MSG("Unsupported 4x8 number format: {}", number_format); + } } -Value IREmitter::UnpackHalf2x16(const U32& value) { - return Inst(Opcode::UnpackHalf2x16, value); +Value IREmitter::Unpack4x8(const AmdGpu::NumberFormat number_format, const U32& value) { + switch (number_format) { + case AmdGpu::NumberFormat::Unorm: + return Inst(Opcode::UnpackUnorm4x8, value); + case AmdGpu::NumberFormat::Snorm: + return Inst(Opcode::UnpackSnorm4x8, value); + case AmdGpu::NumberFormat::Uint: + return Inst(Opcode::UnpackUint4x8, value); + case AmdGpu::NumberFormat::Sint: + return Inst(Opcode::UnpackSint4x8, value); + default: + UNREACHABLE_MSG("Unsupported 4x8 number format: {}", number_format); + } } -U32 IREmitter::PackUnorm2x16(const Value& vector) { - return Inst(Opcode::PackUnorm2x16, vector); +U32 IREmitter::Pack10_11_11(const AmdGpu::NumberFormat number_format, const Value& vector) { + switch (number_format) { + case AmdGpu::NumberFormat::Float: + return Inst(Opcode::PackUfloat10_11_11, vector); + default: + UNREACHABLE_MSG("Unsupported 10_11_11 number format: {}", number_format); + } } -Value IREmitter::UnpackUnorm2x16(const U32& value) { - return Inst(Opcode::UnpackUnorm2x16, value); +U32 IREmitter::Pack2_10_10_10(const AmdGpu::NumberFormat number_format, const Value& vector) { + switch (number_format) { + case AmdGpu::NumberFormat::Unorm: + return Inst(Opcode::PackUnorm2_10_10_10, vector); + case AmdGpu::NumberFormat::Snorm: + return Inst(Opcode::PackSnorm2_10_10_10, vector); + case AmdGpu::NumberFormat::Uint: + return Inst(Opcode::PackUint2_10_10_10, vector); + case AmdGpu::NumberFormat::Sint: + return Inst(Opcode::PackSint2_10_10_10, vector); + default: + UNREACHABLE_MSG("Unsupported 2_10_10_10 number format: {}", number_format); + } } -U32 IREmitter::PackSnorm2x16(const Value& vector) { - return Inst(Opcode::PackSnorm2x16, vector); +Value IREmitter::Unpack2_10_10_10(const AmdGpu::NumberFormat number_format, const U32& value) { + switch (number_format) { + case AmdGpu::NumberFormat::Unorm: + return Inst(Opcode::UnpackUnorm2_10_10_10, value); + case AmdGpu::NumberFormat::Snorm: + return Inst(Opcode::UnpackSnorm2_10_10_10, value); + case AmdGpu::NumberFormat::Uint: + return Inst(Opcode::UnpackUint2_10_10_10, value); + case AmdGpu::NumberFormat::Sint: + return Inst(Opcode::UnpackSint2_10_10_10, value); + default: + UNREACHABLE_MSG("Unsupported 2_10_10_10 number format: {}", number_format); + } } -Value IREmitter::UnpackSnorm2x16(const U32& value) { - return Inst(Opcode::UnpackSnorm2x16, value); -} - -U32 IREmitter::PackUint2x16(const Value& value) { - return Inst(Opcode::PackUint2x16, value); -} - -Value IREmitter::UnpackUint2x16(const U32& value) { - return Inst(Opcode::UnpackUint2x16, value); -} - -U32 IREmitter::PackSint2x16(const Value& value) { - return Inst(Opcode::PackSint2x16, value); -} - -Value IREmitter::UnpackSint2x16(const U32& value) { - return Inst(Opcode::UnpackSint2x16, value); +Value 
IREmitter::Unpack10_11_11(const AmdGpu::NumberFormat number_format, const U32& value) { + switch (number_format) { + case AmdGpu::NumberFormat::Float: + return Inst(Opcode::UnpackUfloat10_11_11, value); + default: + UNREACHABLE_MSG("Unsupported 10_11_11 number format: {}", number_format); + } } F32F64 IREmitter::FPMul(const F32F64& a, const F32F64& b) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 97b94187a..7ac75bf70 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -109,12 +109,22 @@ public: [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index); - [[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address, - BufferInstInfo info); + [[nodiscard]] U32 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U32 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] Value LoadBufferU32(int num_dwords, const Value& handle, const Value& address, + BufferInstInfo info); + [[nodiscard]] Value LoadBufferF32(int num_dwords, const Value& handle, const Value& address, + BufferInstInfo info); [[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info); - void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data, - BufferInstInfo info); + void StoreBufferU8(const Value& handle, const Value& address, const U32& data, + BufferInstInfo info); + void StoreBufferU16(const Value& handle, const Value& address, const U32& data, + BufferInstInfo info); + void StoreBufferU32(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info); + void StoreBufferF32(int num_dwords, const Value& handle, const Value& address, + const Value& data, BufferInstInfo info); void StoreBufferFormat(const Value& handle, const Value& address, const Value& data, BufferInstInfo info); @@ -167,22 +177,19 @@ public: [[nodiscard]] U64 PackUint2x32(const Value& vector); [[nodiscard]] Value UnpackUint2x32(const U64& value); - [[nodiscard]] F64 PackFloat2x32(const Value& vector); - [[nodiscard]] U32 PackFloat2x16(const Value& vector); - [[nodiscard]] Value UnpackFloat2x16(const U32& value); + [[nodiscard]] U32 Pack2x16(AmdGpu::NumberFormat number_format, const Value& vector); + [[nodiscard]] Value Unpack2x16(AmdGpu::NumberFormat number_format, const U32& value); - [[nodiscard]] U32 PackHalf2x16(const Value& vector); - [[nodiscard]] Value UnpackHalf2x16(const U32& value); - [[nodiscard]] U32 PackUnorm2x16(const Value& vector); - [[nodiscard]] Value UnpackUnorm2x16(const U32& value); - [[nodiscard]] U32 PackSnorm2x16(const Value& vector); - [[nodiscard]] Value UnpackSnorm2x16(const U32& value); - [[nodiscard]] U32 PackUint2x16(const Value& value); - [[nodiscard]] Value UnpackUint2x16(const U32& value); - [[nodiscard]] U32 PackSint2x16(const Value& value); - [[nodiscard]] Value UnpackSint2x16(const U32& value); + [[nodiscard]] U32 Pack4x8(AmdGpu::NumberFormat number_format, const Value& vector); + [[nodiscard]] Value Unpack4x8(AmdGpu::NumberFormat number_format, const U32& value); + + [[nodiscard]] U32 Pack10_11_11(AmdGpu::NumberFormat number_format, const Value& vector); + [[nodiscard]] Value Unpack10_11_11(AmdGpu::NumberFormat number_format, const U32& value); + + [[nodiscard]] U32 Pack2_10_10_10(AmdGpu::NumberFormat number_format, const 
Value& vector); + [[nodiscard]] Value Unpack2_10_10_10(AmdGpu::NumberFormat number_format, const U32& value); [[nodiscard]] F32F64 FPAdd(const F32F64& a, const F32F64& b); [[nodiscard]] F32F64 FPSub(const F32F64& a, const F32F64& b); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 6e7bbe661..fdbc019e3 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -54,10 +54,16 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::SetAttribute: case Opcode::SetTcsGenericAttribute: case Opcode::SetPatch: + case Opcode::StoreBufferU8: + case Opcode::StoreBufferU16: case Opcode::StoreBufferU32: case Opcode::StoreBufferU32x2: case Opcode::StoreBufferU32x3: case Opcode::StoreBufferU32x4: + case Opcode::StoreBufferF32: + case Opcode::StoreBufferF32x2: + case Opcode::StoreBufferF32x3: + case Opcode::StoreBufferF32x4: case Opcode::StoreBufferFormatF32: case Opcode::BufferAtomicIAdd32: case Opcode::BufferAtomicSMin32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 6750be5a6..0d87430d2 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -90,15 +90,27 @@ OPCODE(UndefU32, U32, OPCODE(UndefU64, U64, ) // Buffer operations +OPCODE(LoadBufferU8, U32, Opaque, Opaque, ) +OPCODE(LoadBufferU16, U32, Opaque, Opaque, ) OPCODE(LoadBufferU32, U32, Opaque, Opaque, ) OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, ) OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, ) OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, ) +OPCODE(LoadBufferF32, F32, Opaque, Opaque, ) +OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, ) +OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, ) +OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, ) OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, ) +OPCODE(StoreBufferU8, Void, Opaque, Opaque, U32, ) +OPCODE(StoreBufferU16, Void, Opaque, Opaque, U32, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, ) OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, ) OPCODE(StoreBufferU32x4, Void, Opaque, Opaque, U32x4, ) +OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, ) +OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) +OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) +OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, ) OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32x4, ) // Buffer atomic operations @@ -118,6 +130,7 @@ OPCODE(BufferAtomicSwap32, U32, Opaq OPCODE(CompositeConstructU32x2, U32x2, U32, U32, ) OPCODE(CompositeConstructU32x3, U32x3, U32, U32, U32, ) OPCODE(CompositeConstructU32x4, U32x4, U32, U32, U32, U32, ) +OPCODE(CompositeConstructU32x2x2, U32x4, U32x2, U32x2, ) OPCODE(CompositeExtractU32x2, U32, U32x2, U32, ) OPCODE(CompositeExtractU32x3, U32, U32x3, U32, ) OPCODE(CompositeExtractU32x4, U32, U32x4, U32, ) @@ -142,6 +155,7 @@ OPCODE(CompositeShuffleF16x4, F16x4, F16x OPCODE(CompositeConstructF32x2, F32x2, F32, F32, ) OPCODE(CompositeConstructF32x3, F32x3, F32, F32, F32, ) OPCODE(CompositeConstructF32x4, F32x4, F32, F32, F32, F32, ) +OPCODE(CompositeConstructF32x2x2, F32x4, F32x2, F32x2, ) OPCODE(CompositeExtractF32x2, F32, F32x2, U32, ) OPCODE(CompositeExtractF32x3, F32, F32x3, U32, ) OPCODE(CompositeExtractF32x4, F32, F32x4, U32, ) @@ -180,21 +194,42 @@ OPCODE(BitCastU64F64, U64, F64, OPCODE(BitCastF16U16, F16, U16, ) OPCODE(BitCastF32U32, F32, U32, ) OPCODE(BitCastF64U64, F64, U64, ) + 
OPCODE(PackUint2x32, U64, U32x2, ) OPCODE(UnpackUint2x32, U32x2, U64, ) OPCODE(PackFloat2x32, F64, F32x2, ) -OPCODE(PackFloat2x16, U32, F16x2, ) -OPCODE(UnpackFloat2x16, F16x2, U32, ) -OPCODE(PackHalf2x16, U32, F32x2, ) -OPCODE(UnpackHalf2x16, F32x2, U32, ) + OPCODE(PackUnorm2x16, U32, F32x2, ) OPCODE(UnpackUnorm2x16, F32x2, U32, ) OPCODE(PackSnorm2x16, U32, F32x2, ) OPCODE(UnpackSnorm2x16, F32x2, U32, ) -OPCODE(PackUint2x16, U32, U32x2, ) -OPCODE(UnpackUint2x16, U32x2, U32, ) -OPCODE(PackSint2x16, U32, U32x2, ) -OPCODE(UnpackSint2x16, U32x2, U32, ) +OPCODE(PackUint2x16, U32, F32x2, ) +OPCODE(UnpackUint2x16, F32x2, U32, ) +OPCODE(PackSint2x16, U32, F32x2, ) +OPCODE(UnpackSint2x16, F32x2, U32, ) +OPCODE(PackHalf2x16, U32, F32x2, ) +OPCODE(UnpackHalf2x16, F32x2, U32, ) + +OPCODE(PackUnorm4x8, U32, F32x4, ) +OPCODE(UnpackUnorm4x8, F32x4, U32, ) +OPCODE(PackSnorm4x8, U32, F32x4, ) +OPCODE(UnpackSnorm4x8, F32x4, U32, ) +OPCODE(PackUint4x8, U32, F32x4, ) +OPCODE(UnpackUint4x8, F32x4, U32, ) +OPCODE(PackSint4x8, U32, F32x4, ) +OPCODE(UnpackSint4x8, F32x4, U32, ) + +OPCODE(PackUfloat10_11_11, U32, F32x3, ) +OPCODE(UnpackUfloat10_11_11, F32x3, U32, ) + +OPCODE(PackUnorm2_10_10_10, U32, F32x4, ) +OPCODE(UnpackUnorm2_10_10_10, F32x4, U32, ) +OPCODE(PackSnorm2_10_10_10, U32, F32x4, ) +OPCODE(UnpackSnorm2_10_10_10, F32x4, U32, ) +OPCODE(PackUint2_10_10_10, U32, F32x4, ) +OPCODE(UnpackUint2_10_10_10, F32x4, U32, ) +OPCODE(PackSint2_10_10_10, U32, F32x4, ) +OPCODE(UnpackSint2_10_10_10, F32x4, U32, ) // Floating-point operations OPCODE(FPAbs32, F32, F32, ) diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp index c72b9e835..c8a4b13cb 100644 --- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp @@ -340,14 +340,7 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { return FoldBitCast(inst, IR::Opcode::BitCastU32F32); case IR::Opcode::BitCastU32F32: return FoldBitCast(inst, IR::Opcode::BitCastF32U32); - case IR::Opcode::PackHalf2x16: - return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16); - case IR::Opcode::UnpackHalf2x16: - return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16); - case IR::Opcode::PackFloat2x16: - return FoldInverseFunc(inst, IR::Opcode::UnpackFloat2x16); - case IR::Opcode::UnpackFloat2x16: - return FoldInverseFunc(inst, IR::Opcode::PackFloat2x16); + // 2x16 case IR::Opcode::PackUnorm2x16: return FoldInverseFunc(inst, IR::Opcode::UnpackUnorm2x16); case IR::Opcode::UnpackUnorm2x16: @@ -364,6 +357,49 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { return FoldInverseFunc(inst, IR::Opcode::UnpackSint2x16); case IR::Opcode::UnpackSint2x16: return FoldInverseFunc(inst, IR::Opcode::PackSint2x16); + case IR::Opcode::PackHalf2x16: + return FoldInverseFunc(inst, IR::Opcode::UnpackHalf2x16); + case IR::Opcode::UnpackHalf2x16: + return FoldInverseFunc(inst, IR::Opcode::PackHalf2x16); + // 4x8 + case IR::Opcode::PackUnorm4x8: + return FoldInverseFunc(inst, IR::Opcode::UnpackUnorm4x8); + case IR::Opcode::UnpackUnorm4x8: + return FoldInverseFunc(inst, IR::Opcode::PackUnorm4x8); + case IR::Opcode::PackSnorm4x8: + return FoldInverseFunc(inst, IR::Opcode::UnpackSnorm4x8); + case IR::Opcode::UnpackSnorm4x8: + return FoldInverseFunc(inst, IR::Opcode::PackSnorm4x8); + case IR::Opcode::PackUint4x8: + return FoldInverseFunc(inst, IR::Opcode::UnpackUint4x8); + case IR::Opcode::UnpackUint4x8: + return 
FoldInverseFunc(inst, IR::Opcode::PackUint4x8); + case IR::Opcode::PackSint4x8: + return FoldInverseFunc(inst, IR::Opcode::UnpackSint4x8); + case IR::Opcode::UnpackSint4x8: + return FoldInverseFunc(inst, IR::Opcode::PackSint4x8); + // 10_11_11 + case IR::Opcode::PackUfloat10_11_11: + return FoldInverseFunc(inst, IR::Opcode::UnpackUfloat10_11_11); + case IR::Opcode::UnpackUfloat10_11_11: + return FoldInverseFunc(inst, IR::Opcode::PackUfloat10_11_11); + // 2_10_10_10 + case IR::Opcode::PackUnorm2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::UnpackUnorm2_10_10_10); + case IR::Opcode::UnpackUnorm2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::PackUnorm2_10_10_10); + case IR::Opcode::PackSnorm2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::UnpackSnorm2_10_10_10); + case IR::Opcode::UnpackSnorm2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::PackSnorm2_10_10_10); + case IR::Opcode::PackUint2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::UnpackUint2_10_10_10); + case IR::Opcode::UnpackUint2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::PackUint2_10_10_10); + case IR::Opcode::PackSint2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::UnpackSint2_10_10_10); + case IR::Opcode::UnpackSint2_10_10_10: + return FoldInverseFunc(inst, IR::Opcode::PackSint2_10_10_10); case IR::Opcode::SelectU1: case IR::Opcode::SelectU8: case IR::Opcode::SelectU16: diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 8a71d9e1f..0d6816ae0 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -19,6 +19,7 @@ void ConstantPropagationPass(IR::BlockList& program); void FlattenExtendedUserdataPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); +void LowerBufferFormatToRaw(IR::Program& program); void LowerSharedMemToRegisters(IR::Program& program); void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, Stage stage); diff --git a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp new file mode 100644 index 000000000..b30b022f8 --- /dev/null +++ b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp @@ -0,0 +1,211 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/info.h" +#include "shader_recompiler/ir/basic_block.h" +#include "shader_recompiler/ir/ir_emitter.h" +#include "shader_recompiler/ir/program.h" +#include "shader_recompiler/ir/reinterpret.h" +#include "video_core/amdgpu/resource.h" + +namespace Shader::Optimization { + +static bool IsBufferFormatLoad(const IR::Inst& inst) { + return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32; +} + +static bool IsBufferFormatStore(const IR::Inst& inst) { + return inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32; +} + +static IR::Value LoadBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer, + const IR::Value handle, const IR::U32 address, + const IR::BufferInstInfo info) { + const auto data_fmt = buffer.GetDataFmt(); + const auto num_fmt = buffer.GetNumberFmt(); + const auto num_conv = buffer.GetNumberConversion(); + const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt()); + + IR::Value interpreted; + switch (data_fmt) { + case AmdGpu::DataFormat::FormatInvalid: + interpreted = ir.Imm32(0.f); + break; + case 
AmdGpu::DataFormat::Format8: { + const auto unpacked = ir.Unpack4x8(num_fmt, ir.LoadBufferU8(handle, address, info)); + interpreted = ir.CompositeExtract(unpacked, 0); + break; + } + case AmdGpu::DataFormat::Format8_8: { + const auto raw = ir.LoadBufferU16(handle, address, info); + const auto unpacked = ir.Unpack4x8(num_fmt, raw); + interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0), + ir.CompositeExtract(unpacked, 1)); + break; + } + case AmdGpu::DataFormat::Format8_8_8_8: + interpreted = ir.Unpack4x8(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)}); + break; + case AmdGpu::DataFormat::Format16: { + const auto unpacked = ir.Unpack2x16(num_fmt, ir.LoadBufferU16(handle, address, info)); + interpreted = ir.CompositeExtract(unpacked, 0); + break; + } + case AmdGpu::DataFormat::Format16_16: + interpreted = ir.Unpack2x16(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)}); + break; + case AmdGpu::DataFormat::Format10_11_11: + interpreted = + ir.Unpack10_11_11(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)}); + break; + case AmdGpu::DataFormat::Format2_10_10_10: + interpreted = + ir.Unpack2_10_10_10(num_fmt, IR::U32{ir.LoadBufferU32(1, handle, address, info)}); + break; + case AmdGpu::DataFormat::Format16_16_16_16: { + const auto raw = ir.LoadBufferU32(2, handle, address, info); + interpreted = + ir.CompositeConstruct(ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 0)}), + ir.Unpack2x16(num_fmt, IR::U32{ir.CompositeExtract(raw, 1)})); + break; + } + case AmdGpu::DataFormat::Format32: + case AmdGpu::DataFormat::Format32_32: + case AmdGpu::DataFormat::Format32_32_32: + case AmdGpu::DataFormat::Format32_32_32_32: { + ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint || + num_fmt == AmdGpu::NumberFormat::Float); + interpreted = ir.LoadBufferF32(num_components, handle, address, info); + break; + } + default: + UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt); + } + + // Pad to 4 components and apply additional modifications. + boost::container::static_vector<IR::Value, 4> components; + for (u32 i = 0; i < 4; i++) { + if (i < num_components) { + const auto component = + IR::F32{num_components == 1 ? interpreted : ir.CompositeExtract(interpreted, i)}; + components.push_back(ApplyReadNumberConversion(ir, component, num_conv)); + } else { + components.push_back(ir.Imm32(0.f)); + } + } + const auto swizzled = ApplySwizzle(ir, ir.CompositeConstruct(components), buffer.DstSelect()); + return swizzled; +} + +static void StoreBufferFormat(IR::IREmitter& ir, const AmdGpu::Buffer& buffer, + const IR::Value handle, const IR::U32 address, const IR::Value& value, + const IR::BufferInstInfo info) { + const auto data_fmt = buffer.GetDataFmt(); + const auto num_fmt = buffer.GetNumberFmt(); + const auto num_conv = buffer.GetNumberConversion(); + const auto num_components = AmdGpu::NumComponents(buffer.GetDataFmt()); + + // Extract actual number of components and apply additional modifications. + const auto swizzled = ApplySwizzle(ir, value, buffer.DstSelect().Inverse()); + boost::container::static_vector<IR::Value, 4> components; + for (u32 i = 0; i < num_components; i++) { + const auto component = IR::F32{ir.CompositeExtract(swizzled, i)}; + components.push_back(ApplyWriteNumberConversion(ir, component, num_conv)); + } + const auto real_value = + components.size() == 1 ?
components[0] : ir.CompositeConstruct(components); + + switch (data_fmt) { + case AmdGpu::DataFormat::FormatInvalid: + break; + case AmdGpu::DataFormat::Format8: { + const auto packed = + ir.Pack4x8(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f), + ir.Imm32(0.f))); + ir.StoreBufferU8(handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format8_8: { + const auto packed = + ir.Pack4x8(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0), + ir.CompositeExtract(real_value, 1), + ir.Imm32(0.f), ir.Imm32(0.f))); + ir.StoreBufferU16(handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format8_8_8_8: { + auto packed = ir.Pack4x8(num_fmt, real_value); + ir.StoreBufferU32(1, handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format16: { + const auto packed = ir.Pack2x16(num_fmt, ir.CompositeConstruct(real_value, ir.Imm32(0.f))); + ir.StoreBufferU16(handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format16_16: { + const auto packed = ir.Pack2x16(num_fmt, real_value); + ir.StoreBufferU32(1, handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format10_11_11: { + const auto packed = ir.Pack10_11_11(num_fmt, real_value); + ir.StoreBufferU32(1, handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format2_10_10_10: { + const auto packed = ir.Pack2_10_10_10(num_fmt, real_value); + ir.StoreBufferU32(1, handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format16_16_16_16: { + const auto packed = ir.CompositeConstruct( + ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 0), + ir.CompositeExtract(real_value, 1))), + ir.Pack2x16(num_fmt, ir.CompositeConstruct(ir.CompositeExtract(real_value, 2), + ir.CompositeExtract(real_value, 3)))); + ir.StoreBufferU32(2, handle, address, packed, info); + break; + } + case AmdGpu::DataFormat::Format32: + case AmdGpu::DataFormat::Format32_32: + case AmdGpu::DataFormat::Format32_32_32: + case AmdGpu::DataFormat::Format32_32_32_32: { + ASSERT(num_fmt == AmdGpu::NumberFormat::Uint || num_fmt == AmdGpu::NumberFormat::Sint || + num_fmt == AmdGpu::NumberFormat::Float); + ir.StoreBufferF32(num_components, handle, address, real_value, info); + break; + } + default: + UNREACHABLE_MSG("Unsupported buffer data format: {}", data_fmt); + } +} + +static void LowerBufferFormatInst(IR::Block& block, IR::Inst& inst, Info& info) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto desc{info.buffers[inst.Arg(0).U32()]}; + const auto buffer{desc.GetSharp(info)}; + + if (IsBufferFormatLoad(inst)) { + const auto interpreted = LoadBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)}, + inst.Flags<IR::BufferInstInfo>()); + inst.ReplaceUsesWithAndRemove(interpreted); + } else if (IsBufferFormatStore(inst)) { + StoreBufferFormat(ir, buffer, inst.Arg(0), IR::U32{inst.Arg(1)}, inst.Arg(2), + inst.Flags<IR::BufferInstInfo>()); + inst.Invalidate(); + } +} + +void LowerBufferFormatToRaw(IR::Program& program) { + auto& info = program.info; + for (IR::Block* const block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (IsBufferFormatLoad(inst) || IsBufferFormatStore(inst)) { + LowerBufferFormatInst(*block, inst, info); + } + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index c5f98e5b9..029558d9e 100644 ---
a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include #include "shader_recompiler/info.h" #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/breadth_first_search.h" @@ -37,10 +35,17 @@ bool IsBufferAtomic(const IR::Inst& inst) { bool IsBufferStore(const IR::Inst& inst) { switch (inst.GetOpcode()) { + case IR::Opcode::StoreBufferU8: + case IR::Opcode::StoreBufferU16: case IR::Opcode::StoreBufferU32: case IR::Opcode::StoreBufferU32x2: case IR::Opcode::StoreBufferU32x3: case IR::Opcode::StoreBufferU32x4: + case IR::Opcode::StoreBufferF32: + case IR::Opcode::StoreBufferF32x2: + case IR::Opcode::StoreBufferF32x3: + case IR::Opcode::StoreBufferF32x4: + case IR::Opcode::StoreBufferFormatF32: return true; default: return IsBufferAtomic(inst); @@ -49,10 +54,17 @@ bool IsBufferStore(const IR::Inst& inst) { bool IsBufferInstruction(const IR::Inst& inst) { switch (inst.GetOpcode()) { + case IR::Opcode::LoadBufferU8: + case IR::Opcode::LoadBufferU16: case IR::Opcode::LoadBufferU32: case IR::Opcode::LoadBufferU32x2: case IR::Opcode::LoadBufferU32x3: case IR::Opcode::LoadBufferU32x4: + case IR::Opcode::LoadBufferF32: + case IR::Opcode::LoadBufferF32x2: + case IR::Opcode::LoadBufferF32x3: + case IR::Opcode::LoadBufferF32x4: + case IR::Opcode::LoadBufferFormatF32: case IR::Opcode::ReadConstBuffer: return true; default: @@ -65,34 +77,6 @@ bool IsDataRingInstruction(const IR::Inst& inst) { inst.GetOpcode() == IR::Opcode::DataConsume; } -bool IsTextureBufferInstruction(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 || - inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32; -} - -bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) { - switch (num_format) { - case AmdGpu::NumberFormat::Float: - switch (data_format) { - case AmdGpu::DataFormat::Format16: - case AmdGpu::DataFormat::Format16_16: - case AmdGpu::DataFormat::Format16_16_16_16: - return true; - default: - return false; - } - case AmdGpu::NumberFormat::Unorm: - case AmdGpu::NumberFormat::Snorm: - case AmdGpu::NumberFormat::Uscaled: - case AmdGpu::NumberFormat::Sscaled: - case AmdGpu::NumberFormat::Uint: - case AmdGpu::NumberFormat::Sint: - case AmdGpu::NumberFormat::SnormNz: - default: - return false; - } -} - IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { return IR::Type::U32; } @@ -132,8 +116,7 @@ bool IsImageInstruction(const IR::Inst& inst) { class Descriptors { public: explicit Descriptors(Info& info_) - : info{info_}, buffer_resources{info_.buffers}, - texture_buffer_resources{info_.texture_buffers}, image_resources{info_.images}, + : info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images}, sampler_resources{info_.samplers}, fmask_resources(info_.fmasks) {} u32 Add(const BufferResource& desc) { @@ -147,15 +130,7 @@ public: auto& buffer = buffer_resources[index]; buffer.used_types |= desc.used_types; buffer.is_written |= desc.is_written; - return index; - } - - u32 Add(const TextureBufferResource& desc) { - const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) { - return desc.sharp_idx == existing.sharp_idx; - })}; - auto& buffer = texture_buffer_resources[index]; - buffer.is_written |= desc.is_written; + buffer.is_formatted |= desc.is_formatted; return index; } 
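Aside (illustrative, not part of the patch): the Descriptors::Add hunk above deduplicates buffer bindings by sharp index and ORs the usage flags together, so a buffer reached by both raw and format instructions ends up as a single binding with is_written and is_formatted both set. A minimal stand-alone sketch of that merge pattern, using simplified stand-in types (FakeBufferResource, AddBuffer are made-up names, not the emulator's structs):

#include <cstdint>
#include <vector>

// Simplified stand-in for a tracked buffer resource; field names mirror the patch.
struct FakeBufferResource {
    uint32_t sharp_idx;
    bool is_written = false;
    bool is_formatted = false;
};

// Returns the binding index, merging usage flags when the same sharp is seen again.
static uint32_t AddBuffer(std::vector<FakeBufferResource>& list, const FakeBufferResource& desc) {
    for (uint32_t i = 0; i < list.size(); ++i) {
        if (list[i].sharp_idx == desc.sharp_idx) {
            list[i].is_written |= desc.is_written;     // any raw or format store marks it written
            list[i].is_formatted |= desc.is_formatted; // any format load/store marks it formatted
            return i;
        }
    }
    list.push_back(desc);
    return static_cast<uint32_t>(list.size() - 1);
}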
@@ -196,7 +171,6 @@ private: const Info& info; BufferResourceList& buffer_resources; - TextureBufferResourceList& texture_buffer_resources; ImageResourceList& image_resources; SamplerResourceList& sampler_resources; FMaskResourceList& fmask_resources; @@ -313,6 +287,8 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& .sharp_idx = sharp, .used_types = BufferDataType(inst, buffer.GetNumberFmt()), .is_written = IsBufferStore(inst), + .is_formatted = inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 || + inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32, }); } @@ -321,21 +297,6 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& inst.SetArg(0, ir.Imm32(binding)); } -void PatchTextureBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, - Descriptors& descriptors) { - const IR::Inst* handle = inst.Arg(0).InstRecursive(); - const IR::Inst* producer = handle->Arg(0).InstRecursive(); - const auto sharp = TrackSharp(producer, info); - const s32 binding = descriptors.Add(TextureBufferResource{ - .sharp_idx = sharp, - .is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32, - }); - - // Replace handle with binding index in texture buffer resource list. - IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - inst.SetArg(0, ir.Imm32(binding)); -} - void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { const auto pred = [](const IR::Inst* inst) -> std::optional { const auto opcode = inst->GetOpcode(); @@ -553,36 +514,6 @@ void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) { inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, buffer.stride)); } -void PatchTextureBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) { - const auto handle = inst.Arg(0); - const auto buffer_res = info.texture_buffers[handle.U32()]; - const auto buffer = buffer_res.GetSharp(info); - - // Only linear addressing with index is supported currently, since we cannot yet - // address with sub-texel granularity. - const auto inst_info = inst.Flags(); - ASSERT_MSG(!buffer.swizzle_enable && !inst_info.offset_enable && inst_info.inst_offset == 0, - "Unsupported texture buffer address mode."); - - IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - // Stride of 1 to get an index into formatted data. See above addressing limitations. 
- inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, 1U)); - - if (inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32) { - const auto swizzled = ApplySwizzle(ir, inst.Arg(2), buffer.DstSelect().Inverse()); - const auto converted = - ApplyWriteNumberConversionVec4(ir, swizzled, buffer.GetNumberConversion()); - inst.SetArg(2, converted); - } else if (inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32) { - const auto inst_info = inst.Flags(); - const auto texel = ir.LoadBufferFormat(inst.Arg(0), inst.Arg(1), inst_info); - const auto swizzled = ApplySwizzle(ir, texel, buffer.DstSelect()); - const auto converted = - ApplyReadNumberConversionVec4(ir, swizzled, buffer.GetNumberConversion()); - inst.ReplaceUsesWith(converted); - } -} - IR::Value FixCubeCoords(IR::IREmitter& ir, const AmdGpu::Image& image, const IR::Value& x, const IR::Value& y, const IR::Value& face) { if (!image.IsCube()) { @@ -861,8 +792,6 @@ void ResourceTrackingPass(IR::Program& program) { for (IR::Inst& inst : block->Instructions()) { if (IsBufferInstruction(inst)) { PatchBufferSharp(*block, inst, info, descriptors); - } else if (IsTextureBufferInstruction(inst)) { - PatchTextureBufferSharp(*block, inst, info, descriptors); } else if (IsImageInstruction(inst)) { PatchImageSharp(*block, inst, info, descriptors); } else if (IsDataRingInstruction(inst)) { @@ -876,8 +805,6 @@ void ResourceTrackingPass(IR::Program& program) { for (IR::Inst& inst : block->Instructions()) { if (IsBufferInstruction(inst)) { PatchBufferArgs(*block, inst, info); - } else if (IsTextureBufferInstruction(inst)) { - PatchTextureBufferArgs(*block, inst, info); } else if (IsImageInstruction(inst)) { PatchImageArgs(*block, inst, info); } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index 7fd5b75ff..f3a1fc9a8 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -50,12 +50,6 @@ void Visit(Info& info, const IR::Inst& inst) { case IR::Opcode::ImageWrite: info.has_storage_images = true; break; - case IR::Opcode::LoadBufferFormatF32: - info.has_texel_buffers = true; - break; - case IR::Opcode::StoreBufferFormatF32: - info.has_image_buffers = true; - break; case IR::Opcode::QuadShuffle: info.uses_group_quad = true; break; @@ -82,6 +76,12 @@ void Visit(Info& info, const IR::Inst& inst) { case IR::Opcode::ReadConst: info.has_readconst = true; break; + case IR::Opcode::PackUfloat10_11_11: + info.uses_pack_10_11_11 = true; + break; + case IR::Opcode::UnpackUfloat10_11_11: + info.uses_unpack_10_11_11 = true; + break; default: break; } diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 01518ab8f..a9f7aeb40 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -88,6 +88,7 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::FlattenExtendedUserdataPass(program); Shader::Optimization::ResourceTrackingPass(program); + Shader::Optimization::LowerBufferFormatToRaw(program); Shader::Optimization::IdentityRemovalPass(program.blocks); Shader::Optimization::DeadCodeEliminationPass(program); Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index 
2083d11a9..4328193b5 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -19,30 +19,30 @@ struct VsAttribSpecialization { }; struct BufferSpecialization { - u16 stride : 14; - u16 is_storage : 1; - u16 swizzle_enable : 1; - u8 index_stride : 2 = 0; - u8 element_size : 2 = 0; + u32 stride : 14; + u32 is_storage : 1; + u32 is_formatted : 1; + u32 swizzle_enable : 1; + u32 data_format : 6; + u32 num_format : 4; + u32 index_stride : 2; + u32 element_size : 2; u32 size = 0; + AmdGpu::CompMapping dst_select{}; + AmdGpu::NumberConversion num_conversion{}; bool operator==(const BufferSpecialization& other) const { return stride == other.stride && is_storage == other.is_storage && - swizzle_enable == other.swizzle_enable && + is_formatted == other.is_formatted && swizzle_enable == other.swizzle_enable && + (!is_formatted || + (data_format == other.data_format && num_format == other.num_format && + dst_select == other.dst_select && num_conversion == other.num_conversion)) && (!swizzle_enable || (index_stride == other.index_stride && element_size == other.element_size)) && (size >= other.size || is_storage); } }; -struct TextureBufferSpecialization { - bool is_integer = false; - AmdGpu::CompMapping dst_select{}; - AmdGpu::NumberConversion num_conversion{}; - - auto operator<=>(const TextureBufferSpecialization&) const = default; -}; - struct ImageSpecialization { AmdGpu::ImageType type = AmdGpu::ImageType::Color2D; bool is_integer = false; @@ -82,7 +82,6 @@ struct StageSpecialization { boost::container::small_vector vs_attribs; std::bitset bitset{}; boost::container::small_vector buffers; - boost::container::small_vector tex_buffers; boost::container::small_vector images; boost::container::small_vector fmasks; boost::container::small_vector samplers; @@ -111,7 +110,14 @@ struct StageSpecialization { [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { spec.stride = sharp.GetStride(); spec.is_storage = desc.IsStorage(sharp); + spec.is_formatted = desc.is_formatted; spec.swizzle_enable = sharp.swizzle_enable; + if (spec.is_formatted) { + spec.data_format = static_cast<u32>(sharp.GetDataFmt()); + spec.num_format = static_cast<u32>(sharp.GetNumberFmt()); + spec.dst_select = sharp.DstSelect(); + spec.num_conversion = sharp.GetNumberConversion(); + } if (spec.swizzle_enable) { spec.index_stride = sharp.index_stride; spec.element_size = sharp.element_size; @@ -120,12 +126,6 @@ struct StageSpecialization { spec.size = sharp.GetSize(); } }); - ForEachSharp(binding, tex_buffers, info->texture_buffers, - [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { - spec.is_integer = AmdGpu::IsInteger(sharp.GetNumberFmt()); - spec.dst_select = sharp.DstSelect(); - spec.num_conversion = sharp.GetNumberConversion(); - }); ForEachSharp(binding, images, info->images, [](auto& spec, const auto& desc, AmdGpu::Image sharp) { spec.type = sharp.GetViewType(desc.is_array); @@ -217,11 +217,6 @@ struct StageSpecialization { return false; } } - for (u32 i = 0; i < tex_buffers.size(); i++) { - if (other.bitset[binding++] && tex_buffers[i] != other.tex_buffers[i]) { - return false; - } - } for (u32 i = 0; i < images.size(); i++) { if (other.bitset[binding++] && images[i] != other.images[i]) { return false; diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp index a8d1271c6..15ef746cd 100644 --- a/src/video_core/buffer_cache/buffer.cpp +++ b/src/video_core/buffer_cache/buffer.cpp @@ -95,8 +95,7 @@ Buffer::Buffer(const Vulkan::Instance&
instance_, Vulkan::Scheduler& scheduler_, // Create buffer object. const vk::BufferCreateInfo buffer_ci = { .size = size_bytes, - // When maintenance5 is not supported, use all flags since we can't add flags to views. - .usage = instance->IsMaintenance5Supported() ? flags : AllFlags, + .usage = flags, }; VmaAllocationInfo alloc_info{}; buffer.Create(buffer_ci, usage, &alloc_info); @@ -113,29 +112,6 @@ Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } -vk::BufferView Buffer::View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt, - AmdGpu::NumberFormat nfmt) { - const vk::BufferUsageFlags2CreateInfoKHR usage_flags = { - .usage = is_written ? vk::BufferUsageFlagBits2KHR::eStorageTexelBuffer - : vk::BufferUsageFlagBits2KHR::eUniformTexelBuffer, - }; - const vk::BufferViewCreateInfo view_ci = { - .pNext = instance->IsMaintenance5Supported() ? &usage_flags : nullptr, - .buffer = buffer.buffer, - .format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt), - .offset = offset, - .range = size, - }; - const auto [view_result, view] = instance->GetDevice().createBufferView(view_ci); - ASSERT_MSG(view_result == vk::Result::eSuccess, "Failed to create buffer view: {}", - vk::to_string(view_result)); - scheduler->DeferOperation( - [view, device = instance->GetDevice()] { device.destroyBufferView(view); }); - Vulkan::SetObjectName(instance->GetDevice(), view, "BufferView {:#x}:{:#x}", cpu_addr + offset, - size); - return view; -} - constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index 63391a180..ec92a0ebf 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -32,13 +32,12 @@ enum class MemoryUsage { }; constexpr vk::BufferUsageFlags ReadFlags = - vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eUniformTexelBuffer | - vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eIndexBuffer | - vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndirectBuffer; + vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eUniformBuffer | + vk::BufferUsageFlagBits::eIndexBuffer | vk::BufferUsageFlagBits::eVertexBuffer | + vk::BufferUsageFlagBits::eIndirectBuffer; -constexpr vk::BufferUsageFlags AllFlags = ReadFlags | vk::BufferUsageFlagBits::eTransferDst | - vk::BufferUsageFlagBits::eStorageTexelBuffer | - vk::BufferUsageFlagBits::eStorageBuffer; +constexpr vk::BufferUsageFlags AllFlags = + ReadFlags | vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eStorageBuffer; struct UniqueBuffer { explicit UniqueBuffer(vk::Device device, VmaAllocator allocator); @@ -83,9 +82,6 @@ public: Buffer& operator=(Buffer&&) = default; Buffer(Buffer&&) = default; - vk::BufferView View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt, - AmdGpu::NumberFormat nfmt); - /// Increases the likeliness of this being a stream buffer void IncreaseStreamScore(int score) noexcept { stream_score += score; diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index 59a0802bb..5c02ef39f 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -352,12 +352,9 @@ vk::ComponentMapping ComponentMapping(AmdGpu::CompMapping comp_mapping) { }; } -static 
constexpr vk::FormatFeatureFlags2 BufferRead = - vk::FormatFeatureFlagBits2::eUniformTexelBuffer | vk::FormatFeatureFlagBits2::eVertexBuffer; -static constexpr vk::FormatFeatureFlags2 BufferWrite = - vk::FormatFeatureFlagBits2::eStorageTexelBuffer | - vk::FormatFeatureFlagBits2::eStorageReadWithoutFormat | - vk::FormatFeatureFlagBits2::eStorageWriteWithoutFormat; +// Texel buffer feature flags are not needed as format is interpreted in-shader. +static constexpr vk::FormatFeatureFlags2 BufferRead = vk::FormatFeatureFlagBits2::eVertexBuffer; +static constexpr vk::FormatFeatureFlags2 BufferWrite = static_cast<vk::FormatFeatureFlags2>(0); static constexpr vk::FormatFeatureFlags2 ImageRead = vk::FormatFeatureFlagBits2::eTransferSrc | vk::FormatFeatureFlagBits2::eTransferDst | vk::FormatFeatureFlagBits2::eSampledImage; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index afa598fca..0832f65a2 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -55,15 +55,6 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler .stageFlags = vk::ShaderStageFlagBits::eCompute, }); } - for (const auto& tex_buffer : info->texture_buffers) { - bindings.push_back({ - .binding = binding++, - .descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer - : vk::DescriptorType::eUniformTexelBuffer, - .descriptorCount = 1, - .stageFlags = vk::ShaderStageFlagBits::eCompute, - }); - } for (const auto& image : info->images) { bindings.push_back({ .binding = binding++, diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 330a8ab7f..588754c00 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -375,15 +375,6 @@ void GraphicsPipeline::BuildDescSetLayout() { .stageFlags = gp_stage_flags, }); } - for (const auto& tex_buffer : stage->texture_buffers) { - bindings.push_back({ - .binding = binding++, - .descriptorType = tex_buffer.is_written ?
vk::DescriptorType::eStorageTexelBuffer - : vk::DescriptorType::eUniformTexelBuffer, - .descriptorCount = 1, - .stageFlags = gp_stage_flags, - }); - } for (const auto& image : stage->images) { bindings.push_back({ .binding = binding++, diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 319f10278..e64cae87d 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -268,7 +268,6 @@ bool Instance::CreateDevice() { null_descriptor = feature_chain.get().nullDescriptor; } - maintenance5 = add_extension(VK_KHR_MAINTENANCE_5_EXTENSION_NAME); custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); depth_clip_control = add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); @@ -376,9 +375,6 @@ bool Instance::CreateDevice() { .maintenance4 = true, }, // Other extensions - vk::PhysicalDeviceMaintenance5FeaturesKHR{ - .maintenance5 = true, - }, vk::PhysicalDeviceCustomBorderColorFeaturesEXT{ .customBorderColors = true, .customBorderColorWithoutFormat = true, @@ -414,9 +410,6 @@ bool Instance::CreateDevice() { if (!maintenance4) { device_chain.unlink(); } - if (!maintenance5) { - device_chain.unlink(); - } if (!custom_border_color) { device_chain.unlink(); } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 532696f0f..1748fcd59 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -114,11 +114,6 @@ public: return null_descriptor; } - /// Returns true when VK_KHR_maintenance5 is supported. - bool IsMaintenance5Supported() const { - return maintenance5; - } - /// Returns true when VK_KHR_fragment_shader_barycentric is supported. bool IsFragmentShaderBarycentricSupported() const { return fragment_shader_barycentric; @@ -209,11 +204,6 @@ public: return properties.limits.minStorageBufferOffsetAlignment; } - /// Returns the minimum required alignment for texel buffers - vk::DeviceSize TexelBufferMinAlignment() const { - return properties.limits.minTexelBufferOffsetAlignment; - } - /// Returns the minimum alignemt required for accessing host-mapped device memory vk::DeviceSize NonCoherentAtomSize() const { return properties.limits.nonCoherentAtomSize; @@ -229,11 +219,6 @@ public: return properties.limits.maxComputeSharedMemorySize; } - /// Returns the maximum supported elements in a texel buffer - u32 MaxTexelBufferElements() const { - return properties.limits.maxTexelBufferElements; - } - /// Returns the maximum sampler LOD bias. 
float MaxSamplerLodBias() const { return properties.limits.maxSamplerLodBias; @@ -317,7 +302,6 @@ private: bool dynamic_color_write_mask{}; bool vertex_input_dynamic_state{}; bool null_descriptor{}; - bool maintenance5{}; bool list_restart{}; bool legacy_vertex_attributes{}; bool shader_stencil_export{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index d8f6a08d0..16d2187db 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -29,8 +29,6 @@ using Shader::VsOutput; constexpr static std::array DescriptorHeapSizes = { vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192}, vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024}, - vk::DescriptorPoolSize{vk::DescriptorType::eUniformTexelBuffer, 128}, - vk::DescriptorPoolSize{vk::DescriptorType::eStorageTexelBuffer, 128}, vk::DescriptorPoolSize{vk::DescriptorType::eSampledImage, 8192}, vk::DescriptorPoolSize{vk::DescriptorType::eSampler, 1024}, }; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 8da27de00..6f979a734 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -435,28 +435,6 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { if (pipeline->IsCompute()) { const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); - // Most of the time when a metadata is updated with a shader it gets cleared. It means - // we can skip the whole dispatch and update the tracked state instead. Also, it is not - // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we - // will need its full emulation anyways. For cases of metadata read a warning will be - // logged. - const auto IsMetaUpdate = [&](const auto& desc) { - const auto sharp = desc.GetSharp(info); - const VAddr address = sharp.base_address; - if (desc.is_written) { - // Assume all slices were updates - if (texture_cache.ClearMeta(address)) { - LOG_TRACE(Render_Vulkan, "Metadata update skipped"); - return true; - } - } else { - if (texture_cache.IsMeta(address)) { - LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)"); - } - } - return false; - }; - // Assume if a shader reads and writes metas at the same time, it is a copy shader. bool meta_read = false; for (const auto& desc : info.buffers) { @@ -469,23 +447,26 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { } } - for (const auto& desc : info.texture_buffers) { - if (!desc.is_written) { - const VAddr address = desc.GetSharp(info).base_address; - meta_read = texture_cache.IsMeta(address); - } - } - + // Most of the time when a metadata is updated with a shader it gets cleared. It means + // we can skip the whole dispatch and update the tracked state instead. Also, it is not + // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we + // will need its full emulation anyways. For cases of metadata read a warning will be + // logged. 
if (!meta_read) { for (const auto& desc : info.buffers) { - if (IsMetaUpdate(desc)) { - return false; - } - } - - for (const auto& desc : info.texture_buffers) { - if (IsMetaUpdate(desc)) { - return false; + const auto sharp = desc.GetSharp(info); + const VAddr address = sharp.base_address; + if (desc.is_written) { + // Assume all slices were updates + if (texture_cache.ClearMeta(address)) { + LOG_TRACE(Render_Vulkan, "Metadata update skipped"); + return false; + } + } else { + if (texture_cache.IsMeta(address)) { + LOG_WARNING(Render_Vulkan, + "Unexpected metadata read by a CS shader (buffer)"); + } } } } @@ -541,19 +522,6 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding } } - texbuffer_bindings.clear(); - - for (const auto& desc : stage.texture_buffers) { - const auto vsharp = desc.GetSharp(stage); - if (vsharp.base_address != 0 && vsharp.GetSize() > 0 && - vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) { - const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, vsharp.GetSize()); - texbuffer_bindings.emplace_back(buffer_id, vsharp); - } else { - texbuffer_bindings.emplace_back(VideoCore::BufferId{}, vsharp); - } - } - // Bind a SSBO to act as shared memory in case of not being able to use a workgroup buffer // (e.g. when the compute shared memory is bigger than the GPU's shared memory) if (stage.has_emulated_shared_memory) { @@ -601,8 +569,9 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE); } } else { - const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( - vsharp.base_address, vsharp.GetSize(), desc.is_written, false, buffer_id); + const auto [vk_buffer, offset] = + buffer_cache.ObtainBuffer(vsharp.base_address, vsharp.GetSize(), desc.is_written, + desc.is_formatted, buffer_id); const u32 alignment = is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); const u32 offset_aligned = Common::AlignDown(offset, alignment); @@ -617,6 +586,9 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding vk::PipelineStageFlagBits2::eAllCommands)) { buffer_barriers.emplace_back(*barrier); } + if (desc.is_written && desc.is_formatted) { + texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, vsharp.GetSize()); + } } set_writes.push_back({ @@ -630,56 +602,6 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding }); ++binding.buffer; } - - for (u32 i = 0; i < texbuffer_bindings.size(); i++) { - const auto& [buffer_id, vsharp] = texbuffer_bindings[i]; - const auto& desc = stage.texture_buffers[i]; - // Fallback format for null buffer view; never used in valid buffer case. - const auto data_fmt = vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid - ? 
vsharp.GetDataFmt() - : AmdGpu::DataFormat::Format8; - const u32 fmt_stride = AmdGpu::NumBits(data_fmt) >> 3; - vk::BufferView buffer_view; - if (buffer_id) { - const u32 alignment = instance.TexelBufferMinAlignment(); - const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( - vsharp.base_address, vsharp.GetSize(), desc.is_written, true, buffer_id); - const u32 buf_stride = vsharp.GetStride(); - ASSERT_MSG(buf_stride % fmt_stride == 0, - "Texel buffer stride must match format stride"); - const u32 offset_aligned = Common::AlignDown(offset, alignment); - const u32 adjust = offset - offset_aligned; - ASSERT(adjust % fmt_stride == 0); - push_data.AddTexelOffset(binding.buffer, buf_stride / fmt_stride, adjust / fmt_stride); - buffer_view = vk_buffer->View(offset_aligned, vsharp.GetSize() + adjust, - desc.is_written, data_fmt, vsharp.GetNumberFmt()); - if (auto barrier = - vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite - : vk::AccessFlagBits2::eShaderRead, - vk::PipelineStageFlagBits2::eAllCommands)) { - buffer_barriers.emplace_back(*barrier); - } - if (desc.is_written) { - texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, vsharp.GetSize()); - } - } else if (instance.IsNullDescriptorSupported()) { - buffer_view = VK_NULL_HANDLE; - } else { - buffer_view = - null_buffer.View(0, fmt_stride, desc.is_written, data_fmt, vsharp.GetNumberFmt()); - } - - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = desc.is_written ? vk::DescriptorType::eStorageTexelBuffer - : vk::DescriptorType::eUniformTexelBuffer, - .pTexelBufferView = &buffer_views.emplace_back(buffer_view), - }); - ++binding.buffer; - } } void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 6e1a1d82e..db458662c 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -120,8 +120,6 @@ private: using BufferBindingInfo = std::pair; boost::container::static_vector buffer_bindings; - using TexBufferBindingInfo = std::pair; - boost::container::static_vector texbuffer_bindings; using ImageBindingInfo = std::pair; boost::container::static_vector image_bindings; }; diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.cpp b/src/video_core/renderer_vulkan/vk_shader_hle.cpp index ff78f5d24..d73fdbeb1 100644 --- a/src/video_core/renderer_vulkan/vk_shader_hle.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_hle.cpp @@ -19,9 +19,9 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info, auto& buffer_cache = rasterizer.GetBufferCache(); // Copy shader defines three formatted buffers as inputs: control, source, and destination. - const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info); - const auto src_buf_sharp = info.texture_buffers[1].GetSharp(info); - const auto dst_buf_sharp = info.texture_buffers[2].GetSharp(info); + const auto ctl_buf_sharp = info.buffers[0].GetSharp(info); + const auto src_buf_sharp = info.buffers[1].GetSharp(info); + const auto dst_buf_sharp = info.buffers[2].GetSharp(info); const auto buf_stride = src_buf_sharp.GetStride(); ASSERT(buf_stride == dst_buf_sharp.GetStride()); @@ -95,12 +95,10 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info, } // Obtain buffers for the total source and destination ranges. 
- const auto [src_buf, src_buf_offset] = - buffer_cache.ObtainBuffer(src_buf_sharp.base_address + src_offset_min, - src_offset_max - src_offset_min, false, false); - const auto [dst_buf, dst_buf_offset] = - buffer_cache.ObtainBuffer(dst_buf_sharp.base_address + dst_offset_min, - dst_offset_max - dst_offset_min, true, false); + const auto [src_buf, src_buf_offset] = buffer_cache.ObtainBuffer( + src_buf_sharp.base_address + src_offset_min, src_offset_max - src_offset_min, false); + const auto [dst_buf, dst_buf_offset] = buffer_cache.ObtainBuffer( + dst_buf_sharp.base_address + dst_offset_min, dst_offset_max - dst_offset_min, true); // Apply found buffer base. const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start);
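Closing aside (illustrative, not from the diff): with texel buffer views removed, packed formats such as 10_11_11 are now decoded by the new Pack/Unpack opcodes inside the shader, and the uses_unpack_10_11_11 flag lets the backend pick an emulated path. As a reference for the assumed bit layout (x in bits 0-10 and y in bits 11-21 as 11-bit unsigned floats, z in bits 22-31 as a 10-bit unsigned float; 5-bit exponents with bias 15), a CPU-side decode could look like the sketch below; the helper names are made up:

#include <cmath>
#include <cstdint>

// Decodes one unsigned small float (no sign bit, 5-bit exponent, 5 or 6 mantissa bits).
static float DecodeUnsignedFloat(uint32_t bits, int mantissa_bits) {
    const uint32_t exponent = bits >> mantissa_bits;
    const uint32_t mantissa = bits & ((1u << mantissa_bits) - 1u);
    if (exponent == 0u) { // denormal: 2^-14 * (mantissa / 2^mantissa_bits)
        return std::ldexp(static_cast<float>(mantissa), -14 - mantissa_bits);
    }
    if (exponent == 31u) { // all-ones exponent encodes Inf/NaN
        return mantissa != 0u ? NAN : INFINITY;
    }
    const float significand =
        1.0f + static_cast<float>(mantissa) / static_cast<float>(1u << mantissa_bits);
    return std::ldexp(significand, static_cast<int>(exponent) - 15);
}

// Reference decode of a packed 10_11_11 dword into three floats.
static void DecodeUfloat10_11_11(uint32_t packed, float out[3]) {
    out[0] = DecodeUnsignedFloat(packed & 0x7FFu, 6);         // x: 11 bits, 6-bit mantissa
    out[1] = DecodeUnsignedFloat((packed >> 11) & 0x7FFu, 6); // y: 11 bits, 6-bit mantissa
    out[2] = DecodeUnsignedFloat(packed >> 22, 5);            // z: 10 bits, 5-bit mantissa
}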