video_core: Account of runtime state changes when compiling shaders (#575)

* video_core: Compile shader permutations

* spirv: Only specific storage image format for atomics

* ir: Avoid cube coord patching for storage image

* spirv: Fix default attributes

* data_share: Add more instructions

* video_core: Query storage flag with runtime state

* kernel: Use std::list for semaphore

* video_core: Use texture buffers for untyped format load/store

* buffer_cache: Limit view usage

* vk_pipeline_cache: Fix invalid iterator

* image_view: Reduce log spam when alpha=1 in storage swizzle

* video_core: More features and proper spirv feature detection

* video_core: Attempt no2 for specialization

* spirv: Remove conflict

* vk_shader_cache: Small cleanup
This commit is contained in:
TheTurtle 2024-08-29 19:29:54 +03:00 committed by GitHub
parent 790d19e59b
commit 66e96dd944
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
43 changed files with 1058 additions and 976 deletions

View file

@ -325,20 +325,8 @@ Value IREmitter::LoadBuffer(int num_dwords, const Value& handle, const Value& ad
}
}
Value IREmitter::LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
BufferInstInfo info) {
switch (num_dwords) {
case 1:
return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
case 2:
return Inst(Opcode::LoadBufferFormatF32x2, Flags{info}, handle, address);
case 3:
return Inst(Opcode::LoadBufferFormatF32x3, Flags{info}, handle, address);
case 4:
return Inst(Opcode::LoadBufferFormatF32x4, Flags{info}, handle, address);
default:
UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
}
Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info) {
return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
}
void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& address,
@ -409,24 +397,9 @@ Value IREmitter::BufferAtomicSwap(const Value& handle, const Value& address, con
return Inst(Opcode::BufferAtomicSwap32, Flags{info}, handle, address, value);
}
void IREmitter::StoreBufferFormat(int num_dwords, const Value& handle, const Value& address,
const Value& data, BufferInstInfo info) {
switch (num_dwords) {
case 1:
Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
break;
case 2:
Inst(Opcode::StoreBufferFormatF32x2, Flags{info}, handle, address, data);
break;
case 3:
Inst(Opcode::StoreBufferFormatF32x3, Flags{info}, handle, address, data);
break;
case 4:
Inst(Opcode::StoreBufferFormatF32x4, Flags{info}, handle, address, data);
break;
default:
UNREACHABLE_MSG("Invalid number of dwords {}", num_dwords);
}
void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
BufferInstInfo info) {
Inst(Opcode::StoreBufferFormatF32, Flags{info}, handle, address, data);
}
U32 IREmitter::LaneId() {

View file

@ -92,12 +92,12 @@ public:
[[nodiscard]] Value LoadBuffer(int num_dwords, const Value& handle, const Value& address,
BufferInstInfo info);
[[nodiscard]] Value LoadBufferFormat(int num_dwords, const Value& handle, const Value& address,
[[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address,
BufferInstInfo info);
void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data,
BufferInstInfo info);
void StoreBufferFormat(int num_dwords, const Value& handle, const Value& address,
const Value& data, BufferInstInfo info);
void StoreBufferFormat(const Value& handle, const Value& address, const Value& data,
BufferInstInfo info);
[[nodiscard]] Value BufferAtomicIAdd(const Value& handle, const Value& address,
const Value& value, BufferInstInfo info);

View file

@ -56,9 +56,6 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::StoreBufferF32x3:
case Opcode::StoreBufferF32x4:
case Opcode::StoreBufferFormatF32:
case Opcode::StoreBufferFormatF32x2:
case Opcode::StoreBufferFormatF32x3:
case Opcode::StoreBufferFormatF32x4:
case Opcode::StoreBufferU32:
case Opcode::BufferAtomicIAdd32:
case Opcode::BufferAtomicSMin32:

View file

@ -79,19 +79,13 @@ OPCODE(LoadBufferF32, F32, Opaq
OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, )
OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, )
OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32, F32, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32x2, F32x2, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32x3, F32x3, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32x4, F32x4, Opaque, Opaque, )
OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, )
OPCODE(LoadBufferU32, U32, Opaque, Opaque, )
OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, )
OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, )
OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, )
OPCODE(StoreBufferF32x4, Void, Opaque, Opaque, F32x4, )
OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32, )
OPCODE(StoreBufferFormatF32x2, Void, Opaque, Opaque, F32x2, )
OPCODE(StoreBufferFormatF32x3, Void, Opaque, Opaque, F32x3, )
OPCODE(StoreBufferFormatF32x4, Void, Opaque, Opaque, F32x4, )
OPCODE(StoreBufferFormatF32, Void, Opaque, Opaque, F32x4, )
OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, )
// Buffer atomic operations

View file

@ -3,6 +3,7 @@
#include <algorithm>
#include <boost/container/small_vector.hpp>
#include "common/alignment.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/breadth_first_search.h"
#include "shader_recompiler/ir/ir_emitter.h"
@ -45,10 +46,6 @@ bool IsBufferStore(const IR::Inst& inst) {
case IR::Opcode::StoreBufferF32x2:
case IR::Opcode::StoreBufferF32x3:
case IR::Opcode::StoreBufferF32x4:
case IR::Opcode::StoreBufferFormatF32:
case IR::Opcode::StoreBufferFormatF32x2:
case IR::Opcode::StoreBufferFormatF32x3:
case IR::Opcode::StoreBufferFormatF32x4:
case IR::Opcode::StoreBufferU32:
return true;
default:
@ -62,10 +59,6 @@ bool IsBufferInstruction(const IR::Inst& inst) {
case IR::Opcode::LoadBufferF32x2:
case IR::Opcode::LoadBufferF32x3:
case IR::Opcode::LoadBufferF32x4:
case IR::Opcode::LoadBufferFormatF32:
case IR::Opcode::LoadBufferFormatF32x2:
case IR::Opcode::LoadBufferFormatF32x3:
case IR::Opcode::LoadBufferFormatF32x4:
case IR::Opcode::LoadBufferU32:
case IR::Opcode::ReadConstBuffer:
case IR::Opcode::ReadConstBufferU32:
@ -75,6 +68,11 @@ bool IsBufferInstruction(const IR::Inst& inst) {
}
}
bool IsTextureBufferInstruction(const IR::Inst& inst) {
return inst.GetOpcode() == IR::Opcode::LoadBufferFormatF32 ||
inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32;
}
static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
switch (num_format) {
case AmdGpu::NumberFormat::Float:
@ -100,28 +98,6 @@ static bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_for
IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
switch (inst.GetOpcode()) {
case IR::Opcode::LoadBufferFormatF32:
case IR::Opcode::LoadBufferFormatF32x2:
case IR::Opcode::LoadBufferFormatF32x3:
case IR::Opcode::LoadBufferFormatF32x4:
case IR::Opcode::StoreBufferFormatF32:
case IR::Opcode::StoreBufferFormatF32x2:
case IR::Opcode::StoreBufferFormatF32x3:
case IR::Opcode::StoreBufferFormatF32x4:
switch (num_format) {
case AmdGpu::NumberFormat::Unorm:
case AmdGpu::NumberFormat::Snorm:
case AmdGpu::NumberFormat::Uscaled:
case AmdGpu::NumberFormat::Sscaled:
case AmdGpu::NumberFormat::Uint:
case AmdGpu::NumberFormat::Sint:
case AmdGpu::NumberFormat::SnormNz:
return IR::Type::U32;
case AmdGpu::NumberFormat::Float:
return IR::Type::F32;
default:
UNREACHABLE();
}
case IR::Opcode::LoadBufferF32:
case IR::Opcode::LoadBufferF32x2:
case IR::Opcode::LoadBufferF32x3:
@ -143,20 +119,8 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
}
}
bool IsImageInstruction(const IR::Inst& inst) {
bool IsImageAtomicInstruction(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::ImageSampleExplicitLod:
case IR::Opcode::ImageSampleImplicitLod:
case IR::Opcode::ImageSampleDrefExplicitLod:
case IR::Opcode::ImageSampleDrefImplicitLod:
case IR::Opcode::ImageFetch:
case IR::Opcode::ImageGather:
case IR::Opcode::ImageGatherDref:
case IR::Opcode::ImageQueryDimensions:
case IR::Opcode::ImageQueryLod:
case IR::Opcode::ImageGradient:
case IR::Opcode::ImageRead:
case IR::Opcode::ImageWrite:
case IR::Opcode::ImageAtomicIAdd32:
case IR::Opcode::ImageAtomicSMin32:
case IR::Opcode::ImageAtomicUMin32:
@ -178,20 +142,27 @@ bool IsImageStorageInstruction(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::ImageWrite:
case IR::Opcode::ImageRead:
case IR::Opcode::ImageAtomicIAdd32:
case IR::Opcode::ImageAtomicSMin32:
case IR::Opcode::ImageAtomicUMin32:
case IR::Opcode::ImageAtomicSMax32:
case IR::Opcode::ImageAtomicUMax32:
case IR::Opcode::ImageAtomicInc32:
case IR::Opcode::ImageAtomicDec32:
case IR::Opcode::ImageAtomicAnd32:
case IR::Opcode::ImageAtomicOr32:
case IR::Opcode::ImageAtomicXor32:
case IR::Opcode::ImageAtomicExchange32:
return true;
default:
return false;
return IsImageAtomicInstruction(inst);
}
}
bool IsImageInstruction(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::ImageSampleExplicitLod:
case IR::Opcode::ImageSampleImplicitLod:
case IR::Opcode::ImageSampleDrefExplicitLod:
case IR::Opcode::ImageSampleDrefImplicitLod:
case IR::Opcode::ImageFetch:
case IR::Opcode::ImageGather:
case IR::Opcode::ImageGatherDref:
case IR::Opcode::ImageQueryDimensions:
case IR::Opcode::ImageQueryLod:
case IR::Opcode::ImageGradient:
return true;
default:
return IsImageStorageInstruction(inst);
}
}
@ -214,7 +185,8 @@ u32 ImageOffsetArgumentPosition(const IR::Inst& inst) {
class Descriptors {
public:
explicit Descriptors(Info& info_)
: info{info_}, buffer_resources{info_.buffers}, image_resources{info_.images},
: info{info_}, buffer_resources{info_.buffers},
texture_buffer_resources{info_.texture_buffers}, image_resources{info_.images},
sampler_resources{info_.samplers} {}
u32 Add(const BufferResource& desc) {
@ -224,13 +196,21 @@ public:
desc.inline_cbuf == existing.inline_cbuf;
})};
auto& buffer = buffer_resources[index];
ASSERT(buffer.length == desc.length);
buffer.is_storage |= desc.is_storage;
buffer.used_types |= desc.used_types;
buffer.is_written |= desc.is_written;
return index;
}
u32 Add(const TextureBufferResource& desc) {
const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
})};
auto& buffer = texture_buffer_resources[index];
buffer.is_written |= desc.is_written;
return index;
}
u32 Add(const ImageResource& desc) {
const u32 index{Add(image_resources, desc, [&desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
@ -247,7 +227,7 @@ public:
return true;
}
// Samplers with different bindings might still be the same.
return existing.GetSsharp(info) == desc.GetSsharp(info);
return existing.GetSharp(info) == desc.GetSharp(info);
})};
return index;
}
@ -265,6 +245,7 @@ private:
const Info& info;
BufferResourceList& buffer_resources;
TextureBufferResourceList& texture_buffer_resources;
ImageResourceList& image_resources;
SamplerResourceList& sampler_resources;
};
@ -361,33 +342,6 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
};
}
static constexpr size_t MaxUboSize = 65536;
static bool IsLoadBufferFormat(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
case IR::Opcode::LoadBufferFormatF32:
case IR::Opcode::LoadBufferFormatF32x2:
case IR::Opcode::LoadBufferFormatF32x3:
case IR::Opcode::LoadBufferFormatF32x4:
return true;
default:
return false;
}
}
static u32 BufferLength(const AmdGpu::Buffer& buffer) {
const auto stride = buffer.GetStride();
if (stride < sizeof(f32)) {
ASSERT(sizeof(f32) % stride == 0);
return (((buffer.num_records - 1) / sizeof(f32)) + 1) * stride;
} else if (stride == sizeof(f32)) {
return buffer.num_records;
} else {
ASSERT(stride % sizeof(f32) == 0);
return buffer.num_records * (stride / sizeof(f32));
}
}
s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
AmdGpu::Buffer& cbuf) {
@ -414,10 +368,8 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
return descriptors.Add(BufferResource{
.sgpr_base = std::numeric_limits<u32>::max(),
.dword_offset = 0,
.length = BufferLength(cbuf),
.used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
.inline_cbuf = cbuf,
.is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize,
});
}
@ -429,28 +381,17 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
IR::Inst* handle = inst.Arg(0).InstRecursive();
IR::Inst* producer = handle->Arg(0).InstRecursive();
const auto sharp = TrackSharp(producer);
const bool is_store = IsBufferStore(inst);
buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
binding = descriptors.Add(BufferResource{
.sgpr_base = sharp.sgpr_base,
.dword_offset = sharp.dword_offset,
.length = BufferLength(buffer),
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
.is_storage = is_store || buffer.GetSize() > MaxUboSize,
.is_written = is_store,
.is_written = IsBufferStore(inst),
});
}
// Update buffer descriptor format.
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
auto& buffer_desc = info.buffers[binding];
if (inst_info.is_typed) {
buffer_desc.dfmt = inst_info.dmft;
buffer_desc.nfmt = inst_info.nfmt;
} else {
buffer_desc.dfmt = buffer.GetDataFmt();
buffer_desc.nfmt = buffer.GetNumberFmt();
}
// Replace handle with binding index in buffer resource list.
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@ -463,20 +404,7 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
return;
}
if (IsLoadBufferFormat(inst)) {
if (UseFP16(buffer.GetDataFmt(), buffer.GetNumberFmt())) {
info.uses_fp16 = true;
}
} else {
const u32 stride = buffer.GetStride();
if (stride < 4) {
LOG_WARNING(Render_Vulkan,
"non-formatting load_buffer_* is not implemented for stride {}", stride);
}
}
// Compute address of the buffer using the stride.
// Todo: What if buffer is rebound with different stride?
IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
if (inst_info.index_enable) {
const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
@ -491,8 +419,31 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
inst.SetArg(1, address);
}
void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
Descriptors& descriptors) {
const IR::Inst* handle = inst.Arg(0).InstRecursive();
const IR::Inst* producer = handle->Arg(0).InstRecursive();
const auto sharp = TrackSharp(producer);
const auto buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
const s32 binding = descriptors.Add(TextureBufferResource{
.sgpr_base = sharp.sgpr_base,
.dword_offset = sharp.dword_offset,
.nfmt = buffer.GetNumberFmt(),
.is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
});
// Replace handle with binding index in texture buffer resource list.
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
inst.SetArg(0, ir.Imm32(binding));
ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
}
IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& t,
const IR::Value& z) {
const IR::Value& z, bool is_storage) {
// When cubemap is written with imageStore it is treated like 2DArray.
if (is_storage) {
return ir.CompositeConstruct(s, t, z);
}
// We need to fix x and y coordinate,
// because the s and t coordinate will be scaled and plus 1.5 by v_madak_f32.
// We already force the scale value to be 1.0 when handling v_cubema_f32,
@ -530,13 +481,15 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
return;
}
ASSERT(image.GetType() != AmdGpu::ImageType::Invalid);
const bool is_storage = IsImageStorageInstruction(inst);
u32 image_binding = descriptors.Add(ImageResource{
.sgpr_base = tsharp.sgpr_base,
.dword_offset = tsharp.dword_offset,
.type = image.GetType(),
.nfmt = static_cast<AmdGpu::NumberFormat>(image.GetNumberFmt()),
.is_storage = IsImageStorageInstruction(inst),
.is_storage = is_storage,
.is_depth = bool(inst_info.is_depth),
.is_atomic = IsImageAtomicInstruction(inst),
});
// Read sampler sharp. This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions
@ -593,7 +546,8 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
case AmdGpu::ImageType::Color3D: // x, y, z
return {ir.CompositeConstruct(body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)};
case AmdGpu::ImageType::Cube: // x, y, face
return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)};
return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2), is_storage),
body->Arg(3)};
default:
UNREACHABLE_MSG("Unknown image type {}", image.GetType());
}
@ -668,6 +622,10 @@ void ResourceTrackingPass(IR::Program& program) {
PatchBufferInstruction(*block, inst, info, descriptors);
continue;
}
if (IsTextureBufferInstruction(inst)) {
PatchTextureBufferInstruction(*block, inst, info, descriptors);
continue;
}
if (IsImageInstruction(inst)) {
PatchImageInstruction(*block, inst, info, descriptors);
}

View file

@ -29,6 +29,12 @@ void Visit(Info& info, IR::Inst& inst) {
case IR::Opcode::ImageWrite:
info.has_storage_images = true;
break;
case IR::Opcode::LoadBufferFormatF32:
info.has_texel_buffers = true;
break;
case IR::Opcode::StoreBufferFormatF32:
info.has_image_buffers = true;
break;
case IR::Opcode::QuadShuffle:
info.uses_group_quad = true;
break;
@ -44,6 +50,9 @@ void Visit(Info& info, IR::Inst& inst) {
case IR::Opcode::ImageQueryLod:
info.has_image_query = true;
break;
case IR::Opcode::LaneId:
info.uses_lane_id = true;
break;
default:
break;
}

View file

@ -12,11 +12,13 @@
namespace Shader::IR {
struct Program {
explicit Program(Info& info_) : info{info_} {}
AbstractSyntaxList syntax_list;
BlockList blocks;
BlockList post_order_blocks;
std::vector<Gcn::GcnInst> ins_list;
Info info;
Info& info;
};
[[nodiscard]] std::string DumpProgram(const Program& program);

View file

@ -66,9 +66,6 @@ union BufferInstInfo {
BitField<0, 1, u32> index_enable;
BitField<1, 1, u32> offset_enable;
BitField<2, 12, u32> inst_offset;
BitField<14, 4, AmdGpu::DataFormat> dmft;
BitField<18, 3, AmdGpu::NumberFormat> nfmt;
BitField<21, 1, u32> is_typed;
};
enum class ScalarReg : u32 {