video_core: Account for runtime state changes when compiling shaders (#575)

* video_core: Compile shader permutations

* spirv: Only specify storage image format for atomics

* ir: Avoid cube coord patching for storage image

* spirv: Fix default attributes

* data_share: Add more instructions

* video_core: Query storage flag with runtime state

* kernel: Use std::list for semaphore

* video_core: Use texture buffers for untyped format load/store

* buffer_cache: Limit view usage

* vk_pipeline_cache: Fix invalid iterator

* image_view: Reduce log spam when alpha=1 in storage swizzle

* video_core: More features and proper spirv feature detection

* video_core: Attempt no2 for specialization

* spirv: Remove conflict

* vk_shader_cache: Small cleanup
TheTurtle 2024-08-29 19:29:54 +03:00 committed by GitHub
parent 790d19e59b
commit 66e96dd944
43 changed files with 1058 additions and 976 deletions

@@ -18,25 +18,31 @@ void Translator::EmitDataShare(const GcnInst& inst) {
case Opcode::DS_READ2_B64:
return DS_READ(64, false, true, inst);
case Opcode::DS_WRITE_B32:
return DS_WRITE(32, false, false, inst);
return DS_WRITE(32, false, false, false, inst);
case Opcode::DS_WRITE2ST64_B32:
return DS_WRITE(32, false, true, true, inst);
case Opcode::DS_WRITE_B64:
return DS_WRITE(64, false, false, inst);
return DS_WRITE(64, false, false, false, inst);
case Opcode::DS_WRITE2_B32:
return DS_WRITE(32, false, true, inst);
return DS_WRITE(32, false, true, false, inst);
case Opcode::DS_WRITE2_B64:
return DS_WRITE(64, false, true, inst);
return DS_WRITE(64, false, true, false, inst);
case Opcode::DS_ADD_U32:
return DS_ADD_U32(inst, false);
case Opcode::DS_MIN_U32:
return DS_MIN_U32(inst, false);
return DS_MIN_U32(inst, false, false);
case Opcode::DS_MIN_I32:
return DS_MIN_U32(inst, true, false);
case Opcode::DS_MAX_U32:
return DS_MAX_U32(inst, false);
return DS_MAX_U32(inst, false, false);
case Opcode::DS_MAX_I32:
return DS_MAX_U32(inst, true, false);
case Opcode::DS_ADD_RTN_U32:
return DS_ADD_U32(inst, true);
case Opcode::DS_MIN_RTN_U32:
return DS_MIN_U32(inst, true);
return DS_MIN_U32(inst, false, true);
case Opcode::DS_MAX_RTN_U32:
return DS_MAX_U32(inst, true);
return DS_MAX_U32(inst, false, true);
default:
LogMissingOpcode(inst);
}
@@ -89,12 +95,13 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst) {
}
}
void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst) {
void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64,
const GcnInst& inst) {
const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
const IR::VectorReg data0{inst.src[1].code};
const IR::VectorReg data1{inst.src[2].code};
if (is_pair) {
const u32 adj = bit_size == 32 ? 4 : 8;
const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
if (bit_size == 32) {
ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
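Note on the new stride64 parameter: for the DS_WRITE2* pair forms, offset0/offset1 count elements (4 bytes for B32, 8 for B64), and the ST64 variants scale that stride by 64, which is exactly the adj computation above. A minimal standalone sketch of the resulting byte offsets (the helper name is illustrative, not part of the diff):

#include <cstdint>
#include <cstdio>

// Effective LDS byte offset of one DS_WRITE2/DS_WRITE2ST64 slot, mirroring
// adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1) from the translator.
static std::uint32_t EffectiveByteOffset(std::uint32_t offset, int bit_size, bool stride64) {
    const std::uint32_t adj = (bit_size == 32 ? 4u : 8u) * (stride64 ? 64u : 1u);
    return offset * adj;
}

int main() {
    // DS_WRITE2_B32 with offset0=1, offset1=2 -> byte offsets 4 and 8.
    std::printf("%u %u\n", EffectiveByteOffset(1, 32, false), EffectiveByteOffset(2, 32, false));
    // DS_WRITE2ST64_B32 with the same fields -> byte offsets 256 and 512.
    std::printf("%u %u\n", EffectiveByteOffset(1, 32, true), EffectiveByteOffset(2, 32, true));
    return 0;
}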
@@ -133,23 +140,23 @@ void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) {
}
}
void Translator::DS_MIN_U32(const GcnInst& inst, bool rtn) {
void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) {
const IR::U32 addr{GetSrc(inst.src[0])};
const IR::U32 data{GetSrc(inst.src[1])};
const IR::U32 offset = ir.Imm32(u32(inst.control.ds.offset0));
const IR::U32 addr_offset = ir.IAdd(addr, offset);
const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, false);
const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, is_signed);
if (rtn) {
SetDst(inst.dst[0], IR::U32{original_val});
}
}
void Translator::DS_MAX_U32(const GcnInst& inst, bool rtn) {
void Translator::DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn) {
const IR::U32 addr{GetSrc(inst.src[0])};
const IR::U32 data{GetSrc(inst.src[1])};
const IR::U32 offset = ir.Imm32(u32(inst.control.ds.offset0));
const IR::U32 addr_offset = ir.IAdd(addr, offset);
const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, false);
const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, is_signed);
if (rtn) {
SetDst(inst.dst[0], IR::U32{original_val});
}

@@ -1,14 +1,12 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/logging/log.h"
#include "shader_recompiler/frontend/translate/translate.h"
namespace Shader::Gcn {
void Translator::EmitExport(const GcnInst& inst) {
if (ir.block->has_multiple_predecessors && info.stage == Stage::Fragment) {
LOG_WARNING(Render_Recompiler, "An ambiguous export appeared in translation");
ir.Discard(ir.LogicalNot(ir.GetExec()));
}

@@ -354,7 +354,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
if (!std::filesystem::exists(dump_dir)) {
std::filesystem::create_directories(dump_dir);
}
const auto filename = fmt::format("vs_fetch_{:#018x}.bin", info.pgm_hash);
const auto filename = fmt::format("vs_{:#018x}_fetch.bin", info.pgm_hash);
const auto file = IOFile{dump_dir / filename, FileAccessMode::Write};
file.WriteRaw<u8>(code, fetch_size);
}
@@ -399,9 +399,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
info.buffers.push_back({
.sgpr_base = attrib.sgpr_base,
.dword_offset = attrib.dword_offset,
.length = buffer.num_records,
.used_types = IR::Type::F32,
.is_storage = true, // we may not fit into UBO with large meshes
.is_instance_data = true,
});
instance_buf_handle = s32(info.buffers.size() - 1);

@@ -191,8 +191,10 @@ public:
void V_MBCNT_U32_B32(bool is_low, const GcnInst& inst);
// Vector Memory
void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst);
void BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format, const GcnInst& inst);
void BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst);
void BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst);
void BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst);
void BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst);
void BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst);
// Vector interpolation
@@ -202,10 +204,10 @@ public:
// Data share
void DS_SWIZZLE_B32(const GcnInst& inst);
void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst);
void DS_ADD_U32(const GcnInst& inst, bool rtn);
void DS_MIN_U32(const GcnInst& inst, bool rtn);
void DS_MAX_U32(const GcnInst& inst, bool rtn);
void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn);
void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn);
void V_READFIRSTLANE_B32(const GcnInst& inst);
void V_READLANE_B32(const GcnInst& inst);
void V_WRITELANE_B32(const GcnInst& inst);

@@ -415,14 +415,20 @@ void Translator::V_ADDC_U32(const GcnInst& inst) {
const auto src0 = GetSrc<IR::U32>(inst.src[0]);
const auto src1 = GetSrc<IR::U32>(inst.src[1]);
IR::U32 scarry;
IR::U1 carry;
if (inst.src_count == 3) { // VOP3
IR::U1 thread_bit{ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[2].code))};
scarry = IR::U32{ir.Select(thread_bit, ir.Imm32(1), ir.Imm32(0))};
if (inst.src[2].field == OperandField::VccLo) {
carry = ir.GetVcc();
} else if (inst.src[2].field == OperandField::ScalarGPR) {
carry = ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[2].code));
} else {
UNREACHABLE();
}
} else { // VOP2
scarry = ir.GetVccLo();
carry = ir.GetVcc();
}
const IR::U32 scarry = IR::U32{ir.Select(carry, ir.Imm32(1), ir.Imm32(0))};
const IR::U32 result = ir.IAdd(ir.IAdd(src0, src1), scarry);
const IR::VectorReg dst_reg{inst.dst[0].code};
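For context, the VOP3 encoding of V_ADDC_U32 names its carry-in source explicitly (VCC or an SGPR thread bit), while VOP2 implicitly reads VCC; either way the translated result is src0 + src1 + carry, as the IR above shows. A per-lane scalar sketch of that arithmetic (the carry-out the instruction also produces lies outside this hunk and is omitted here; the helper name is illustrative):

#include <cstdint>
#include <cstdio>

// Models ir.IAdd(ir.IAdd(src0, src1), Select(carry, 1, 0)) from the translator.
static std::uint32_t AddWithCarryIn(std::uint32_t src0, std::uint32_t src1, bool carry_in) {
    return src0 + src1 + (carry_in ? 1u : 0u); // wraps modulo 2^32, like the 32-bit VALU add
}

int main() {
    std::printf("%u\n", AddWithCarryIn(0xFFFFFFFFu, 0u, true)); // prints 0 (wraparound)
    std::printf("%u\n", AddWithCarryIn(5u, 7u, false));         // prints 12
    return 0;
}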

@@ -56,57 +56,57 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
// Buffer load operations
case Opcode::TBUFFER_LOAD_FORMAT_X:
return BUFFER_LOAD_FORMAT(1, true, true, inst);
return BUFFER_LOAD(1, true, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XY:
return BUFFER_LOAD_FORMAT(2, true, true, inst);
return BUFFER_LOAD(2, true, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
return BUFFER_LOAD_FORMAT(3, true, true, inst);
return BUFFER_LOAD(3, true, inst);
case Opcode::TBUFFER_LOAD_FORMAT_XYZW:
return BUFFER_LOAD_FORMAT(4, true, true, inst);
return BUFFER_LOAD(4, true, inst);
case Opcode::BUFFER_LOAD_FORMAT_X:
return BUFFER_LOAD_FORMAT(1, false, true, inst);
return BUFFER_LOAD_FORMAT(1, inst);
case Opcode::BUFFER_LOAD_FORMAT_XY:
return BUFFER_LOAD_FORMAT(2, false, true, inst);
return BUFFER_LOAD_FORMAT(2, inst);
case Opcode::BUFFER_LOAD_FORMAT_XYZ:
return BUFFER_LOAD_FORMAT(3, false, true, inst);
return BUFFER_LOAD_FORMAT(3, inst);
case Opcode::BUFFER_LOAD_FORMAT_XYZW:
return BUFFER_LOAD_FORMAT(4, false, true, inst);
return BUFFER_LOAD_FORMAT(4, inst);
case Opcode::BUFFER_LOAD_DWORD:
return BUFFER_LOAD_FORMAT(1, false, false, inst);
return BUFFER_LOAD(1, false, inst);
case Opcode::BUFFER_LOAD_DWORDX2:
return BUFFER_LOAD_FORMAT(2, false, false, inst);
return BUFFER_LOAD(2, false, inst);
case Opcode::BUFFER_LOAD_DWORDX3:
return BUFFER_LOAD_FORMAT(3, false, false, inst);
return BUFFER_LOAD(3, false, inst);
case Opcode::BUFFER_LOAD_DWORDX4:
return BUFFER_LOAD_FORMAT(4, false, false, inst);
return BUFFER_LOAD(4, false, inst);
// Buffer store operations
case Opcode::BUFFER_STORE_FORMAT_X:
return BUFFER_STORE_FORMAT(1, false, true, inst);
return BUFFER_STORE_FORMAT(1, inst);
case Opcode::BUFFER_STORE_FORMAT_XY:
return BUFFER_STORE_FORMAT(2, false, true, inst);
return BUFFER_STORE_FORMAT(2, inst);
case Opcode::BUFFER_STORE_FORMAT_XYZ:
return BUFFER_STORE_FORMAT(3, false, true, inst);
return BUFFER_STORE_FORMAT(3, inst);
case Opcode::BUFFER_STORE_FORMAT_XYZW:
return BUFFER_STORE_FORMAT(4, false, true, inst);
return BUFFER_STORE_FORMAT(4, inst);
case Opcode::TBUFFER_STORE_FORMAT_X:
return BUFFER_STORE_FORMAT(1, true, true, inst);
return BUFFER_STORE(1, true, inst);
case Opcode::TBUFFER_STORE_FORMAT_XY:
return BUFFER_STORE_FORMAT(2, true, true, inst);
return BUFFER_STORE(2, true, inst);
case Opcode::TBUFFER_STORE_FORMAT_XYZ:
return BUFFER_STORE_FORMAT(3, true, true, inst);
return BUFFER_STORE(3, true, inst);
case Opcode::BUFFER_STORE_DWORD:
return BUFFER_STORE_FORMAT(1, false, false, inst);
return BUFFER_STORE(1, false, inst);
case Opcode::BUFFER_STORE_DWORDX2:
return BUFFER_STORE_FORMAT(2, false, false, inst);
return BUFFER_STORE(2, false, inst);
case Opcode::BUFFER_STORE_DWORDX3:
return BUFFER_STORE_FORMAT(3, false, false, inst);
return BUFFER_STORE(3, false, inst);
case Opcode::BUFFER_STORE_DWORDX4:
return BUFFER_STORE_FORMAT(4, false, false, inst);
return BUFFER_STORE(4, false, inst);
// Buffer atomic operations
case Opcode::BUFFER_ATOMIC_ADD:
@@ -349,8 +349,7 @@ void Translator::IMAGE_STORE(const GcnInst& inst) {
ir.ImageWrite(handle, body, value, {});
}
void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format,
const GcnInst& inst) {
void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
const auto& mtbuf = inst.control.mtbuf;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
@@ -370,22 +369,19 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format,
info.index_enable.Assign(mtbuf.idxen);
info.offset_enable.Assign(mtbuf.offen);
info.inst_offset.Assign(mtbuf.offset);
info.is_typed.Assign(is_typed);
if (is_typed) {
info.dmft.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
info.nfmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
ASSERT(info.nfmt == AmdGpu::NumberFormat::Float &&
(info.dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
info.dmft == AmdGpu::DataFormat::Format32_32_32 ||
info.dmft == AmdGpu::DataFormat::Format32_32 ||
info.dmft == AmdGpu::DataFormat::Format32));
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
(dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
}
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
const IR::Value value = is_format ? ir.LoadBufferFormat(num_dwords, handle, address, info)
: ir.LoadBuffer(num_dwords, handle, address, info);
const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info);
const IR::VectorReg dst_reg{inst.src[1].code};
if (num_dwords == 1) {
ir.SetVectorReg(dst_reg, IR::F32{value});
@@ -396,8 +392,34 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, bool is_format,
}
}
void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format,
const GcnInst& inst) {
void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported");
const IR::Value address = [&] -> IR::Value {
if (mubuf.idxen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();
const IR::Value soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
IR::BufferInstInfo info{};
info.index_enable.Assign(mubuf.idxen);
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
const IR::Value value = ir.LoadBufferFormat(handle, address, info);
const IR::VectorReg dst_reg{inst.src[1].code};
for (u32 i = 0; i < num_dwords; i++) {
ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
}
}
void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
const auto& mtbuf = inst.control.mtbuf;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
@@ -417,45 +439,76 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, bool is_format,
info.index_enable.Assign(mtbuf.idxen);
info.offset_enable.Assign(mtbuf.offen);
info.inst_offset.Assign(mtbuf.offset);
info.is_typed.Assign(is_typed);
if (is_typed) {
info.dmft.Assign(static_cast<AmdGpu::DataFormat>(mtbuf.dfmt));
info.nfmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
(dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32_32 ||
dmft == AmdGpu::DataFormat::Format32_32 || dmft == AmdGpu::DataFormat::Format32));
}
IR::Value value{};
const IR::VectorReg src_reg{inst.src[1].code};
switch (num_dwords) {
case 1:
value = ir.GetVectorReg<Shader::IR::F32>(src_reg);
value = ir.GetVectorReg<IR::F32>(src_reg);
break;
case 2:
value = ir.CompositeConstruct(ir.GetVectorReg<Shader::IR::F32>(src_reg),
ir.GetVectorReg<Shader::IR::F32>(src_reg + 1));
value = ir.CompositeConstruct(ir.GetVectorReg<IR::F32>(src_reg),
ir.GetVectorReg<IR::F32>(src_reg + 1));
break;
case 3:
value = ir.CompositeConstruct(ir.GetVectorReg<Shader::IR::F32>(src_reg),
ir.GetVectorReg<Shader::IR::F32>(src_reg + 1),
ir.GetVectorReg<Shader::IR::F32>(src_reg + 2));
value = ir.CompositeConstruct(ir.GetVectorReg<IR::F32>(src_reg),
ir.GetVectorReg<IR::F32>(src_reg + 1),
ir.GetVectorReg<IR::F32>(src_reg + 2));
break;
case 4:
value = ir.CompositeConstruct(ir.GetVectorReg<Shader::IR::F32>(src_reg),
ir.GetVectorReg<Shader::IR::F32>(src_reg + 1),
ir.GetVectorReg<Shader::IR::F32>(src_reg + 2),
ir.GetVectorReg<Shader::IR::F32>(src_reg + 3));
value = ir.CompositeConstruct(
ir.GetVectorReg<IR::F32>(src_reg), ir.GetVectorReg<IR::F32>(src_reg + 1),
ir.GetVectorReg<IR::F32>(src_reg + 2), ir.GetVectorReg<IR::F32>(src_reg + 3));
break;
}
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
if (is_format) {
ir.StoreBufferFormat(num_dwords, handle, address, value, info);
} else {
ir.StoreBuffer(num_dwords, handle, address, value, info);
}
ir.StoreBuffer(num_dwords, handle, address, value, info);
}
void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported");
const IR::Value address = [&] -> IR::Value {
if (mubuf.idxen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();
const IR::Value soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
IR::BufferInstInfo info{};
info.index_enable.Assign(mubuf.idxen);
const IR::VectorReg src_reg{inst.src[1].code};
std::array<IR::Value, 4> comps{};
for (u32 i = 0; i < num_dwords; i++) {
comps[i] = ir.GetVectorReg<IR::F32>(src_reg + i);
}
for (u32 i = num_dwords; i < 4; i++) {
comps[i] = ir.Imm32(0.f);
}
const IR::Value value = ir.CompositeConstruct(comps[0], comps[1], comps[2], comps[3]);
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
ir.StoreBufferFormat(handle, address, value, info);
}
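The untyped BUFFER_STORE_FORMAT path above always issues a 4-component format store, padding the components the instruction does not supply with 0.0f. A small sketch of that padding (hypothetical helper, not part of the diff):

#include <array>
#include <cstdio>

// Pads num_dwords source components out to the 4 components a format store expects,
// mirroring the comps[] fill loops in BUFFER_STORE_FORMAT above.
static std::array<float, 4> PadStoreComponents(const float* src, unsigned num_dwords) {
    std::array<float, 4> comps{}; // value-initialized, so unused slots stay 0.0f
    for (unsigned i = 0; i < num_dwords; ++i) {
        comps[i] = src[i];
    }
    return comps;
}

int main() {
    const float xy[] = {1.5f, 2.5f};
    const auto c = PadStoreComponents(xy, 2); // BUFFER_STORE_FORMAT_XY -> {1.5, 2.5, 0, 0}
    std::printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);
    return 0;
}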
// TODO: U64
void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const IR::VectorReg vaddr{inst.src[0].code};