shader_recompiler: Implement most integer image atomics, workgroup barriers and shared memory load/store (#231)

* shader_recompiler: Add LDEXP

* shader_recompiler: Add most image integer atomic ops

* shader_recompiler: Implement shared memory load/store

* shader_recompiler: More image atomics

* externals: Update sirit

* clang format

* cmake: Add missing files

* shader_recompiler: Fix some atomic bugs

* shader_recompiler: Vs outputs

* shader_recompiler: Shared mem has side-effects, fix format component order

* shader_recompiler: Inline constant buffer impl

* video_core: Fix regressions

* Work

* Fixup a few things
TheTurtle 2024-07-05 00:15:44 +03:00 committed by GitHub
parent af3bbc33e9
commit 6ceab6dfac
69 changed files with 1597 additions and 310 deletions


@@ -149,9 +149,15 @@ void CFG::LinkBlocks() {
             block.end_class = EndClass::Branch;
         } else if (end_inst.opcode == Opcode::S_ENDPGM) {
             const auto& prev_inst = inst_list[block.end_index - 1];
-            if (prev_inst.opcode == Opcode::EXP && prev_inst.control.exp.en == 0 &&
-                prev_inst.control.exp.target != 9) {
-                block.end_class = EndClass::Kill;
+            if (prev_inst.opcode == Opcode::EXP && prev_inst.control.exp.en == 0) {
+                if (prev_inst.control.exp.target != 9) {
+                    block.end_class = EndClass::Kill;
+                } else if (const auto& exec_mask = inst_list[block.end_index - 2];
+                           exec_mask.src[0].field == OperandField::ConstZero) {
+                    block.end_class = EndClass::Kill;
+                } else {
+                    block.end_class = EndClass::Exit;
+                }
             } else {
                 block.end_class = EndClass::Exit;
             }
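
Aside: in GCN, export target 9 is the null export, and a disabled export (en == 0) just before S_ENDPGM is the compiler idiom for a discarded pixel. A hedged restatement of the new classification, reusing the diff's own types (sketch only, not the emulator's code):

    // EndClass, Opcode, OperandField as in the hunk above.
    EndClass ClassifyEndpgmBlock(const GcnInst& prev_inst, const GcnInst& exec_write) {
        if (prev_inst.opcode != Opcode::EXP || prev_inst.control.exp.en != 0) {
            return EndClass::Exit; // ordinary end of program
        }
        if (prev_inst.control.exp.target != 9) {
            return EndClass::Kill; // disabled export to a real target: dead pixel
        }
        // Null export: a kill only if EXEC was cleared first (e.g. s_mov_b64 exec, 0);
        // exec_write is the instruction preceding the export.
        return exec_write.src[0].field == OperandField::ConstZero ? EndClass::Kill
                                                                  : EndClass::Exit;
    }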


@@ -32,7 +32,7 @@ namespace Shader::Gcn {
  * We take the reverse way, extract the original input semantics from these instructions.
  **/
 
-std::vector<VertexAttribute> ParseFetchShader(const u32* code) {
+std::vector<VertexAttribute> ParseFetchShader(const u32* code, u32* out_size) {
     std::vector<VertexAttribute> attributes;
     GcnCodeSlice code_slice(code, code + std::numeric_limits<u32>::max());
     GcnDecodeContext decoder;
@@ -47,6 +47,8 @@ std::vector<VertexAttribute> ParseFetchShader(const u32* code) {
     u32 semantic_index = 0;
     while (!code_slice.atEnd()) {
         const auto inst = decoder.decodeInstruction(code_slice);
+        *out_size += inst.length;
+
         if (inst.opcode == Opcode::S_SETPC_B64) {
             break;
         }


@@ -17,6 +17,6 @@ struct VertexAttribute {
     u8 instance_data; ///< Indicates that the buffer will be accessed in instance rate
 };
 
-std::vector<VertexAttribute> ParseFetchShader(const u32* code);
+std::vector<VertexAttribute> ParseFetchShader(const u32* code, u32* out_size);
 
 } // namespace Shader::Gcn


@@ -3429,48 +3429,48 @@ constexpr std::array<InstFormat, 112> InstructionFormatMIMG = {{
     {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
      ScalarType::Undefined},
     // 17 = IMAGE_ATOMIC_ADD
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 18 = IMAGE_ATOMIC_SUB
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     {},
     // 20 = IMAGE_ATOMIC_SMIN
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Sint32,
+     ScalarType::Sint32},
     // 21 = IMAGE_ATOMIC_UMIN
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 22 = IMAGE_ATOMIC_SMAX
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Sint32,
+     ScalarType::Sint32},
     // 23 = IMAGE_ATOMIC_UMAX
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 24 = IMAGE_ATOMIC_AND
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 25 = IMAGE_ATOMIC_OR
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 26 = IMAGE_ATOMIC_XOR
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 27 = IMAGE_ATOMIC_INC
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 28 = IMAGE_ATOMIC_DEC
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Uint32,
+     ScalarType::Uint32},
     // 29 = IMAGE_ATOMIC_FCMPSWAP
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32,
+     ScalarType::Float32},
     // 30 = IMAGE_ATOMIC_FMIN
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32,
+     ScalarType::Float32},
     // 31 = IMAGE_ATOMIC_FMAX
-    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Undefined,
-     ScalarType::Undefined},
+    {InstClass::VectorMemImgNoSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32,
+     ScalarType::Float32},
     // 32 = IMAGE_SAMPLE
     {InstClass::VectorMemImgSmp, InstCategory::VectorMemory, 4, 1, ScalarType::Float32,
      ScalarType::Float32},


@@ -187,7 +187,7 @@ std::string DumpExpr(const Statement* stmt) {
         case StatementType::Not:
         case StatementType::Or:
         case StatementType::Variable:
-            throw LogicError("Statement can't be printed");
+            UNREACHABLE_MSG("Statement can't be printed");
         }
     }
     return ret;
@@ -335,7 +335,7 @@ private:
         }
         // Expensive operation:
         if (!AreSiblings(goto_stmt, label_stmt)) {
-            throw LogicError("Goto is not a sibling with the label");
+            UNREACHABLE_MSG("Goto is not a sibling with the label");
         }
         // goto_stmt and label_stmt are guaranteed to be siblings, eliminate
         if (std::next(goto_stmt) == label_stmt) {
@@ -451,7 +451,7 @@ private:
         case StatementType::Loop:
             return MoveOutwardLoop(goto_stmt);
         default:
-            throw LogicError("Invalid outward movement");
+            UNREACHABLE_MSG("Invalid outward movement");
         }
     }
 
@@ -486,7 +486,7 @@ private:
         case StatementType::Loop:
             break;
         default:
-            throw LogicError("Invalid inward movement");
+            UNREACHABLE_MSG("Invalid inward movement");
         }
         Tree& nested_tree{label_nested_stmt->children};
         Statement* const new_goto{pool.Create(Goto{}, variable, label, &*label_nested_stmt)};
@@ -633,7 +633,8 @@ private:
             if (!stmt.block->is_dummy) {
                 const u32 start = stmt.block->begin_index;
                 const u32 size = stmt.block->end_index - start + 1;
-                Translate(current_block, inst_list.subspan(start, size), info);
+                Translate(current_block, stmt.block->begin, inst_list.subspan(start, size),
+                          info);
             }
             break;
         }


@@ -22,16 +22,18 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnIn
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     const IR::VectorReg dst_reg{inst.dst[0].code};
     if (is_pair) {
+        // Pair loads are either 32 or 64-bit. We assume 32-bit for now.
+        ASSERT(bit_size == 32);
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
-        ir.SetVectorReg(dst_reg, ir.ReadShared(32, is_signed, addr0));
+        ir.SetVectorReg(dst_reg, IR::U32{ir.LoadShared(32, is_signed, addr0)});
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1)));
-        ir.SetVectorReg(dst_reg + 1, ir.ReadShared(32, is_signed, addr1));
+        ir.SetVectorReg(dst_reg + 1, IR::U32{ir.LoadShared(32, is_signed, addr1)});
     } else if (bit_size == 64) {
-        const IR::Value data = ir.UnpackUint2x32(ir.ReadShared(bit_size, is_signed, addr));
+        const IR::Value data = ir.LoadShared(bit_size, is_signed, addr);
         ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)});
         ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)});
     } else {
-        const IR::U32 data = ir.ReadShared(bit_size, is_signed, addr);
+        const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr)};
         ir.SetVectorReg(dst_reg, data);
     }
 }
@@ -41,17 +43,26 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnI
     const IR::VectorReg data0{inst.src[1].code};
     const IR::VectorReg data1{inst.src[2].code};
     if (is_pair) {
+        ASSERT(bit_size == 32);
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0)));
         ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1)));
         ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
     } else if (bit_size == 64) {
-        const IR::U64 data = ir.PackUint2x32(
-            ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)));
+        const IR::Value data =
+            ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
         ir.WriteShared(bit_size, data, addr);
     } else {
         ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr);
     }
 }
 
+void Translator::S_BARRIER() {
+    ir.Barrier();
+}
+
+void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) {
+    UNREACHABLE();
+}
+
 } // namespace Shader::Gcn
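
Aside: a rough CPU-side analogy (purely illustrative, not emulator code) for the semantics these DS ops and S_BARRIER introduce into the IR — LDS acts as a per-workgroup scratch array, and the barrier is a join point after which every lane's earlier writes are visible:

    #include <barrier>
    #include <cstdint>

    static uint32_t lds[16 * 1024]; // stand-in for 64 KiB of LDS, in dwords

    void WorkItem(std::barrier<>& wg_barrier, uint32_t tid) {
        lds[tid] = tid * 2u;                     // DS_WRITE_B32
        wg_barrier.arrive_and_wait();            // S_BARRIER: all prior writes done
        const uint32_t neighbor = lds[tid ^ 1u]; // DS_READ_B32 of another lane's slot
        (void)neighbor;
    }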


@@ -318,4 +318,16 @@ void Translator::S_SUB_U32(const GcnInst& inst) {
     ir.SetScc(ir.Imm1(false));
 }
 
+void Translator::S_GETPC_B64(u32 pc, const GcnInst& inst) {
+    // This only really exists to let resource tracking pass know
+    // there is an inline cbuf.
+    SetDst(inst.dst[0], ir.Imm32(pc));
+}
+
+void Translator::S_ADDC_U32(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])};
+    const IR::U32 src1{GetSrc(inst.src[1])};
+    SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), ir.GetSccLo()));
+}
+
 } // namespace Shader::Gcn


@@ -5,20 +5,29 @@
 namespace Shader::Gcn {
 
+static constexpr u32 SQ_SRC_LITERAL = 0xFF;
+
 void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
     const auto& smrd = inst.control.smrd;
-    ASSERT_MSG(smrd.imm, "Bindless texture loads unsupported");
+    const u32 dword_offset = [&] -> u32 {
+        if (smrd.imm) {
+            return smrd.offset;
+        }
+        if (smrd.offset == SQ_SRC_LITERAL) {
+            return inst.src[1].code;
+        }
+        UNREACHABLE();
+    }();
     const IR::ScalarReg sbase{inst.src[0].code * 2};
     const IR::Value base =
         ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1));
     IR::ScalarReg dst_reg{inst.dst[0].code};
     for (u32 i = 0; i < num_dwords; i++) {
-        ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(smrd.offset + i)));
+        ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(dword_offset + i)));
     }
 }
 
 void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
-    static constexpr u32 SQ_SRC_LITERAL = 0xFF;
     const auto& smrd = inst.control.smrd;
     const IR::ScalarReg sbase{inst.src[0].code * 2};
     const IR::U32 dword_offset = [&] -> IR::U32 {
@@ -30,7 +39,9 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
         }
         return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2));
     }();
-    const IR::Value vsharp = ir.GetScalarReg(sbase);
+    const IR::Value vsharp =
+        ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1),
+                              ir.GetScalarReg(sbase + 2), ir.GetScalarReg(sbase + 3));
     IR::ScalarReg dst_reg{inst.dst[0].code};
     for (u32 i = 0; i < num_dwords; i++) {
         const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));

@@ -1,6 +1,9 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include "common/config.h"
+#include "common/io_file.h"
+#include "common/path_util.h"
 #include "shader_recompiler/exception.h"
 #include "shader_recompiler/frontend/fetch_shader.h"
 #include "shader_recompiler/frontend/translate/translate.h"
@@ -190,7 +193,20 @@ void Translator::EmitFetch(const GcnInst& inst) {
     std::memcpy(&code, &info.user_data[sgpr_base], sizeof(code));
 
     // Parse the assembly to generate a list of attributes.
-    const auto attribs = ParseFetchShader(code);
+    u32 fetch_size{};
+    const auto attribs = ParseFetchShader(code, &fetch_size);
+
+    if (Config::dumpShaders()) {
+        using namespace Common::FS;
+        const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
+        if (!std::filesystem::exists(dump_dir)) {
+            std::filesystem::create_directories(dump_dir);
+        }
+        const auto filename = fmt::format("vs_fetch_{:#018x}.bin", info.pgm_hash);
+        const auto file = IOFile{dump_dir / filename, FileAccessMode::Write};
+        file.WriteRaw<u8>(code, fetch_size);
+    }
+
     for (const auto& attrib : attribs) {
         const IR::Attribute attr{IR::Attribute::Param0 + attrib.semantic};
         IR::VectorReg dst_reg{attrib.dest_vgpr};
@@ -224,9 +240,9 @@ void Translator::EmitFetch(const GcnInst& inst) {
                                 attrib.instance_data);
         }
 
-        const u32 num_components = AmdGpu::NumComponents(buffer.data_format);
+        const u32 num_components = AmdGpu::NumComponents(buffer.GetDataFmt());
         info.vs_inputs.push_back({
-            .fmt = buffer.num_format,
+            .fmt = buffer.GetNumberFmt(),
             .binding = attrib.semantic,
             .num_components = std::min<u16>(attrib.num_elements, num_components),
             .sgpr_base = attrib.sgpr_base,
@@ -236,12 +252,13 @@ void Translator::EmitFetch(const GcnInst& inst) {
     }
 }
 
-void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info) {
+void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info) {
     if (inst_list.empty()) {
         return;
     }
     Translator translator{block, info};
     for (const auto& inst : inst_list) {
+        block_base += inst.length;
         switch (inst.opcode) {
         case Opcode::S_MOVK_I32:
             translator.S_MOVK(inst);
@@ -345,6 +362,9 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::V_BFREV_B32:
            translator.V_BFREV_B32(inst);
             break;
+        case Opcode::V_LDEXP_F32:
+            translator.V_LDEXP_F32(inst);
+            break;
         case Opcode::V_FRACT_F32:
             translator.V_FRACT_F32(inst);
             break;
@@ -374,8 +394,40 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::IMAGE_SAMPLE_LZ:
         case Opcode::IMAGE_SAMPLE:
         case Opcode::IMAGE_SAMPLE_L:
+        case Opcode::IMAGE_SAMPLE_C_O:
+        case Opcode::IMAGE_SAMPLE_B:
             translator.IMAGE_SAMPLE(inst);
             break;
+        case Opcode::IMAGE_ATOMIC_ADD:
+            translator.IMAGE_ATOMIC(AtomicOp::Add, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_AND:
+            translator.IMAGE_ATOMIC(AtomicOp::And, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_OR:
+            translator.IMAGE_ATOMIC(AtomicOp::Or, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_XOR:
+            translator.IMAGE_ATOMIC(AtomicOp::Xor, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_UMAX:
+            translator.IMAGE_ATOMIC(AtomicOp::Umax, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_SMAX:
+            translator.IMAGE_ATOMIC(AtomicOp::Smax, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_UMIN:
+            translator.IMAGE_ATOMIC(AtomicOp::Umin, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_SMIN:
+            translator.IMAGE_ATOMIC(AtomicOp::Smin, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_INC:
+            translator.IMAGE_ATOMIC(AtomicOp::Inc, inst);
+            break;
+        case Opcode::IMAGE_ATOMIC_DEC:
+            translator.IMAGE_ATOMIC(AtomicOp::Dec, inst);
+            break;
         case Opcode::IMAGE_GET_LOD:
             translator.IMAGE_GET_LOD(inst);
             break;
@@ -457,9 +509,15 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::V_CMP_NGT_F32:
             translator.V_CMP_F32(ConditionOp::LE, false, inst);
             break;
+        case Opcode::V_CMP_NGE_F32:
+            translator.V_CMP_F32(ConditionOp::LT, false, inst);
+            break;
         case Opcode::S_CMP_LT_U32:
             translator.S_CMP(ConditionOp::LT, false, inst);
             break;
+        case Opcode::S_CMP_LE_U32:
+            translator.S_CMP(ConditionOp::LE, false, inst);
+            break;
         case Opcode::S_CMP_LG_U32:
             translator.S_CMP(ConditionOp::LG, false, inst);
             break;
@@ -487,6 +545,12 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::V_CNDMASK_B32:
             translator.V_CNDMASK_B32(inst);
             break;
+        case Opcode::TBUFFER_LOAD_FORMAT_X:
+            translator.BUFFER_LOAD_FORMAT(1, true, inst);
+            break;
+        case Opcode::TBUFFER_LOAD_FORMAT_XY:
+            translator.BUFFER_LOAD_FORMAT(2, true, inst);
+            break;
         case Opcode::TBUFFER_LOAD_FORMAT_XYZ:
             translator.BUFFER_LOAD_FORMAT(3, true, inst);
             break;
@@ -581,6 +645,9 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::V_CVT_I32_F32:
             translator.V_CVT_I32_F32(inst);
             break;
+        case Opcode::V_CVT_FLR_I32_F32:
+            translator.V_CVT_FLR_I32_F32(inst);
+            break;
         case Opcode::V_SUBREV_F32:
             translator.V_SUBREV_F32(inst);
             break;
@@ -715,6 +782,7 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
             translator.V_MAD_I32_I24(inst);
             break;
         case Opcode::V_MUL_I32_I24:
+        case Opcode::V_MUL_U32_U24:
             translator.V_MUL_I32_I24(inst);
             break;
         case Opcode::V_SUB_I32:
@@ -771,6 +839,9 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::V_CMP_NE_U64:
             translator.V_CMP_NE_U64(inst);
             break;
+        case Opcode::V_CMP_CLASS_F32:
+            translator.V_CMP_CLASS_F32(inst);
+            break;
         case Opcode::V_TRUNC_F32:
             translator.V_TRUNC_F32(inst);
             break;
@@ -786,7 +857,11 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::S_ADD_U32:
             translator.S_ADD_U32(inst);
             break;
+        case Opcode::S_ADDC_U32:
+            translator.S_ADDC_U32(inst);
+            break;
         case Opcode::S_SUB_U32:
+        case Opcode::S_SUB_I32:
             translator.S_SUB_U32(inst);
             break;
         // TODO: Separate implementation for legacy variants.
@@ -809,9 +884,30 @@ void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info)
         case Opcode::IMAGE_GET_RESINFO:
             translator.IMAGE_GET_RESINFO(inst);
             break;
+        case Opcode::S_BARRIER:
+            translator.S_BARRIER();
+            break;
+        case Opcode::S_TTRACEDATA:
+            LOG_WARNING(Render_Vulkan, "S_TTRACEDATA instruction!");
+            break;
+        case Opcode::DS_READ_B32:
+            translator.DS_READ(32, false, false, inst);
+            break;
+        case Opcode::DS_READ2_B32:
+            translator.DS_READ(32, false, true, inst);
+            break;
+        case Opcode::DS_WRITE_B32:
+            translator.DS_WRITE(32, false, false, inst);
+            break;
+        case Opcode::DS_WRITE2_B32:
+            translator.DS_WRITE(32, false, true, inst);
+            break;
+        case Opcode::V_READFIRSTLANE_B32:
+            translator.V_READFIRSTLANE_B32(inst);
+            break;
+        case Opcode::S_GETPC_B64:
+            translator.S_GETPC_B64(block_base, inst);
+            break;
         case Opcode::S_NOP:
         case Opcode::S_CBRANCH_EXECZ:
         case Opcode::S_CBRANCH_SCC0:

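The block_base parameter threaded through Translate exists so S_GETPC_B64 can materialize a program counter: the hardware semantics are that S_GETPC_B64 returns the address of the instruction following itself, which is why block_base is advanced past each instruction before it is translated. A hedged model of the bookkeeping, reusing the diff's types (sketch only):

    // GcnInst and Opcode as in the hunks above.
    uint32_t PcAfter(std::span<const GcnInst> insts, uint32_t block_base) {
        for (const auto& inst : insts) {
            block_base += inst.length; // PC now points just past `inst`
            if (inst.opcode == Opcode::S_GETPC_B64) {
                return block_base;     // the value handed to S_GETPC_B64
            }
        }
        return block_base;
    }

The resource-tracking pass later uses this PC to recognize inline constant buffers, per the comment in S_GETPC_B64 above.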

@@ -26,6 +26,25 @@ enum class ConditionOp : u32 {
     TRU,
 };
 
+enum class AtomicOp : u32 {
+    Swap,
+    CmpSwap,
+    Add,
+    Sub,
+    Smin,
+    Umin,
+    Smax,
+    Umax,
+    And,
+    Or,
+    Xor,
+    Inc,
+    Dec,
+    FCmpSwap,
+    Fmin,
+    Fmax,
+};
+
 enum class NegateMode : u32 {
     None,
     Src1,
@@ -61,6 +80,8 @@ public:
     void S_BREV_B32(const GcnInst& inst);
     void S_ADD_U32(const GcnInst& inst);
     void S_SUB_U32(const GcnInst& inst);
+    void S_GETPC_B64(u32 pc, const GcnInst& inst);
+    void S_ADDC_U32(const GcnInst& inst);
 
     // Scalar Memory
     void S_LOAD_DWORD(int num_dwords, const GcnInst& inst);
@@ -133,6 +154,9 @@ public:
     void V_NOT_B32(const GcnInst& inst);
     void V_CVT_F32_UBYTE(u32 index, const GcnInst& inst);
     void V_BFREV_B32(const GcnInst& inst);
+    void V_LDEXP_F32(const GcnInst& inst);
+    void V_CVT_FLR_I32_F32(const GcnInst& inst);
+    void V_CMP_CLASS_F32(const GcnInst& inst);
 
     // Vector Memory
     void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst);
@@ -145,6 +169,8 @@ public:
     void DS_SWIZZLE_B32(const GcnInst& inst);
     void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
     void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst);
+    void V_READFIRSTLANE_B32(const GcnInst& inst);
+    void S_BARRIER();
 
     // MIMG
     void IMAGE_GET_RESINFO(const GcnInst& inst);
@@ -153,6 +179,7 @@ public:
     void IMAGE_STORE(const GcnInst& inst);
     void IMAGE_LOAD(bool has_mip, const GcnInst& inst);
     void IMAGE_GET_LOD(const GcnInst& inst);
+    void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst);
 
     // Export
     void EXP(const GcnInst& inst);
@@ -167,6 +194,6 @@ private:
     static std::array<bool, IR::NumScalarRegs> exec_contexts;
 };
 
-void Translate(IR::Block* block, std::span<const GcnInst> inst_list, Info& info);
+void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info);
 
 } // namespace Shader::Gcn


@@ -28,7 +28,8 @@ void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) {
 
 void Translator::V_CVT_F32_F16(const GcnInst& inst) {
     const IR::U32 src0 = GetSrc(inst.src[0]);
-    SetDst(inst.dst[0], ir.ConvertUToF(32, 16, src0));
+    const IR::U16 src0l = ir.UConvert(16, src0);
+    SetDst(inst.dst[0], ir.FPConvert(32, ir.BitCast<IR::F16>(src0l)));
 }
 
 void Translator::V_MUL_F32(const GcnInst& inst) {
@@ -50,11 +51,14 @@ void Translator::V_CNDMASK_B32(const GcnInst& inst) {
     };
     const bool has_flt_source =
         is_float_const(inst.src[0].field) || is_float_const(inst.src[1].field);
-    const IR::U32F32 src0 = GetSrc(inst.src[0], has_flt_source);
+    IR::U32F32 src0 = GetSrc(inst.src[0], has_flt_source);
     IR::U32F32 src1 = GetSrc(inst.src[1], has_flt_source);
     if (src0.Type() == IR::Type::F32 && src1.Type() == IR::Type::U32) {
         src1 = ir.BitCast<IR::F32, IR::U32>(src1);
     }
+    if (src1.Type() == IR::Type::F32 && src0.Type() == IR::Type::U32) {
+        src0 = ir.BitCast<IR::F32, IR::U32>(src0);
+    }
 
     const IR::Value result = ir.Select(flag, src1, src0);
     ir.SetVectorReg(dst_reg, IR::U32F32{result});
 }
@@ -502,4 +506,19 @@ void Translator::V_BFREV_B32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.BitReverse(src0));
 }
 
+void Translator::V_LDEXP_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0], true)};
+    const IR::U32 src1{GetSrc(inst.src[1])};
+    SetDst(inst.dst[0], ir.FPLdexp(src0, src1));
+}
+
+void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) {
+    const IR::F32 src0{GetSrc(inst.src[0], true)};
+    SetDst(inst.dst[0], ir.ConvertFToI(32, true, ir.FPFloor(src0)));
+}
+
+void Translator::V_CMP_CLASS_F32(const GcnInst& inst) {
+    UNREACHABLE();
+}
+
 } // namespace Shader::Gcn
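
Why the V_CVT_F32_F16 hunk above matters: the source VGPR holds raw IEEE half-precision bits, so the low 16 bits must be reinterpreted as f16 and widened, not value-converted from an integer. A self-contained scalar model of the fixed behaviour (illustrative, not the emitter's code):

    #include <cmath>
    #include <cstdint>

    float CvtF32F16(uint32_t src) {
        const uint16_t h = static_cast<uint16_t>(src); // low 16 bits hold the half
        const uint32_t sign = (h >> 15) & 1u;
        const uint32_t exp = (h >> 10) & 0x1Fu;
        const uint32_t frac = h & 0x3FFu;
        float value;
        if (exp == 0) {
            value = std::ldexp(static_cast<float>(frac), -24); // subnormals
        } else if (exp == 31) {
            value = frac != 0 ? NAN : INFINITY;
        } else {
            value = std::ldexp(static_cast<float>(frac | 0x400u),
                               static_cast<int>(exp) - 25);    // implicit leading 1
        }
        return sign != 0 ? -value : value;
    }

The old ConvertUToF path computed float(src) instead, so half 1.0 (bit pattern 0x3C00) came out as 15360.0f.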


@@ -212,10 +212,15 @@ void Translator::IMAGE_STORE(const GcnInst& inst) {
         ir.CompositeConstruct(ir.GetVectorReg(addr_reg), ir.GetVectorReg(addr_reg + 1),
                               ir.GetVectorReg(addr_reg + 2), ir.GetVectorReg(addr_reg + 3));
 
-    ASSERT(mimg.dmask == 0xF);
-    const IR::Value value = ir.CompositeConstruct(
-        ir.GetVectorReg<IR::F32>(data_reg), ir.GetVectorReg<IR::F32>(data_reg + 1),
-        ir.GetVectorReg<IR::F32>(data_reg + 2), ir.GetVectorReg<IR::F32>(data_reg + 3));
+    boost::container::static_vector<IR::F32, 4> comps;
+    for (u32 i = 0; i < 4; i++) {
+        if (((mimg.dmask >> i) & 1) == 0) {
+            comps.push_back(ir.Imm32(0.f));
+            continue;
+        }
+        comps.push_back(ir.GetVectorReg<IR::F32>(data_reg++));
+    }
+    const IR::Value value = ir.CompositeConstruct(comps[0], comps[1], comps[2], comps[3]);
     ir.ImageWrite(handle, body, value, {});
 }
@@ -245,7 +250,10 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst
         info.nfmt.Assign(static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt));
     }
 
-    const IR::Value value = ir.LoadBuffer(num_dwords, ir.GetScalarReg(sharp), address, info);
+    const IR::Value handle =
+        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
+                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
+    const IR::Value value = ir.LoadBuffer(num_dwords, handle, address, info);
     const IR::VectorReg dst_reg{inst.src[1].code};
     if (num_dwords == 1) {
         ir.SetVectorReg(dst_reg, IR::F32{value});
@@ -304,7 +312,10 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, bool is_typed, const GcnIns
                                      ir.GetVectorReg<Shader::IR::F32>(src_reg + 3));
         break;
     }
-    ir.StoreBuffer(num_dwords, ir.GetScalarReg(sharp), address, value, info);
+    const IR::Value handle =
+        ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
+                              ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
+    ir.StoreBuffer(num_dwords, handle, address, value, info);
 }
 
 void Translator::IMAGE_GET_LOD(const GcnInst& inst) {
@@ -322,4 +333,48 @@ void Translator::IMAGE_GET_LOD(const GcnInst& inst) {
     ir.SetVectorReg(dst_reg++, IR::F32{ir.CompositeExtract(lod, 1)});
 }
 
+void Translator::IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst) {
+    const auto& mimg = inst.control.mimg;
+    IR::VectorReg val_reg{inst.dst[0].code};
+    IR::VectorReg addr_reg{inst.src[0].code};
+    const IR::ScalarReg tsharp_reg{inst.src[2].code * 4};
+
+    const IR::Value value = ir.GetVectorReg(val_reg);
+    const IR::Value handle = ir.GetScalarReg(tsharp_reg);
+    const IR::Value body =
+        ir.CompositeConstruct(ir.GetVectorReg(addr_reg), ir.GetVectorReg(addr_reg + 1),
+                              ir.GetVectorReg(addr_reg + 2), ir.GetVectorReg(addr_reg + 3));
+    const IR::Value prev = [&] {
+        switch (op) {
+        case AtomicOp::Swap:
+            return ir.ImageAtomicExchange(handle, body, value, {});
+        case AtomicOp::Add:
+            return ir.ImageAtomicIAdd(handle, body, value, {});
+        case AtomicOp::Smin:
+            return ir.ImageAtomicIMin(handle, body, value, true, {});
+        case AtomicOp::Umin:
+            return ir.ImageAtomicUMin(handle, body, value, {});
+        case AtomicOp::Smax:
+            return ir.ImageAtomicIMax(handle, body, value, true, {});
+        case AtomicOp::Umax:
+            return ir.ImageAtomicUMax(handle, body, value, {});
+        case AtomicOp::And:
+            return ir.ImageAtomicAnd(handle, body, value, {});
+        case AtomicOp::Or:
+            return ir.ImageAtomicOr(handle, body, value, {});
+        case AtomicOp::Xor:
+            return ir.ImageAtomicXor(handle, body, value, {});
+        case AtomicOp::Inc:
+            return ir.ImageAtomicInc(handle, body, value, {});
+        case AtomicOp::Dec:
+            return ir.ImageAtomicDec(handle, body, value, {});
+        default:
+            UNREACHABLE();
+        }
+    }();
+    if (mimg.glc) {
+        ir.SetVectorReg(val_reg, IR::U32{prev});
+    }
+}
+
 } // namespace Shader::Gcn
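
A closing note on the glc handling at the end of IMAGE_ATOMIC: for GCN atomics, GLC=1 requests that the pre-operation value be returned into the data VGPR, which is why the previous value is written back only when mimg.glc is set. A scalar model of a single texel (illustrative, not emulator code):

    #include <atomic>
    #include <cstdint>

    uint32_t ImageAtomicAdd(std::atomic<uint32_t>& texel, uint32_t value, bool glc,
                            uint32_t& vdata) {
        const uint32_t prev = texel.fetch_add(value); // read-modify-write, returns old value
        if (glc) {
            vdata = prev; // GLC=1: pre-op value lands back in the data register
        }
        return prev;
    }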