shadPS4/src/video_core/amdgpu/liverpool.cpp

936 lines
41 KiB
C++

// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <boost/preprocessor/stringize.hpp>
#include "common/assert.h"
#include "common/config.h"
#include "common/debug.h"
#include "common/polyfill_thread.h"
#include "common/thread.h"
#include "core/debug_state.h"
#include "core/libraries/videoout/driver.h"
#include "core/memory.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/amdgpu/pm4_cmds.h"
#include "video_core/renderdoc.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
namespace AmdGpu {
static const char* dcb_task_name{"DCB_TASK"};
static const char* ccb_task_name{"CCB_TASK"};
#define MAX_NAMES 56
static_assert(Liverpool::NumComputeRings <= MAX_NAMES);
#define NAME_NUM(z, n, name) BOOST_PP_STRINGIZE(name) BOOST_PP_STRINGIZE(n),
#define NAME_ARRAY(name, num) {BOOST_PP_REPEAT(num, NAME_NUM, name)}
static const char* acb_task_name[] = NAME_ARRAY(ACB_TASK, MAX_NAMES);
#define YIELD(name) \
FIBER_EXIT; \
co_yield {}; \
FIBER_ENTER(name);
#define YIELD_CE() YIELD(ccb_task_name)
#define YIELD_GFX() YIELD(dcb_task_name)
#define YIELD_ASC(id) YIELD(acb_task_name[id])
#define RESUME(task, name) \
FIBER_EXIT; \
task.handle.resume(); \
FIBER_ENTER(name);
#define RESUME_CE(task) RESUME(task, ccb_task_name)
#define RESUME_GFX(task) RESUME(task, dcb_task_name)
#define RESUME_ASC(task, id) RESUME(task, acb_task_name[id])
std::array<u8, 48_KB> Liverpool::ConstantEngine::constants_heap;
static std::span<const u32> NextPacket(std::span<const u32> span, size_t offset) {
if (offset > span.size()) {
LOG_ERROR(
Lib_GnmDriver,
": packet length exceeds remaining submission size. Packet dword count={}, remaining "
"submission dwords={}",
offset, span.size());
// Return empty subspan so check for next packet bails out
return {};
}
return span.subspan(offset);
}
Liverpool::Liverpool() {
process_thread = std::jthread{std::bind_front(&Liverpool::Process, this)};
}
Liverpool::~Liverpool() {
process_thread.request_stop();
process_thread.join();
}
void Liverpool::Process(std::stop_token stoken) {
Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
while (!stoken.stop_requested()) {
{
std::unique_lock lk{submit_mutex};
Common::CondvarWait(submit_cv, lk, stoken,
[this] { return num_commands || num_submits || submit_done; });
}
if (stoken.stop_requested()) {
break;
}
VideoCore::StartCapture();
curr_qid = -1;
while (num_submits || num_commands) {
// Process incoming commands with high priority
while (num_commands) {
Common::UniqueFunction<void> callback{};
{
std::unique_lock lk{submit_mutex};
callback = std::move(command_queue.back());
command_queue.pop();
}
callback();
--num_commands;
}
curr_qid = (curr_qid + 1) % num_mapped_queues;
auto& queue = mapped_queues[curr_qid];
Task::Handle task{};
{
std::scoped_lock lock{queue.m_access};
if (queue.submits.empty()) {
continue;
}
task = queue.submits.front();
}
task.resume();
if (task.done()) {
task.destroy();
std::scoped_lock lock{queue.m_access};
queue.submits.pop();
--num_submits;
std::scoped_lock lock2{submit_mutex};
submit_cv.notify_all();
}
}
if (submit_done) {
VideoCore::EndCapture();
if (rasterizer) {
rasterizer->Flush();
}
submit_done = false;
}
Platform::IrqC::Instance()->Signal(Platform::InterruptId::GpuIdle);
}
}
Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
FIBER_ENTER(ccb_task_name);
while (!ccb.empty()) {
const auto* header = reinterpret_cast<const PM4Header*>(ccb.data());
const u32 type = header->type;
if (type != 3) {
// No other types of packets were spotted so far
UNREACHABLE_MSG("Invalid PM4 type {}", type);
}
const PM4ItOpcode opcode = header->type3.opcode;
const auto* it_body = reinterpret_cast<const u32*>(header) + 1;
switch (opcode) {
case PM4ItOpcode::Nop: {
const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
break;
}
case PM4ItOpcode::WriteConstRam: {
const auto* write_const = reinterpret_cast<const PM4WriteConstRam*>(header);
memcpy(cblock.constants_heap.data() + write_const->Offset(), &write_const->data,
write_const->Size());
break;
}
case PM4ItOpcode::DumpConstRam: {
const auto* dump_const = reinterpret_cast<const PM4DumpConstRam*>(header);
memcpy(dump_const->Address<void*>(),
cblock.constants_heap.data() + dump_const->Offset(), dump_const->Size());
break;
}
case PM4ItOpcode::IncrementCeCounter: {
++cblock.ce_count;
break;
}
case PM4ItOpcode::WaitOnDeCounterDiff: {
const auto diff = it_body[0];
while ((cblock.de_count - cblock.ce_count) >= diff) {
YIELD_CE();
}
break;
}
case PM4ItOpcode::IndirectBufferConst: {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task =
ProcessCeUpdate({indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
RESUME_CE(task);
while (!task.handle.done()) {
YIELD_CE();
RESUME_CE(task);
}
break;
}
default:
const u32 count = header->type3.NumWords();
UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
static_cast<u32>(opcode), count);
}
ccb = NextPacket(ccb, header->type3.NumWords() + 1);
}
FIBER_EXIT;
}
Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb) {
FIBER_ENTER(dcb_task_name);
cblock.Reset();
// TODO: potentially, ASCs also can depend on CE and in this case the
// CE task should be moved into more global scope
Task ce_task{};
if (!ccb.empty()) {
// In case of CCB provided kick off CE asap to have the constant heap ready to use
ce_task = ProcessCeUpdate(ccb);
RESUME_GFX(ce_task);
}
const auto base_addr = reinterpret_cast<uintptr_t>(dcb.data());
while (!dcb.empty()) {
const auto* header = reinterpret_cast<const PM4Header*>(dcb.data());
const u32 type = header->type;
switch (type) {
case 0:
case 1:
UNREACHABLE_MSG("Unsupported PM4 type {}", type);
break;
case 2:
// Type-2 packet are used for padding purposes
dcb = NextPacket(dcb, 1);
continue;
case 3:
const u32 count = header->type3.NumWords();
const PM4ItOpcode opcode = header->type3.opcode;
switch (opcode) {
case PM4ItOpcode::Nop: {
const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
if (nop->header.count.Value() == 0) {
break;
}
switch (nop->data_block[0]) {
case PM4CmdNop::PayloadType::PatchedFlip: {
// There is no evidence that GPU CP drives flip events by parsing
// special NOP packets. For convenience lets assume that it does.
Platform::IrqC::Instance()->Signal(Platform::InterruptId::GfxFlip);
break;
}
case PM4CmdNop::PayloadType::DebugMarkerPush: {
const auto marker_sz = nop->header.count.Value() * 2;
const std::string_view label{reinterpret_cast<const char*>(&nop->data_block[1]),
marker_sz};
if (rasterizer) {
rasterizer->ScopeMarkerBegin(label);
}
break;
}
case PM4CmdNop::PayloadType::DebugColorMarkerPush: {
const auto marker_sz = nop->header.count.Value() * 2;
const std::string_view label{reinterpret_cast<const char*>(&nop->data_block[1]),
marker_sz};
const u32 color = *reinterpret_cast<const u32*>(
reinterpret_cast<const u8*>(&nop->data_block[1]) + marker_sz);
if (rasterizer) {
rasterizer->ScopedMarkerInsertColor(label, color);
}
break;
}
case PM4CmdNop::PayloadType::DebugMarkerPop: {
if (rasterizer) {
rasterizer->ScopeMarkerEnd();
}
break;
}
default:
break;
}
break;
}
case PM4ItOpcode::ContextControl: {
break;
}
case PM4ItOpcode::ClearState: {
regs.SetDefaults();
break;
}
case PM4ItOpcode::SetConfigReg: {
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
const auto reg_addr = ConfigRegWordOffset + set_data->reg_offset;
const auto* payload = reinterpret_cast<const u32*>(header + 2);
std::memcpy(&regs.reg_array[reg_addr], payload, (count - 1) * sizeof(u32));
break;
}
case PM4ItOpcode::SetContextReg: {
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
const auto reg_addr = ContextRegWordOffset + set_data->reg_offset;
const auto* payload = reinterpret_cast<const u32*>(header + 2);
std::memcpy(&regs.reg_array[reg_addr], payload, (count - 1) * sizeof(u32));
// In the case of HW, render target memory has alignment as color block operates on
// tiles. There is no information of actual resource extents stored in CB context
// regs, so any deduction of it from slices/pitch will lead to a larger surface
// created. The same applies to the depth targets. Fortunately, the guest always
// sends a trailing NOP packet right after the context regs setup, so we can use the
// heuristic below and extract the hint to determine actual resource dims.
switch (reg_addr) {
case ContextRegs::CbColor0Base:
case ContextRegs::CbColor1Base:
case ContextRegs::CbColor2Base:
case ContextRegs::CbColor3Base:
case ContextRegs::CbColor4Base:
case ContextRegs::CbColor5Base:
case ContextRegs::CbColor6Base:
case ContextRegs::CbColor7Base: {
const auto col_buf_id = (reg_addr - ContextRegs::CbColor0Base) /
(ContextRegs::CbColor1Base - ContextRegs::CbColor0Base);
ASSERT(col_buf_id < NumColorBuffers);
const auto nop_offset = header->type3.count;
if (nop_offset == 0x0e || nop_offset == 0x0d || nop_offset == 0x0b) {
ASSERT_MSG(payload[nop_offset] == 0xc0001000,
"NOP hint is missing in CB setup sequence");
last_cb_extent[col_buf_id].raw = payload[nop_offset + 1];
} else {
last_cb_extent[col_buf_id].raw = 0;
}
break;
}
case ContextRegs::CbColor0Cmask:
case ContextRegs::CbColor1Cmask:
case ContextRegs::CbColor2Cmask:
case ContextRegs::CbColor3Cmask:
case ContextRegs::CbColor4Cmask:
case ContextRegs::CbColor5Cmask:
case ContextRegs::CbColor6Cmask:
case ContextRegs::CbColor7Cmask: {
const auto col_buf_id =
(reg_addr - ContextRegs::CbColor0Cmask) /
(ContextRegs::CbColor1Cmask - ContextRegs::CbColor0Cmask);
ASSERT(col_buf_id < NumColorBuffers);
const auto nop_offset = header->type3.count;
if (nop_offset == 0x04) {
ASSERT_MSG(payload[nop_offset] == 0xc0001000,
"NOP hint is missing in CB setup sequence");
last_cb_extent[col_buf_id].raw = payload[nop_offset + 1];
}
break;
}
case ContextRegs::DbZInfo: {
if (header->type3.count == 8) {
ASSERT_MSG(payload[20] == 0xc0001000,
"NOP hint is missing in DB setup sequence");
last_db_extent.raw = payload[21];
} else {
last_db_extent.raw = 0;
}
break;
}
default:
break;
}
break;
}
case PM4ItOpcode::SetShReg: {
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
const auto set_size = (count - 1) * sizeof(u32);
if (set_data->reg_offset >= 0x200 &&
set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) {
ASSERT(set_size <= sizeof(ComputeProgram));
auto* addr = reinterpret_cast<u32*>(&mapped_queues[GfxQueueId].cs_state) +
(set_data->reg_offset - 0x200);
std::memcpy(addr, header + 2, set_size);
} else {
std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
set_size);
}
break;
}
case PM4ItOpcode::SetUconfigReg: {
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
std::memcpy(&regs.reg_array[UconfigRegWordOffset + set_data->reg_offset],
header + 2, (count - 1) * sizeof(u32));
break;
}
case PM4ItOpcode::IndexType: {
const auto* index_type = reinterpret_cast<const PM4CmdDrawIndexType*>(header);
regs.index_buffer_type.raw = index_type->raw;
break;
}
case PM4ItOpcode::DrawIndex2: {
const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndex2*>(header);
regs.max_index_size = draw_index->max_size;
regs.index_base_address.base_addr_lo = draw_index->index_base_lo;
regs.index_base_address.base_addr_hi.Assign(draw_index->index_base_hi);
regs.num_indices = draw_index->index_count;
regs.draw_initiator = draw_index->draw_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndex2", cmd_address));
rasterizer->Draw(true);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DrawIndexOffset2: {
const auto* draw_index_off =
reinterpret_cast<const PM4CmdDrawIndexOffset2*>(header);
regs.max_index_size = draw_index_off->max_size;
regs.num_indices = draw_index_off->index_count;
regs.draw_initiator = draw_index_off->draw_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(
fmt::format("dcb:{}:DrawIndexOffset2", cmd_address));
rasterizer->Draw(true, draw_index_off->index_offset);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DrawIndexAuto: {
const auto* draw_index = reinterpret_cast<const PM4CmdDrawIndexAuto*>(header);
regs.num_indices = draw_index->index_count;
regs.draw_initiator = draw_index->draw_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndexAuto", cmd_address));
rasterizer->Draw(false);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DrawIndirect: {
const auto* draw_indirect = reinterpret_cast<const PM4CmdDrawIndirect*>(header);
const auto offset = draw_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(DrawIndirectArgs);
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndirect", cmd_address));
rasterizer->DrawIndirect(false, ib_address, offset, size, 1, 0);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DrawIndexIndirect: {
const auto* draw_index_indirect =
reinterpret_cast<const PM4CmdDrawIndexIndirect*>(header);
const auto offset = draw_index_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(DrawIndexedIndirectArgs);
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(
fmt::format("dcb:{}:DrawIndexIndirect", cmd_address));
rasterizer->DrawIndirect(true, ib_address, offset, size, 1, 0);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DrawIndexIndirectCountMulti: {
const auto* draw_index_indirect =
reinterpret_cast<const PM4CmdDrawIndexIndirectMulti*>(header);
const auto offset = draw_index_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(
fmt::format("dcb:{}:DrawIndexIndirectCountMulti", cmd_address));
rasterizer->DrawIndirect(true, ib_address, offset, draw_index_indirect->stride,
draw_index_indirect->count,
draw_index_indirect->countAddr);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DispatchDirect: {
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
auto& cs_program = GetCsRegs();
cs_program.dim_x = dispatch_direct->dim_x;
cs_program.dim_y = dispatch_direct->dim_y;
cs_program.dim_z = dispatch_direct->dim_z;
cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast<uintptr_t>(header),
cs_program);
}
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:Dispatch", cmd_address));
rasterizer->DispatchDirect();
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DispatchIndirect: {
const auto* dispatch_indirect =
reinterpret_cast<const PM4CmdDispatchIndirect*>(header);
auto& cs_program = GetCsRegs();
const auto offset = dispatch_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(PM4CmdDispatchIndirect::GroupDimensions);
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast<uintptr_t>(header),
cs_program);
}
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(
fmt::format("dcb:{}:DispatchIndirect", cmd_address));
rasterizer->DispatchIndirect(ib_address, offset, size);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::NumInstances: {
const auto* num_instances = reinterpret_cast<const PM4CmdDrawNumInstances*>(header);
regs.num_instances.num_instances = num_instances->num_instances;
break;
}
case PM4ItOpcode::IndexBase: {
const auto* index_base = reinterpret_cast<const PM4CmdDrawIndexBase*>(header);
regs.index_base_address.base_addr_lo = index_base->addr_lo;
regs.index_base_address.base_addr_hi.Assign(index_base->addr_hi);
break;
}
case PM4ItOpcode::IndexBufferSize: {
const auto* index_size = reinterpret_cast<const PM4CmdDrawIndexBufferSize*>(header);
regs.num_indices = index_size->num_indices;
break;
}
case PM4ItOpcode::SetBase: {
const auto* set_base = reinterpret_cast<const PM4CmdSetBase*>(header);
ASSERT(set_base->base_index == PM4CmdSetBase::BaseIndex::DrawIndexIndirPatchTable);
mapped_queues[GfxQueueId].indirect_args_addr = set_base->Address<u64>();
break;
}
case PM4ItOpcode::EventWrite: {
// const auto* event = reinterpret_cast<const PM4CmdEventWrite*>(header);
break;
}
case PM4ItOpcode::EventWriteEos: {
const auto* event_eos = reinterpret_cast<const PM4CmdEventWriteEos*>(header);
event_eos->SignalFence([](void* address, u64 data, u32 num_bytes) {
auto* memory = Core::Memory::Instance();
if (!memory->TryWriteBacking(address, &data, num_bytes)) {
memcpy(address, &data, num_bytes);
}
});
if (event_eos->command == PM4CmdEventWriteEos::Command::GdsStore) {
ASSERT(event_eos->size == 1);
if (rasterizer) {
rasterizer->Finish();
const u32 value = rasterizer->ReadDataFromGds(event_eos->gds_index);
*event_eos->Address() = value;
}
}
break;
}
case PM4ItOpcode::EventWriteEop: {
const auto* event_eop = reinterpret_cast<const PM4CmdEventWriteEop*>(header);
event_eop->SignalFence([](void* address, u64 data, u32 num_bytes) {
auto* memory = Core::Memory::Instance();
if (!memory->TryWriteBacking(address, &data, num_bytes)) {
memcpy(address, &data, num_bytes);
}
});
break;
}
case PM4ItOpcode::DmaData: {
const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) {
break;
}
if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo, &dma_data->data, sizeof(u32),
true);
} else if (dma_data->src_sel == DmaDataSrc::Memory &&
dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo,
dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), true);
} else if (dma_data->src_sel == DmaDataSrc::Data &&
dma_data->dst_sel == DmaDataDst::Memory) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(), &dma_data->data,
sizeof(u32), false);
} else if (dma_data->src_sel == DmaDataSrc::Gds &&
dma_data->dst_sel == DmaDataDst::Memory) {
// LOG_WARNING(Render_Vulkan, "GDS memory read");
} else if (dma_data->src_sel == DmaDataSrc::Memory &&
dma_data->dst_sel == DmaDataDst::Memory) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), false);
} else {
UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
}
break;
}
case PM4ItOpcode::WriteData: {
const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
const u32 data_size = (header->type3.count.Value() - 2) * 4;
u64* address = write_data->Address<u64*>();
if (!write_data->wr_one_addr.Value()) {
std::memcpy(address, write_data->data, data_size);
} else {
UNREACHABLE();
}
break;
}
case PM4ItOpcode::AcquireMem: {
// const auto* acquire_mem = reinterpret_cast<PM4CmdAcquireMem*>(header);
break;
}
case PM4ItOpcode::Rewind: {
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
while (!rewind->Valid()) {
YIELD_GFX();
}
break;
}
case PM4ItOpcode::WaitRegMem: {
const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
// ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
// Optimization: VO label waits are special because the emulator
// will write to the label when presentation is finished. So if
// there are no other submits to yield to we can sleep the thread
// instead and allow other tasks to run.
const u64* wait_addr = wait_reg_mem->Address<u64*>();
if (vo_port->IsVoLabel(wait_addr) &&
num_submits == mapped_queues[GfxQueueId].submits.size()) {
vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); });
}
while (!wait_reg_mem->Test()) {
YIELD_GFX();
}
break;
}
case PM4ItOpcode::IndirectBuffer: {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessGraphics(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, {});
RESUME_GFX(task);
while (!task.handle.done()) {
YIELD_GFX();
RESUME_GFX(task);
}
break;
}
case PM4ItOpcode::IncrementDeCounter: {
++cblock.de_count;
break;
}
case PM4ItOpcode::WaitOnCeCounter: {
while (cblock.ce_count <= cblock.de_count) {
RESUME_GFX(ce_task);
}
break;
}
case PM4ItOpcode::PfpSyncMe: {
if (rasterizer) {
rasterizer->CpSync();
}
break;
}
default:
UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
static_cast<u32>(opcode), count);
}
dcb = NextPacket(dcb, header->type3.NumWords() + 1);
break;
}
}
if (ce_task.handle) {
ASSERT_MSG(ce_task.handle.done(), "Partially processed CCB");
ce_task.handle.destroy();
}
FIBER_EXIT;
}
template <bool is_indirect>
Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
FIBER_ENTER(acb_task_name[vqid]);
const auto& queue = asc_queues[{vqid}];
auto base_addr = reinterpret_cast<uintptr_t>(acb.data());
while (!acb.empty()) {
const auto* header = reinterpret_cast<const PM4Header*>(acb.data());
const u32 type = header->type;
if (type != 3) {
// No other types of packets were spotted so far
UNREACHABLE_MSG("Invalid PM4 type {}", type);
}
const u32 count = header->type3.NumWords();
const PM4ItOpcode opcode = header->type3.opcode;
const auto* it_body = reinterpret_cast<const u32*>(header) + 1;
switch (opcode) {
case PM4ItOpcode::Nop: {
const auto* nop = reinterpret_cast<const PM4CmdNop*>(header);
break;
}
case PM4ItOpcode::IndirectBuffer: {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessCompute<true>(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, vqid);
RESUME_ASC(task, vqid);
while (!task.handle.done()) {
YIELD_ASC(vqid);
RESUME_ASC(task, vqid);
}
break;
}
case PM4ItOpcode::DmaData: {
const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) {
break;
}
if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo, &dma_data->data, sizeof(u32), true);
} else if (dma_data->src_sel == DmaDataSrc::Memory &&
dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), true);
} else if (dma_data->src_sel == DmaDataSrc::Data &&
dma_data->dst_sel == DmaDataDst::Memory) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(), &dma_data->data, sizeof(u32),
false);
} else if (dma_data->src_sel == DmaDataSrc::Gds &&
dma_data->dst_sel == DmaDataDst::Memory) {
// LOG_WARNING(Render_Vulkan, "GDS memory read");
} else if (dma_data->src_sel == DmaDataSrc::Memory &&
dma_data->dst_sel == DmaDataDst::Memory) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<const void*>(), dma_data->NumBytes(),
false);
} else {
UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
}
break;
}
case PM4ItOpcode::AcquireMem: {
break;
}
case PM4ItOpcode::Rewind: {
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
while (!rewind->Valid()) {
YIELD_ASC(vqid);
}
break;
}
case PM4ItOpcode::SetShReg: {
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
const auto set_size = (count - 1) * sizeof(u32);
if (set_data->reg_offset >= 0x200 &&
set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) {
ASSERT(set_size <= sizeof(ComputeProgram));
auto* addr = reinterpret_cast<u32*>(&mapped_queues[vqid + 1].cs_state) +
(set_data->reg_offset - 0x200);
std::memcpy(addr, header + 2, set_size);
} else {
std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
set_size);
}
break;
}
case PM4ItOpcode::DispatchDirect: {
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
auto& cs_program = GetCsRegs();
cs_program.dim_x = dispatch_direct->dim_x;
cs_program.dim_y = dispatch_direct->dim_y;
cs_program.dim_z = dispatch_direct->dim_z;
cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast<uintptr_t>(header),
cs_program);
}
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("acb[{}]:{}:Dispatch", vqid, cmd_address));
rasterizer->DispatchDirect();
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::WriteData: {
const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
const u32 data_size = (header->type3.count.Value() - 2) * 4;
if (!write_data->wr_one_addr.Value()) {
std::memcpy(write_data->Address<void*>(), write_data->data, data_size);
} else {
UNREACHABLE();
}
break;
}
case PM4ItOpcode::WaitRegMem: {
const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
while (!wait_reg_mem->Test()) {
YIELD_ASC(vqid);
}
break;
}
case PM4ItOpcode::ReleaseMem: {
const auto* release_mem = reinterpret_cast<const PM4CmdReleaseMem*>(header);
release_mem->SignalFence(static_cast<Platform::InterruptId>(queue.pipe_id));
break;
}
default:
UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
static_cast<u32>(opcode), count);
}
const auto packet_size_dw = header->type3.NumWords() + 1;
acb = NextPacket(acb, packet_size_dw);
if constexpr (!is_indirect) {
*queue.read_addr += packet_size_dw;
*queue.read_addr %= queue.ring_size_dw;
}
}
FIBER_EXIT;
}
std::pair<std::span<const u32>, std::span<const u32>> Liverpool::CopyCmdBuffers(
std::span<const u32> dcb, std::span<const u32> ccb) {
auto& queue = mapped_queues[GfxQueueId];
// std::vector resize can invalidate spans for commands in flight
ASSERT_MSG(queue.dcb_buffer.capacity() >= queue.dcb_buffer_offset + dcb.size(),
"dcb copy buffer out of reserved space");
ASSERT_MSG(queue.ccb_buffer.capacity() >= queue.ccb_buffer_offset + ccb.size(),
"ccb copy buffer out of reserved space");
queue.dcb_buffer.resize(
std::max(queue.dcb_buffer.size(), queue.dcb_buffer_offset + dcb.size()));
queue.ccb_buffer.resize(
std::max(queue.ccb_buffer.size(), queue.ccb_buffer_offset + ccb.size()));
u32 prev_dcb_buffer_offset = queue.dcb_buffer_offset;
u32 prev_ccb_buffer_offset = queue.ccb_buffer_offset;
if (!dcb.empty()) {
std::memcpy(queue.dcb_buffer.data() + queue.dcb_buffer_offset, dcb.data(),
dcb.size_bytes());
queue.dcb_buffer_offset += dcb.size();
dcb = std::span<const u32>{queue.dcb_buffer.begin() + prev_dcb_buffer_offset,
queue.dcb_buffer.begin() + queue.dcb_buffer_offset};
}
if (!ccb.empty()) {
std::memcpy(queue.ccb_buffer.data() + queue.ccb_buffer_offset, ccb.data(),
ccb.size_bytes());
queue.ccb_buffer_offset += ccb.size();
ccb = std::span<const u32>{queue.ccb_buffer.begin() + prev_ccb_buffer_offset,
queue.ccb_buffer.begin() + queue.ccb_buffer_offset};
}
return std::make_pair(dcb, ccb);
}
void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
auto& queue = mapped_queues[GfxQueueId];
if (Config::copyGPUCmdBuffers()) {
std::tie(dcb, ccb) = CopyCmdBuffers(dcb, ccb);
}
auto task = ProcessGraphics(dcb, ccb);
{
std::scoped_lock lock{queue.m_access};
queue.submits.emplace(task.handle);
}
std::scoped_lock lk{submit_mutex};
++num_submits;
submit_cv.notify_one();
}
void Liverpool::SubmitAsc(u32 gnm_vqid, std::span<const u32> acb) {
ASSERT_MSG(gnm_vqid > 0 && gnm_vqid < NumTotalQueues, "Invalid virtual ASC queue index");
auto& queue = mapped_queues[gnm_vqid];
const auto vqid = gnm_vqid - 1;
const auto& task = ProcessCompute(acb, vqid);
{
std::scoped_lock lock{queue.m_access};
queue.submits.emplace(task.handle);
}
std::scoped_lock lk{submit_mutex};
num_mapped_queues = std::max(num_mapped_queues, gnm_vqid + 1);
++num_submits;
submit_cv.notify_one();
}
} // namespace AmdGpu