mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2025-05-25 12:55:00 +00:00
GPU processor refactoring (#1787)
* coroutine code prettification * asc queues submission refactoring * better asc ring context handling * final touches and review notes * even more simplification for context saving
This commit is contained in:
parent
af26c945b1
commit
0fd1ab674b
12 changed files with 234 additions and 146 deletions
|
@ -1,6 +1,8 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <boost/preprocessor/stringize.hpp>
|
||||
|
||||
#include "common/assert.h"
|
||||
#include "common/config.h"
|
||||
#include "common/debug.h"
|
||||
|
@ -18,7 +20,32 @@ namespace AmdGpu {
|
|||
|
||||
static const char* dcb_task_name{"DCB_TASK"};
|
||||
static const char* ccb_task_name{"CCB_TASK"};
|
||||
static const char* acb_task_name{"ACB_TASK"};
|
||||
|
||||
#define MAX_NAMES 56
|
||||
static_assert(Liverpool::NumComputeRings <= MAX_NAMES);
|
||||
|
||||
#define NAME_NUM(z, n, name) BOOST_PP_STRINGIZE(name) BOOST_PP_STRINGIZE(n),
|
||||
#define NAME_ARRAY(name, num) {BOOST_PP_REPEAT(num, NAME_NUM, name)}
|
||||
|
||||
static const char* acb_task_name[] = NAME_ARRAY(ACB_TASK, MAX_NAMES);
|
||||
|
||||
#define YIELD(name) \
|
||||
FIBER_EXIT; \
|
||||
co_yield {}; \
|
||||
FIBER_ENTER(name);
|
||||
|
||||
#define YIELD_CE() YIELD(ccb_task_name)
|
||||
#define YIELD_GFX() YIELD(dcb_task_name)
|
||||
#define YIELD_ASC(id) YIELD(acb_task_name[id])
|
||||
|
||||
#define RESUME(task, name) \
|
||||
FIBER_EXIT; \
|
||||
task.handle.resume(); \
|
||||
FIBER_ENTER(name);
|
||||
|
||||
#define RESUME_CE(task) RESUME(task, ccb_task_name)
|
||||
#define RESUME_GFX(task) RESUME(task, dcb_task_name)
|
||||
#define RESUME_ASC(task, id) RESUME(task, acb_task_name[id])
|
||||
|
||||
std::array<u8, 48_KB> Liverpool::ConstantEngine::constants_heap;
|
||||
|
||||
|
@ -60,7 +87,7 @@ void Liverpool::Process(std::stop_token stoken) {
|
|||
|
||||
VideoCore::StartCapture();
|
||||
|
||||
int qid = -1;
|
||||
curr_qid = -1;
|
||||
|
||||
while (num_submits || num_commands) {
|
||||
|
||||
|
@ -79,9 +106,9 @@ void Liverpool::Process(std::stop_token stoken) {
|
|||
--num_commands;
|
||||
}
|
||||
|
||||
qid = (qid + 1) % NumTotalQueues;
|
||||
curr_qid = (curr_qid + 1) % num_mapped_queues;
|
||||
|
||||
auto& queue = mapped_queues[qid];
|
||||
auto& queue = mapped_queues[curr_qid];
|
||||
|
||||
Task::Handle task{};
|
||||
{
|
||||
|
@ -119,7 +146,7 @@ void Liverpool::Process(std::stop_token stoken) {
|
|||
}
|
||||
|
||||
Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
|
||||
TracyFiberEnter(ccb_task_name);
|
||||
FIBER_ENTER(ccb_task_name);
|
||||
|
||||
while (!ccb.empty()) {
|
||||
const auto* header = reinterpret_cast<const PM4Header*>(ccb.data());
|
||||
|
@ -155,9 +182,7 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
|
|||
case PM4ItOpcode::WaitOnDeCounterDiff: {
|
||||
const auto diff = it_body[0];
|
||||
while ((cblock.de_count - cblock.ce_count) >= diff) {
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(ccb_task_name);
|
||||
YIELD_CE();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -165,13 +190,12 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
|
|||
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
|
||||
auto task =
|
||||
ProcessCeUpdate({indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
|
||||
while (!task.handle.done()) {
|
||||
task.handle.resume();
|
||||
RESUME_CE(task);
|
||||
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(ccb_task_name);
|
||||
};
|
||||
while (!task.handle.done()) {
|
||||
YIELD_CE();
|
||||
RESUME_CE(task);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -182,11 +206,11 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
|
|||
ccb = NextPacket(ccb, header->type3.NumWords() + 1);
|
||||
}
|
||||
|
||||
TracyFiberLeave;
|
||||
FIBER_EXIT;
|
||||
}
|
||||
|
||||
Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb) {
|
||||
TracyFiberEnter(dcb_task_name);
|
||||
FIBER_ENTER(dcb_task_name);
|
||||
|
||||
cblock.Reset();
|
||||
|
||||
|
@ -197,9 +221,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
if (!ccb.empty()) {
|
||||
// In case of CCB provided kick off CE asap to have the constant heap ready to use
|
||||
ce_task = ProcessCeUpdate(ccb);
|
||||
TracyFiberLeave;
|
||||
ce_task.handle.resume();
|
||||
TracyFiberEnter(dcb_task_name);
|
||||
RESUME_GFX(ce_task);
|
||||
}
|
||||
|
||||
const auto base_addr = reinterpret_cast<uintptr_t>(dcb.data());
|
||||
|
@ -353,8 +375,18 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
}
|
||||
case PM4ItOpcode::SetShReg: {
|
||||
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
|
||||
std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
|
||||
(count - 1) * sizeof(u32));
|
||||
const auto set_size = (count - 1) * sizeof(u32);
|
||||
|
||||
if (set_data->reg_offset >= 0x200 &&
|
||||
set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) {
|
||||
ASSERT(set_size <= sizeof(ComputeProgram));
|
||||
auto* addr = reinterpret_cast<u32*>(&mapped_queues[GfxQueueId].cs_state) +
|
||||
(set_data->reg_offset - 0x200);
|
||||
std::memcpy(addr, header + 2, set_size);
|
||||
} else {
|
||||
std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
|
||||
set_size);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PM4ItOpcode::SetUconfigReg: {
|
||||
|
@ -474,15 +506,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
}
|
||||
case PM4ItOpcode::DispatchDirect: {
|
||||
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
|
||||
regs.cs_program.dim_x = dispatch_direct->dim_x;
|
||||
regs.cs_program.dim_y = dispatch_direct->dim_y;
|
||||
regs.cs_program.dim_z = dispatch_direct->dim_z;
|
||||
regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
|
||||
auto& cs_program = GetCsRegs();
|
||||
cs_program.dim_x = dispatch_direct->dim_x;
|
||||
cs_program.dim_y = dispatch_direct->dim_y;
|
||||
cs_program.dim_z = dispatch_direct->dim_z;
|
||||
cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
|
||||
if (DebugState.DumpingCurrentReg()) {
|
||||
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs,
|
||||
true);
|
||||
DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast<uintptr_t>(header),
|
||||
cs_program);
|
||||
}
|
||||
if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) {
|
||||
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
|
||||
const auto cmd_address = reinterpret_cast<const void*>(header);
|
||||
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:Dispatch", cmd_address));
|
||||
rasterizer->DispatchDirect();
|
||||
|
@ -493,14 +526,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
case PM4ItOpcode::DispatchIndirect: {
|
||||
const auto* dispatch_indirect =
|
||||
reinterpret_cast<const PM4CmdDispatchIndirect*>(header);
|
||||
auto& cs_program = GetCsRegs();
|
||||
const auto offset = dispatch_indirect->data_offset;
|
||||
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
|
||||
const auto size = sizeof(PM4CmdDispatchIndirect::GroupDimensions);
|
||||
if (DebugState.DumpingCurrentReg()) {
|
||||
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs,
|
||||
true);
|
||||
DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast<uintptr_t>(header),
|
||||
cs_program);
|
||||
}
|
||||
if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) {
|
||||
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
|
||||
const auto cmd_address = reinterpret_cast<const void*>(header);
|
||||
rasterizer->ScopeMarkerBegin(
|
||||
fmt::format("dcb:{}:DispatchIndirect", cmd_address));
|
||||
|
@ -613,11 +647,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
case PM4ItOpcode::Rewind: {
|
||||
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
|
||||
while (!rewind->Valid()) {
|
||||
mapped_queues[GfxQueueId].cs_state = regs.cs_program;
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(dcb_task_name);
|
||||
regs.cs_program = mapped_queues[GfxQueueId].cs_state;
|
||||
YIELD_GFX();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -633,11 +663,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); });
|
||||
}
|
||||
while (!wait_reg_mem->Test()) {
|
||||
mapped_queues[GfxQueueId].cs_state = regs.cs_program;
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(dcb_task_name);
|
||||
regs.cs_program = mapped_queues[GfxQueueId].cs_state;
|
||||
YIELD_GFX();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -645,13 +671,12 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
|
||||
auto task = ProcessGraphics(
|
||||
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, {});
|
||||
while (!task.handle.done()) {
|
||||
task.handle.resume();
|
||||
RESUME_GFX(task);
|
||||
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(dcb_task_name);
|
||||
};
|
||||
while (!task.handle.done()) {
|
||||
YIELD_GFX();
|
||||
RESUME_GFX(task);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PM4ItOpcode::IncrementDeCounter: {
|
||||
|
@ -660,9 +685,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
}
|
||||
case PM4ItOpcode::WaitOnCeCounter: {
|
||||
while (cblock.ce_count <= cblock.de_count) {
|
||||
TracyFiberLeave;
|
||||
ce_task.handle.resume();
|
||||
TracyFiberEnter(dcb_task_name);
|
||||
RESUME_GFX(ce_task);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -686,11 +709,13 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
|
|||
ce_task.handle.destroy();
|
||||
}
|
||||
|
||||
TracyFiberLeave;
|
||||
FIBER_EXIT;
|
||||
}
|
||||
|
||||
Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
|
||||
TracyFiberEnter(acb_task_name);
|
||||
template <bool is_indirect>
|
||||
Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
|
||||
FIBER_ENTER(acb_task_name[vqid]);
|
||||
const auto& queue = asc_queues[{vqid}];
|
||||
|
||||
auto base_addr = reinterpret_cast<uintptr_t>(acb.data());
|
||||
while (!acb.empty()) {
|
||||
|
@ -711,15 +736,14 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
|
|||
}
|
||||
case PM4ItOpcode::IndirectBuffer: {
|
||||
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
|
||||
auto task = ProcessCompute(
|
||||
auto task = ProcessCompute<true>(
|
||||
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, vqid);
|
||||
while (!task.handle.done()) {
|
||||
task.handle.resume();
|
||||
RESUME_ASC(task, vqid);
|
||||
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(acb_task_name);
|
||||
};
|
||||
while (!task.handle.done()) {
|
||||
YIELD_ASC(vqid);
|
||||
RESUME_ASC(task, vqid);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PM4ItOpcode::DmaData: {
|
||||
|
@ -757,30 +781,38 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
|
|||
case PM4ItOpcode::Rewind: {
|
||||
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
|
||||
while (!rewind->Valid()) {
|
||||
mapped_queues[vqid].cs_state = regs.cs_program;
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(acb_task_name);
|
||||
regs.cs_program = mapped_queues[vqid].cs_state;
|
||||
YIELD_ASC(vqid);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PM4ItOpcode::SetShReg: {
|
||||
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
|
||||
std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
|
||||
(count - 1) * sizeof(u32));
|
||||
const auto set_size = (count - 1) * sizeof(u32);
|
||||
|
||||
if (set_data->reg_offset >= 0x200 &&
|
||||
set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) {
|
||||
ASSERT(set_size <= sizeof(ComputeProgram));
|
||||
auto* addr = reinterpret_cast<u32*>(&mapped_queues[vqid + 1].cs_state) +
|
||||
(set_data->reg_offset - 0x200);
|
||||
std::memcpy(addr, header + 2, set_size);
|
||||
} else {
|
||||
std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
|
||||
set_size);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PM4ItOpcode::DispatchDirect: {
|
||||
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
|
||||
regs.cs_program.dim_x = dispatch_direct->dim_x;
|
||||
regs.cs_program.dim_y = dispatch_direct->dim_y;
|
||||
regs.cs_program.dim_z = dispatch_direct->dim_z;
|
||||
regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
|
||||
auto& cs_program = GetCsRegs();
|
||||
cs_program.dim_x = dispatch_direct->dim_x;
|
||||
cs_program.dim_y = dispatch_direct->dim_y;
|
||||
cs_program.dim_z = dispatch_direct->dim_z;
|
||||
cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
|
||||
if (DebugState.DumpingCurrentReg()) {
|
||||
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs, true);
|
||||
DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast<uintptr_t>(header),
|
||||
cs_program);
|
||||
}
|
||||
if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) {
|
||||
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
|
||||
const auto cmd_address = reinterpret_cast<const void*>(header);
|
||||
rasterizer->ScopeMarkerBegin(fmt::format("acb[{}]:{}:Dispatch", vqid, cmd_address));
|
||||
rasterizer->DispatchDirect();
|
||||
|
@ -803,17 +835,13 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
|
|||
const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
|
||||
ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
|
||||
while (!wait_reg_mem->Test()) {
|
||||
mapped_queues[vqid].cs_state = regs.cs_program;
|
||||
TracyFiberLeave;
|
||||
co_yield {};
|
||||
TracyFiberEnter(acb_task_name);
|
||||
regs.cs_program = mapped_queues[vqid].cs_state;
|
||||
YIELD_ASC(vqid);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PM4ItOpcode::ReleaseMem: {
|
||||
const auto* release_mem = reinterpret_cast<const PM4CmdReleaseMem*>(header);
|
||||
release_mem->SignalFence(Platform::InterruptId::Compute0RelMem); // <---
|
||||
release_mem->SignalFence(static_cast<Platform::InterruptId>(queue.pipe_id));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -821,10 +849,16 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
|
|||
static_cast<u32>(opcode), count);
|
||||
}
|
||||
|
||||
acb = NextPacket(acb, header->type3.NumWords() + 1);
|
||||
const auto packet_size_dw = header->type3.NumWords() + 1;
|
||||
acb = NextPacket(acb, packet_size_dw);
|
||||
|
||||
if constexpr (!is_indirect) {
|
||||
*queue.read_addr += packet_size_dw;
|
||||
*queue.read_addr %= queue.ring_size_dw;
|
||||
}
|
||||
}
|
||||
|
||||
TracyFiberLeave;
|
||||
FIBER_EXIT;
|
||||
}
|
||||
|
||||
std::pair<std::span<const u32>, std::span<const u32>> Liverpool::CopyCmdBuffers(
|
||||
|
@ -881,10 +915,11 @@ void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
|
|||
submit_cv.notify_one();
|
||||
}
|
||||
|
||||
void Liverpool::SubmitAsc(u32 vqid, std::span<const u32> acb) {
|
||||
ASSERT_MSG(vqid >= 0 && vqid < NumTotalQueues, "Invalid virtual ASC queue index");
|
||||
auto& queue = mapped_queues[vqid];
|
||||
void Liverpool::SubmitAsc(u32 gnm_vqid, std::span<const u32> acb) {
|
||||
ASSERT_MSG(gnm_vqid > 0 && gnm_vqid < NumTotalQueues, "Invalid virtual ASC queue index");
|
||||
auto& queue = mapped_queues[gnm_vqid];
|
||||
|
||||
const auto vqid = gnm_vqid - 1;
|
||||
const auto& task = ProcessCompute(acb, vqid);
|
||||
{
|
||||
std::scoped_lock lock{queue.m_access};
|
||||
|
@ -892,6 +927,7 @@ void Liverpool::SubmitAsc(u32 vqid, std::span<const u32> acb) {
|
|||
}
|
||||
|
||||
std::scoped_lock lk{submit_mutex};
|
||||
num_mapped_queues = std::max(num_mapped_queues, gnm_vqid + 1);
|
||||
++num_submits;
|
||||
submit_cv.notify_one();
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue