diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 246c8c947..bfe99c754 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -726,20 +726,39 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span -Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { +Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid) { FIBER_ENTER(acb_task_name[vqid]); - const auto& queue = asc_queues[{vqid}]; + auto& queue = asc_queues[{vqid}]; - auto base_addr = reinterpret_cast(acb.data()); - while (!acb.empty()) { - const auto* header = reinterpret_cast(acb.data()); - const u32 type = header->type; - if (type != 3) { - // No other types of packets were spotted so far - UNREACHABLE_MSG("Invalid PM4 type {}", type); + auto base_addr = reinterpret_cast(acb); + while (acb_dwords > 0) { + auto* header = reinterpret_cast(acb); + u32 next_dw_off = header->type3.NumWords() + 1; + + // If we have a buffered packet, use it. + if (queue.tmp_dwords > 0) [[unlikely]] { + header = reinterpret_cast(queue.tmp_packet.data()); + next_dw_off = header->type3.NumWords() + 1 - queue.tmp_dwords; + std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb, next_dw_off * sizeof(u32)); + queue.tmp_dwords = 0; + } + + // If the packet is split across ring boundary, buffer until next submission + if (next_dw_off > acb_dwords) [[unlikely]] { + std::memcpy(queue.tmp_packet.data(), acb, acb_dwords * sizeof(u32)); + queue.tmp_dwords = acb_dwords; + if constexpr (!is_indirect) { + *queue.read_addr += acb_dwords; + *queue.read_addr %= queue.ring_size_dw; + } + break; + } + + if (header->type != 3) { + // No other types of packets were spotted so far + UNREACHABLE_MSG("Invalid PM4 type {}", header->type.Value()); } - const u32 count = header->type3.NumWords(); const PM4ItOpcode opcode = header->type3.opcode; const auto* it_body = reinterpret_cast(header) + 1; switch (opcode) { @@ -749,8 +768,8 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { } case PM4ItOpcode::IndirectBuffer: { const auto* indirect_buffer = reinterpret_cast(header); - auto task = ProcessCompute( - {indirect_buffer->Address(), indirect_buffer->ib_size}, vqid); + auto task = ProcessCompute(indirect_buffer->Address(), + indirect_buffer->ib_size, vqid); RESUME_ASC(task, vqid); while (!task.handle.done()) { @@ -800,7 +819,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { } case PM4ItOpcode::SetShReg: { const auto* set_data = reinterpret_cast(header); - const auto set_size = (count - 1) * sizeof(u32); + const auto set_size = (header->type3.NumWords() - 1) * sizeof(u32); if (set_data->reg_offset >= 0x200 && set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) { @@ -895,14 +914,14 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { } default: UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}", - static_cast(opcode), count); + static_cast(opcode), header->type3.NumWords()); } - const auto packet_size_dw = header->type3.NumWords() + 1; - acb = NextPacket(acb, packet_size_dw); + acb += next_dw_off; + acb_dwords -= next_dw_off; if constexpr (!is_indirect) { - *queue.read_addr += packet_size_dw; + *queue.read_addr += next_dw_off; *queue.read_addr %= queue.ring_size_dw; } } @@ -969,7 +988,7 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span acb) { auto& queue = mapped_queues[gnm_vqid]; const auto vqid = gnm_vqid - 1; - const auto& task = ProcessCompute(acb, vqid); + const auto& task = ProcessCompute(acb.data(), acb.size(), vqid); { std::scoped_lock lock{queue.m_access}; queue.submits.emplace(task.handle); diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index c18bcd57b..474c04ec2 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -1496,10 +1496,13 @@ public: } struct AscQueueInfo { + static constexpr size_t Pm4BufferSize = 1024; VAddr map_addr; u32* read_addr; u32 ring_size_dw; u32 pipe_id; + std::array tmp_packet; + u32 tmp_dwords; }; Common::SlotVector asc_queues{}; @@ -1541,7 +1544,7 @@ private: Task ProcessGraphics(std::span dcb, std::span ccb); Task ProcessCeUpdate(std::span ccb); template - Task ProcessCompute(std::span acb, u32 vqid); + Task ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid); void Process(std::stop_token stoken);