video_core: added support for indirect draws (#678)

* video_core: added support for indirect draws

* barriers simplified
This commit is contained in:
psucien 2024-08-30 22:59:56 +02:00 committed by GitHub
parent 3d375a28eb
commit ca1613258f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 140 additions and 35 deletions

View file

@ -368,6 +368,36 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
break;
}
case PM4ItOpcode::DrawIndirect: {
const auto* draw_indirect = reinterpret_cast<const PM4CmdDrawIndirect*>(header);
const auto offset = draw_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(PM4CmdDrawIndirect::DrawInstancedArgs);
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndirect", cmd_address));
rasterizer->Breadcrumb(u64(cmd_address));
rasterizer->DrawIndirect(false, ib_address, offset, size);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DrawIndexIndirect: {
const auto* draw_index_indirect =
reinterpret_cast<const PM4CmdDrawIndexIndirect*>(header);
const auto offset = draw_index_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(PM4CmdDrawIndexIndirect::DrawIndexInstancedArgs);
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(
fmt::format("dcb:{}:DrawIndexIndirect", cmd_address));
rasterizer->Breadcrumb(u64(cmd_address));
rasterizer->DrawIndirect(true, ib_address, offset, size);
rasterizer->ScopeMarkerEnd();
}
break;
}
case PM4ItOpcode::DispatchDirect: {
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
regs.cs_program.dim_x = dispatch_direct->dim_x;
@ -488,6 +518,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
break;
}
case PM4ItOpcode::PfpSyncMe: {
rasterizer->CpSync();
break;
}
default:

View file

@ -253,20 +253,6 @@ struct PM4CmdDrawIndexAuto {
u32 draw_initiator;
};
struct PM4CmdDrawIndirect {
PM4Type3Header header; ///< header
u32 data_offset; ///< DWORD aligned offset
union {
u32 dw2;
BitField<0, 16, u32> base_vtx_loc; ///< base vertex location
};
union {
u32 dw3;
BitField<0, 16, u32> start_inst_loc; ///< start instance location
};
u32 draw_initiator; ///< Draw Initiator Register
};
enum class DataSelect : u32 {
None = 0,
Data32Low = 1,
@ -740,4 +726,51 @@ struct PM4CmdDispatchIndirect {
u32 dispatch_initiator; ///< Dispatch Initiator Register
};
struct PM4CmdDrawIndirect {
struct DrawInstancedArgs {
u32 vertex_count_per_instance;
u32 instance_count;
u32 start_vertex_location;
u32 start_instance_location;
};
PM4Type3Header header; ///< header
u32 data_offset; ///< Byte aligned offset where the required data structure starts
union {
u32 dw2;
BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the
///< BaseVertexLocation it fetched from memory
};
union {
u32 dw3;
BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the
///< StartInstanceLocation it fetched from memory
};
u32 draw_initiator; ///< Draw Initiator Register
};
struct PM4CmdDrawIndexIndirect {
struct DrawIndexInstancedArgs {
u32 index_count_per_instance;
u32 instance_count;
u32 start_index_location;
u32 base_vertex_location;
u32 start_instance_location;
};
PM4Type3Header header; ///< header
u32 data_offset; ///< Byte aligned offset where the required data structure starts
union {
u32 dw2;
BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the
///< BaseVertexLocation it fetched from memory
};
union { // NOTE: this one is undocumented in AMD spec, but Gnm driver writes this field
u32 dw3;
BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the
///< StartInstanceLocation it fetched from memory
};
u32 draw_initiator; ///< Draw Initiator Register
};
} // namespace AmdGpu

View file

@ -29,6 +29,19 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
Rasterizer::~Rasterizer() = default;
void Rasterizer::CpSync() {
scheduler.EndRendering();
auto cmdbuf = scheduler.CommandBuffer();
const vk::MemoryBarrier ib_barrier{
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
.dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eDrawIndirect,
vk::DependencyFlagBits::eByRegion, ib_barrier, {}, {});
}
void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
RENDERER_TRACE;
@ -66,6 +79,45 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
}
}
void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size) {
RENDERER_TRACE;
const auto cmdbuf = scheduler.CommandBuffer();
const auto& regs = liverpool->regs;
const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline();
if (!pipeline) {
return;
}
ASSERT_MSG(regs.primitive_type != AmdGpu::Liverpool::PrimitiveType::RectList,
"Unsupported primitive type for indirect draw");
try {
pipeline->BindResources(regs, buffer_cache, texture_cache);
} catch (...) {
UNREACHABLE();
}
const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex);
buffer_cache.BindVertexBuffers(vs_info);
const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, 0);
BeginRendering();
UpdateDynamicState(*pipeline);
const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true);
const auto total_offset = base + offset;
// We can safely ignore both SGPR UD indices and results of fetch shader parsing, as vertex and
// instance offsets will be automatically applied by Vulkan from indirect args buffer.
if (is_indexed) {
cmdbuf.drawIndexedIndirect(buffer->Handle(), total_offset, 1, 0);
} else {
cmdbuf.drawIndirect(buffer->Handle(), total_offset, 1, 0);
}
}
void Rasterizer::DispatchDirect() {
RENDERER_TRACE;
@ -113,19 +165,6 @@ void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) {
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true);
const auto total_offset = base + offset;
// Emulate PFP-to-ME sync packet
const vk::BufferMemoryBarrier ib_barrier{
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
.dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead,
.buffer = buffer->Handle(),
.offset = total_offset,
.size = size,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eDrawIndirect,
vk::DependencyFlagBits::eByRegion, {}, ib_barrier, {});
cmdbuf.dispatchIndirect(buffer->Handle(), total_offset);
}

View file

@ -32,6 +32,7 @@ public:
}
void Draw(bool is_indexed, u32 index_offset = 0);
void DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size);
void DispatchDirect();
void DispatchIndirect(VAddr address, u32 offset, u32 size);
@ -45,6 +46,7 @@ public:
void MapMemory(VAddr addr, u64 size);
void UnmapMemory(VAddr addr, u64 size);
void CpSync();
u64 Flush();
private: