Move presentation to separate thread/improve sync (#303)

* video_out: Move presentation to separate thread

* liverpool: Better sync for CPU flips

* driver: Make flip blocking

* videoout: Proper flip rate and vblank management

* config: Add vblank divider option

* clang format

* videoout: added `sceVideoOutWaitVblank`

* clang format

* vk_scheduler: Silly merge conflict

* externals: Add renderdoc API

* clang format

* reuse

* rdoc: manual capture trigger

* clang fmt

---------

Co-authored-by: psucien <168137814+psucien@users.noreply.github.com>
This commit is contained in:
TheTurtle 2024-07-28 16:54:09 +03:00 committed by GitHub
parent 361412031c
commit 0d6edaa0a0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
32 changed files with 1259 additions and 224 deletions

View file

@ -63,44 +63,30 @@ bool CanBlitToSwapchain(const vk::PhysicalDevice physical_device, vk::Format for
};
}
RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool)
: window{window_}, instance{window, Config::getGpuId(), Config::vkValidationEnabled()},
scheduler{instance}, swapchain{instance, window}, texture_cache{instance, scheduler} {
rasterizer = std::make_unique<Rasterizer>(instance, scheduler, texture_cache, liverpool);
RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool_)
: window{window_}, liverpool{liverpool_},
instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, draw_scheduler{instance},
present_scheduler{instance}, flip_scheduler{instance}, swapchain{instance, window},
texture_cache{instance, draw_scheduler} {
rasterizer = std::make_unique<Rasterizer>(instance, draw_scheduler, texture_cache, liverpool);
const u32 num_images = swapchain.GetImageCount();
const vk::Device device = instance.GetDevice();
const vk::CommandPoolCreateInfo pool_info = {
.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer |
vk::CommandPoolCreateFlagBits::eTransient,
.queueFamilyIndex = instance.GetGraphicsQueueFamilyIndex(),
};
command_pool = device.createCommandPoolUnique(pool_info);
const vk::CommandBufferAllocateInfo alloc_info = {
.commandPool = *command_pool,
.level = vk::CommandBufferLevel::ePrimary,
.commandBufferCount = num_images,
};
const auto cmdbuffers = device.allocateCommandBuffers(alloc_info);
// Create presentation frames.
present_frames.resize(num_images);
for (u32 i = 0; i < num_images; i++) {
Frame& frame = present_frames[i];
frame.cmdbuf = cmdbuffers[i];
frame.render_ready = device.createSemaphore({});
frame.present_done = device.createFence({.flags = vk::FenceCreateFlagBits::eSignaled});
free_queue.push(&frame);
}
}
RendererVulkan::~RendererVulkan() {
scheduler.Finish();
draw_scheduler.Finish();
const vk::Device device = instance.GetDevice();
for (auto& frame : present_frames) {
vmaDestroyImage(instance.GetAllocator(), frame.image, frame.allocation);
device.destroyImageView(frame.image_view);
device.destroySemaphore(frame.render_ready);
device.destroyFence(frame.present_done);
}
}
@ -184,7 +170,7 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) {
info.pitch = splash->GetImageInfo().width;
info.guest_address = VAddr(splash->GetImageData().data());
info.guest_size_bytes = splash->GetImageData().size();
splash_img.emplace(instance, scheduler, info);
splash_img.emplace(instance, present_scheduler, info);
texture_cache.RefreshImage(*splash_img);
}
frame = PrepareFrameInternal(*splash_img);
@ -193,12 +179,18 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) {
return true;
}
Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) {
Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image, bool is_eop) {
// Request a free presentation frame.
Frame* frame = GetRenderFrame();
// Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image.
image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
// EOP flips are triggered from GPU thread so use the drawing scheduler to record
// commands. Otherwise we are dealing with a CPU flip which could have arrived
// from any guest thread. Use a separate scheduler for that.
auto& scheduler = is_eop ? draw_scheduler : flip_scheduler;
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead, cmdbuf);
const std::array pre_barrier{
vk::ImageMemoryBarrier{
@ -218,12 +210,11 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) {
},
},
};
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
{}, {}, pre_barrier);
// Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image.
cmdbuf.blitImage(
image.image, image.layout, frame->image, vk::ImageLayout::eTransferDstOptimal,
MakeImageBlit(image.info.size.width, image.info.size.height, frame->width, frame->height),
@ -245,13 +236,15 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) {
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
vk::PipelineStageFlagBits::eAllCommands,
vk::DependencyFlagBits::eByRegion, {}, {}, post_barrier);
// Flush pending vulkan operations.
scheduler.Flush(frame->render_ready);
// Flush frame creation commands.
frame->ready_semaphore = scheduler.GetMasterSemaphore()->Handle();
frame->ready_tick = scheduler.CurrentTick();
SubmitInfo info{};
scheduler.Flush(info);
return frame;
}
@ -260,11 +253,8 @@ void RendererVulkan::Present(Frame* frame) {
const vk::Image swapchain_image = swapchain.Image();
const vk::CommandBufferBeginInfo begin_info = {
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
};
const vk::CommandBuffer cmdbuf = frame->cmdbuf;
cmdbuf.begin(begin_info);
auto& scheduler = present_scheduler;
const auto cmdbuf = scheduler.CommandBuffer();
{
auto* profiler_ctx = instance.GetProfilerContext();
TracyVkNamedZoneC(profiler_ctx, renderer_gpu_zone, cmdbuf, "Host frame",
@ -339,35 +329,17 @@ void RendererVulkan::Present(Frame* frame) {
TracyVkCollect(profiler_ctx, cmdbuf);
}
}
cmdbuf.end();
static constexpr std::array<vk::PipelineStageFlags, 2> wait_stage_masks = {
vk::PipelineStageFlagBits::eColorAttachmentOutput,
vk::PipelineStageFlagBits::eAllGraphics,
};
const vk::Semaphore present_ready = swapchain.GetPresentReadySemaphore();
const vk::Semaphore image_acquired = swapchain.GetImageAcquiredSemaphore();
const std::array wait_semaphores = {image_acquired, frame->render_ready};
vk::SubmitInfo submit_info = {
.waitSemaphoreCount = static_cast<u32>(wait_semaphores.size()),
.pWaitSemaphores = wait_semaphores.data(),
.pWaitDstStageMask = wait_stage_masks.data(),
.commandBufferCount = 1u,
.pCommandBuffers = &cmdbuf,
.signalSemaphoreCount = 1,
.pSignalSemaphores = &present_ready,
};
std::scoped_lock submit_lock{scheduler.submit_mutex};
try {
instance.GetGraphicsQueue().submit(submit_info, frame->present_done);
} catch (vk::DeviceLostError& err) {
LOG_CRITICAL(Render_Vulkan, "Device lost during present submit: {}", err.what());
UNREACHABLE();
}
// Flush vulkan commands.
SubmitInfo info{};
info.AddWait(swapchain.GetImageAcquiredSemaphore());
info.AddWait(frame->ready_semaphore, frame->ready_tick);
info.AddSignal(swapchain.GetPresentReadySemaphore());
info.AddSignal(frame->present_done);
scheduler.Flush(info);
// Present to swapchain.
std::scoped_lock submit_lock{Scheduler::submit_mutex};
swapchain.Present();
// Free the frame for reuse

View file

@ -26,9 +26,15 @@ struct Frame {
VmaAllocation allocation;
vk::Image image;
vk::ImageView image_view;
vk::Semaphore render_ready;
vk::Fence present_done;
vk::CommandBuffer cmdbuf;
vk::Semaphore ready_semaphore;
u64 ready_tick;
};
/// Names a scheduler role.
/// NOTE(review): the enumerators look like they correspond to the renderer's draw,
/// present and CPU-flip schedulers, but no use of this enum is visible here — confirm.
enum SchedulerType {
Draw,
Present,
CpuFlip,
};
class Rasterizer;
@ -39,16 +45,16 @@ public:
~RendererVulkan();
Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute,
VAddr cpu_address) {
VAddr cpu_address, bool is_eop) {
const auto info = VideoCore::ImageInfo{attribute, cpu_address};
const auto image_id = texture_cache.FindImage(info, cpu_address);
auto& image = texture_cache.GetImage(image_id);
return PrepareFrameInternal(image);
return PrepareFrameInternal(image, is_eop);
}
Frame* PrepareBlankFrame() {
auto& image = texture_cache.GetImage(VideoCore::NULL_IMAGE_ID);
return PrepareFrameInternal(image);
return PrepareFrameInternal(image, true);
}
VideoCore::Image& RegisterVideoOutSurface(
@ -60,9 +66,9 @@ public:
}
bool IsVideoOutSurface(const AmdGpu::Liverpool::ColorBuffer& color_buffer) {
return std::find_if(vo_buffers_addr.cbegin(), vo_buffers_addr.cend(), [&](VAddr vo_buffer) {
return std::ranges::find_if(vo_buffers_addr, [&](VAddr vo_buffer) {
return vo_buffer == color_buffer.Address();
}) != vo_buffers_addr.cend();
}) != vo_buffers_addr.end();
}
bool ShowSplash(Frame* frame = nullptr);
@ -70,13 +76,16 @@ public:
void RecreateFrame(Frame* frame, u32 width, u32 height);
private:
Frame* PrepareFrameInternal(VideoCore::Image& image);
Frame* PrepareFrameInternal(VideoCore::Image& image, bool is_eop = true);
Frame* GetRenderFrame();
private:
Frontend::WindowSDL& window;
AmdGpu::Liverpool* liverpool;
Instance instance;
Scheduler scheduler;
Scheduler draw_scheduler;
Scheduler present_scheduler;
Scheduler flip_scheduler;
Swapchain swapchain;
std::unique_ptr<Rasterizer> rasterizer;
VideoCore::TextureCache texture_cache;

View file

@ -2,8 +2,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <limits>
#include <mutex>
#include "common/assert.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_master_semaphore.h"
@ -60,46 +58,4 @@ void MasterSemaphore::Wait(u64 tick) {
Refresh();
}
/// Ends `cmdbuf` and submits it to the graphics queue.
///
/// The submission always waits on Handle() at `signal_value - 1` and signals Handle()
/// at `signal_value`. `wait` and `signal` are appended to the wait/signal lists only
/// when they are non-null.
///
/// @param cmdbuf       Command buffer to end and submit.
/// @param wait         Optional additional semaphore to wait on (may be null).
/// @param signal       Optional additional semaphore to signal (may be null).
/// @param signal_value Value signalled on Handle() by this submission.
void MasterSemaphore::SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal,
u64 signal_value) {
cmdbuf.end();
// Slot 0 of every array below belongs to Handle(); slot 1 belongs to the optional
// extra semaphore and is only counted when that semaphore is non-null.
const u32 num_signal_semaphores = signal ? 2U : 1U;
// NOTE(review): the 0 / 1 values in slot 1 are presumably placeholders for binary
// semaphores, whose values Vulkan ignores — confirm `wait`/`signal` are binary.
const std::array signal_values{signal_value, u64(0)};
const std::array signal_semaphores{Handle(), signal};
const u32 num_wait_semaphores = wait ? 2U : 1U;
const std::array wait_values{signal_value - 1, u64(1)};
const std::array wait_semaphores{Handle(), wait};
// One destination stage mask per wait slot, in the same order as wait_semaphores.
static constexpr std::array<vk::PipelineStageFlags, 2> wait_stage_masks = {
vk::PipelineStageFlagBits::eAllCommands,
vk::PipelineStageFlagBits::eColorAttachmentOutput,
};
// Carries the per-semaphore values; chained into submit_info through pNext.
const vk::TimelineSemaphoreSubmitInfo timeline_si = {
.waitSemaphoreValueCount = num_wait_semaphores,
.pWaitSemaphoreValues = wait_values.data(),
.signalSemaphoreValueCount = num_signal_semaphores,
.pSignalSemaphoreValues = signal_values.data(),
};
const vk::SubmitInfo submit_info = {
.pNext = &timeline_si,
.waitSemaphoreCount = num_wait_semaphores,
.pWaitSemaphores = wait_semaphores.data(),
.pWaitDstStageMask = wait_stage_masks.data(),
.commandBufferCount = 1u,
.pCommandBuffers = &cmdbuf,
.signalSemaphoreCount = num_signal_semaphores,
.pSignalSemaphores = signal_semaphores.data(),
};
// A lost device is reported through UNREACHABLE_MSG rather than propagated to the caller.
try {
instance.GetGraphicsQueue().submit(submit_info);
} catch (vk::DeviceLostError& err) {
UNREACHABLE_MSG("Device lost during submit: {}", err.what());
}
}
} // namespace Vulkan

View file

@ -46,10 +46,6 @@ public:
/// Waits for a tick to be hit on the GPU
void Wait(u64 tick);
/// Submits the provided command buffer for execution
void SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal,
u64 signal_value);
protected:
const Instance& instance;
vk::UniqueSemaphore semaphore; ///< Timeline semaphore.

View file

@ -96,6 +96,13 @@ void Rasterizer::DispatchDirect() {
cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z);
}
/// Flushes the scheduler with an empty SubmitInfo (no extra waits, signals or fence).
/// @return The scheduler tick sampled immediately before the flush was issued.
u64 Rasterizer::Flush() {
    // Sample the tick before flushing so the returned value is the pre-submit one.
    const u64 tick_before_submit = scheduler.CurrentTick();
    SubmitInfo empty_submit{};
    scheduler.Flush(empty_submit);
    return tick_before_submit;
}
void Rasterizer::BeginRendering() {
const auto& regs = liverpool->regs;
RenderState state;

View file

@ -36,6 +36,8 @@ public:
void ScopeMarkerBegin(const std::string& str);
void ScopeMarkerEnd();
u64 Flush();
private:
u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset);
void MapMemory(VAddr addr, size_t size);

View file

@ -2,12 +2,15 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <mutex>
#include "common/assert.h"
#include "common/debug.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
namespace Vulkan {
std::mutex Scheduler::submit_mutex;
Scheduler::Scheduler(const Instance& instance)
: instance{instance}, master_semaphore{instance}, command_pool{instance, &master_semaphore} {
profiler_scope = reinterpret_cast<tracy::VkCtxScope*>(std::malloc(sizeof(tracy::VkCtxScope)));
@ -50,22 +53,24 @@ void Scheduler::EndRendering() {
current_cmdbuf.endRendering();
}
void Scheduler::Flush(vk::Semaphore signal, vk::Semaphore wait) {
// When flushing, we only send data to the worker thread; no waiting is necessary.
SubmitExecution(signal, wait);
void Scheduler::Flush(SubmitInfo& info) {
// When flushing, we only send data to the driver; no waiting is necessary.
SubmitExecution(info);
}
void Scheduler::Finish(vk::Semaphore signal, vk::Semaphore wait) {
void Scheduler::Finish() {
// When finishing, we need to wait for the submission to have executed on the device.
const u64 presubmit_tick = CurrentTick();
SubmitExecution(signal, wait);
SubmitInfo info{};
SubmitExecution(info);
Wait(presubmit_tick);
}
void Scheduler::Wait(u64 tick) {
if (tick >= master_semaphore.CurrentTick()) {
// Make sure we are not waiting for the current tick without signalling
Flush();
SubmitInfo info{};
Flush(info);
}
master_semaphore.Wait(tick);
}
@ -86,7 +91,7 @@ void Scheduler::AllocateWorkerCommandBuffers() {
}
}
void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore) {
void Scheduler::SubmitExecution(SubmitInfo& info) {
std::scoped_lock lk{submit_mutex};
const u64 signal_value = master_semaphore.NextTick();
@ -97,7 +102,40 @@ void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wa
}
EndRendering();
master_semaphore.SubmitWork(current_cmdbuf, wait_semaphore, signal_semaphore, signal_value);
current_cmdbuf.end();
const vk::Semaphore timeline = master_semaphore.Handle();
info.AddSignal(timeline, signal_value);
static constexpr std::array<vk::PipelineStageFlags, 2> wait_stage_masks = {
vk::PipelineStageFlagBits::eAllCommands,
vk::PipelineStageFlagBits::eColorAttachmentOutput,
};
const vk::TimelineSemaphoreSubmitInfo timeline_si = {
.waitSemaphoreValueCount = static_cast<u32>(info.wait_ticks.size()),
.pWaitSemaphoreValues = info.wait_ticks.data(),
.signalSemaphoreValueCount = static_cast<u32>(info.signal_ticks.size()),
.pSignalSemaphoreValues = info.signal_ticks.data(),
};
const vk::SubmitInfo submit_info = {
.pNext = &timeline_si,
.waitSemaphoreCount = static_cast<u32>(info.wait_semas.size()),
.pWaitSemaphores = info.wait_semas.data(),
.pWaitDstStageMask = wait_stage_masks.data(),
.commandBufferCount = 1U,
.pCommandBuffers = &current_cmdbuf,
.signalSemaphoreCount = static_cast<u32>(info.signal_semas.size()),
.pSignalSemaphores = info.signal_semas.data(),
};
try {
instance.GetGraphicsQueue().submit(submit_info, info.fence);
} catch (vk::DeviceLostError& err) {
UNREACHABLE_MSG("Device lost during submit: {}", err.what());
}
master_semaphore.Refresh();
AllocateWorkerCommandBuffers();

View file

@ -26,16 +26,39 @@ struct RenderState {
}
};
/// Extra synchronization objects attached to one scheduler submission.
///
/// Semaphores and tick values are kept in parallel lists: entry i of a tick list
/// belongs to entry i of the matching semaphore list. Each list holds at most 3 entries.
/// NOTE(review): Scheduler::SubmitExecution appends its own timeline signal to the
/// signal lists and supplies only two wait stage masks, so a third wait semaphore
/// looks unsafe despite the capacity of 3 — confirm.
struct SubmitInfo {
    boost::container::static_vector<vk::Semaphore, 3> wait_semas;   ///< Semaphores to wait on.
    boost::container::static_vector<u64, 3> wait_ticks;             ///< Value paired with each wait.
    boost::container::static_vector<vk::Semaphore, 3> signal_semas; ///< Semaphores to signal.
    boost::container::static_vector<u64, 3> signal_ticks;           ///< Value paired with each signal.
    vk::Fence fence; ///< Optional fence passed along with the queue submission.

    /// Registers a semaphore to wait on together with its tick value.
    /// NOTE(review): the default tick of 1 is presumably a placeholder for binary semaphores.
    void AddWait(vk::Semaphore semaphore, u64 tick = 1) {
        wait_ticks.push_back(tick);
        wait_semas.push_back(semaphore);
    }

    /// Registers a semaphore to signal together with its tick value.
    void AddSignal(vk::Semaphore semaphore, u64 tick = 1) {
        signal_ticks.push_back(tick);
        signal_semas.push_back(semaphore);
    }

    /// Sets the fence for the submission, replacing any previously set one.
    void AddSignal(vk::Fence fence) {
        this->fence = fence;
    }
};
class Scheduler {
public:
explicit Scheduler(const Instance& instance);
~Scheduler();
/// Sends the current execution context to the GPU.
void Flush(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr);
/// Sends the current execution context to the GPU
/// and increments the scheduler timeline semaphore.
void Flush(SubmitInfo& info);
/// Sends the current execution context to the GPU and waits for it to complete.
void Finish(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr);
void Finish();
/// Waits for the given tick to trigger on the GPU.
void Wait(u64 tick);
@ -76,12 +99,12 @@ public:
pending_ops.emplace(func, CurrentTick());
}
std::mutex submit_mutex;
static std::mutex submit_mutex;
private:
void AllocateWorkerCommandBuffers();
void SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore);
void SubmitExecution(SubmitInfo& info);
private:
const Instance& instance;

View file

@ -55,7 +55,7 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) {
.pQueueFamilyIndices = queue_family_indices.data(),
.preTransform = transform,
.compositeAlpha = composite_alpha,
.presentMode = vk::PresentModeKHR::eFifo,
.presentMode = vk::PresentModeKHR::eMailbox,
.clipped = true,
.oldSwapchain = nullptr,
};