Move presentation to separate thread/improve sync (#303)

* video_out: Move presentation to separate thread
* liverpool: Better sync for CPU flips
* driver: Make flip blocking
* videoout: Proper flip rate and vblank management
* config: Add vblank divider option
* clang format
* videoout: added `sceVideoOutWaitVblank`
* clang format
* vk_scheduler: Silly merge conflict
* externals: Add renderdoc API
* clang format
* reuse
* rdoc: manual capture trigger
* clang fmt

---------

Co-authored-by: psucien <168137814+psucien@users.noreply.github.com>
2025-06-08 03:33:14 +00:00 · 2024-07-28 16:54:09 +03:00 · 2024-07-28 16:54:09 +03:00 · 0d6edaa0a0
commit 0d6edaa0a0
parent 361412031c
32 changed files with 1259 additions and 224 deletions
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -63,44 +63,30 @@ bool CanBlitToSwapchain(const vk::PhysicalDevice physical_device, vk::Format for
    };
 }

-RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool)
-    : window{window_}, instance{window, Config::getGpuId(), Config::vkValidationEnabled()},
-      scheduler{instance}, swapchain{instance, window}, texture_cache{instance, scheduler} {
-    rasterizer = std::make_unique<Rasterizer>(instance, scheduler, texture_cache, liverpool);
+RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool_)
+    : window{window_}, liverpool{liverpool_},
+      instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, draw_scheduler{instance},
+      present_scheduler{instance}, flip_scheduler{instance}, swapchain{instance, window},
+      texture_cache{instance, draw_scheduler} {
+    rasterizer = std::make_unique<Rasterizer>(instance, draw_scheduler, texture_cache, liverpool);
    const u32 num_images = swapchain.GetImageCount();
    const vk::Device device = instance.GetDevice();

-    const vk::CommandPoolCreateInfo pool_info = {
-        .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer |
-                 vk::CommandPoolCreateFlagBits::eTransient,
-        .queueFamilyIndex = instance.GetGraphicsQueueFamilyIndex(),
-    };
-    command_pool = device.createCommandPoolUnique(pool_info);
-
-    const vk::CommandBufferAllocateInfo alloc_info = {
-        .commandPool = *command_pool,
-        .level = vk::CommandBufferLevel::ePrimary,
-        .commandBufferCount = num_images,
-    };
-
-    const auto cmdbuffers = device.allocateCommandBuffers(alloc_info);
+    // Create presentation frames.
    present_frames.resize(num_images);
    for (u32 i = 0; i < num_images; i++) {
        Frame& frame = present_frames[i];
-        frame.cmdbuf = cmdbuffers[i];
-        frame.render_ready = device.createSemaphore({});
        frame.present_done = device.createFence({.flags = vk::FenceCreateFlagBits::eSignaled});
        free_queue.push(&frame);
    }
 }

 RendererVulkan::~RendererVulkan() {
-    scheduler.Finish();
+    draw_scheduler.Finish();
    const vk::Device device = instance.GetDevice();
    for (auto& frame : present_frames) {
        vmaDestroyImage(instance.GetAllocator(), frame.image, frame.allocation);
        device.destroyImageView(frame.image_view);
-        device.destroySemaphore(frame.render_ready);
        device.destroyFence(frame.present_done);
    }
 }
@@ -184,7 +170,7 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) {
            info.pitch = splash->GetImageInfo().width;
            info.guest_address = VAddr(splash->GetImageData().data());
            info.guest_size_bytes = splash->GetImageData().size();
-            splash_img.emplace(instance, scheduler, info);
+            splash_img.emplace(instance, present_scheduler, info);
            texture_cache.RefreshImage(*splash_img);
        }
        frame = PrepareFrameInternal(*splash_img);
@@ -193,12 +179,18 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) {
    return true;
 }

-Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) {
+Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image, bool is_eop) {
    // Request a free presentation frame.
    Frame* frame = GetRenderFrame();

-    // Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image.
-    image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
+    // EOP flips are triggered from GPU thread so use the drawing scheduler to record
+    // commands. Otherwise we are dealing with a CPU flip which could have arrived
+    // from any guest thread. Use a separate scheduler for that.
+    auto& scheduler = is_eop ? draw_scheduler : flip_scheduler;
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+
+    image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead, cmdbuf);

    const std::array pre_barrier{
        vk::ImageMemoryBarrier{
@@ -218,12 +210,11 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) {
            },
        },
    };
-
-    const auto cmdbuf = scheduler.CommandBuffer();
    cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
                           vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
                           {}, {}, pre_barrier);

+    // Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image.
    cmdbuf.blitImage(
        image.image, image.layout, frame->image, vk::ImageLayout::eTransferDstOptimal,
        MakeImageBlit(image.info.size.width, image.info.size.height, frame->width, frame->height),
@@ -245,13 +236,15 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) {
            .layerCount = VK_REMAINING_ARRAY_LAYERS,
        },
    };
-
    cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
                           vk::PipelineStageFlagBits::eAllCommands,
                           vk::DependencyFlagBits::eByRegion, {}, {}, post_barrier);

-    // Flush pending vulkan operations.
-    scheduler.Flush(frame->render_ready);
+    // Flush frame creation commands.
+    frame->ready_semaphore = scheduler.GetMasterSemaphore()->Handle();
+    frame->ready_tick = scheduler.CurrentTick();
+    SubmitInfo info{};
+    scheduler.Flush(info);
    return frame;
 }

@@ -260,11 +253,8 @@ void RendererVulkan::Present(Frame* frame) {

    const vk::Image swapchain_image = swapchain.Image();

-    const vk::CommandBufferBeginInfo begin_info = {
-        .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
-    };
-    const vk::CommandBuffer cmdbuf = frame->cmdbuf;
-    cmdbuf.begin(begin_info);
+    auto& scheduler = present_scheduler;
+    const auto cmdbuf = scheduler.CommandBuffer();
    {
        auto* profiler_ctx = instance.GetProfilerContext();
        TracyVkNamedZoneC(profiler_ctx, renderer_gpu_zone, cmdbuf, "Host frame",
@@ -339,35 +329,17 @@ void RendererVulkan::Present(Frame* frame) {
            TracyVkCollect(profiler_ctx, cmdbuf);
        }
    }
-    cmdbuf.end();

-    static constexpr std::array<vk::PipelineStageFlags, 2> wait_stage_masks = {
-        vk::PipelineStageFlagBits::eColorAttachmentOutput,
-        vk::PipelineStageFlagBits::eAllGraphics,
-    };
-
-    const vk::Semaphore present_ready = swapchain.GetPresentReadySemaphore();
-    const vk::Semaphore image_acquired = swapchain.GetImageAcquiredSemaphore();
-    const std::array wait_semaphores = {image_acquired, frame->render_ready};
-
-    vk::SubmitInfo submit_info = {
-        .waitSemaphoreCount = static_cast<u32>(wait_semaphores.size()),
-        .pWaitSemaphores = wait_semaphores.data(),
-        .pWaitDstStageMask = wait_stage_masks.data(),
-        .commandBufferCount = 1u,
-        .pCommandBuffers = &cmdbuf,
-        .signalSemaphoreCount = 1,
-        .pSignalSemaphores = &present_ready,
-    };
-
-    std::scoped_lock submit_lock{scheduler.submit_mutex};
-    try {
-        instance.GetGraphicsQueue().submit(submit_info, frame->present_done);
-    } catch (vk::DeviceLostError& err) {
-        LOG_CRITICAL(Render_Vulkan, "Device lost during present submit: {}", err.what());
-        UNREACHABLE();
-    }
+    // Flush vulkan commands.
+    SubmitInfo info{};
+    info.AddWait(swapchain.GetImageAcquiredSemaphore());
+    info.AddWait(frame->ready_semaphore, frame->ready_tick);
+    info.AddSignal(swapchain.GetPresentReadySemaphore());
+    info.AddSignal(frame->present_done);
+    scheduler.Flush(info);

+    // Present to swapchain.
+    std::scoped_lock submit_lock{Scheduler::submit_mutex};
    swapchain.Present();

    // Free the frame for reuse
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -26,9 +26,15 @@ struct Frame {
    VmaAllocation allocation;
    vk::Image image;
    vk::ImageView image_view;
-    vk::Semaphore render_ready;
    vk::Fence present_done;
-    vk::CommandBuffer cmdbuf;
+    vk::Semaphore ready_semaphore;
+    u64 ready_tick;
+};
+
+enum SchedulerType {
+    Draw,
+    Present,
+    CpuFlip,
 };

 class Rasterizer;
@@ -39,16 +45,16 @@ public:
    ~RendererVulkan();

    Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute,
-                        VAddr cpu_address) {
+                        VAddr cpu_address, bool is_eop) {
        const auto info = VideoCore::ImageInfo{attribute, cpu_address};
        const auto image_id = texture_cache.FindImage(info, cpu_address);
        auto& image = texture_cache.GetImage(image_id);
-        return PrepareFrameInternal(image);
+        return PrepareFrameInternal(image, is_eop);
    }

    Frame* PrepareBlankFrame() {
        auto& image = texture_cache.GetImage(VideoCore::NULL_IMAGE_ID);
-        return PrepareFrameInternal(image);
+        return PrepareFrameInternal(image, true);
    }

    VideoCore::Image& RegisterVideoOutSurface(
@@ -60,9 +66,9 @@ public:
    }

    bool IsVideoOutSurface(const AmdGpu::Liverpool::ColorBuffer& color_buffer) {
-        return std::find_if(vo_buffers_addr.cbegin(), vo_buffers_addr.cend(), [&](VAddr vo_buffer) {
+        return std::ranges::find_if(vo_buffers_addr, [&](VAddr vo_buffer) {
                   return vo_buffer == color_buffer.Address();
-               }) != vo_buffers_addr.cend();
+               }) != vo_buffers_addr.end();
    }

    bool ShowSplash(Frame* frame = nullptr);
@@ -70,13 +76,16 @@ public:
    void RecreateFrame(Frame* frame, u32 width, u32 height);

 private:
-    Frame* PrepareFrameInternal(VideoCore::Image& image);
+    Frame* PrepareFrameInternal(VideoCore::Image& image, bool is_eop = true);
    Frame* GetRenderFrame();

 private:
    Frontend::WindowSDL& window;
+    AmdGpu::Liverpool* liverpool;
    Instance instance;
-    Scheduler scheduler;
+    Scheduler draw_scheduler;
+    Scheduler present_scheduler;
+    Scheduler flip_scheduler;
    Swapchain swapchain;
    std::unique_ptr<Rasterizer> rasterizer;
    VideoCore::TextureCache texture_cache;
--- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp
@@ -2,8 +2,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include <limits>
-#include <mutex>
-#include "common/assert.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_master_semaphore.h"

@@ -60,46 +58,4 @@ void MasterSemaphore::Wait(u64 tick) {
    Refresh();
 }

-void MasterSemaphore::SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal,
-                                 u64 signal_value) {
-    cmdbuf.end();
-
-    const u32 num_signal_semaphores = signal ? 2U : 1U;
-    const std::array signal_values{signal_value, u64(0)};
-    const std::array signal_semaphores{Handle(), signal};
-
-    const u32 num_wait_semaphores = wait ? 2U : 1U;
-    const std::array wait_values{signal_value - 1, u64(1)};
-    const std::array wait_semaphores{Handle(), wait};
-
-    static constexpr std::array<vk::PipelineStageFlags, 2> wait_stage_masks = {
-        vk::PipelineStageFlagBits::eAllCommands,
-        vk::PipelineStageFlagBits::eColorAttachmentOutput,
-    };
-
-    const vk::TimelineSemaphoreSubmitInfo timeline_si = {
-        .waitSemaphoreValueCount = num_wait_semaphores,
-        .pWaitSemaphoreValues = wait_values.data(),
-        .signalSemaphoreValueCount = num_signal_semaphores,
-        .pSignalSemaphoreValues = signal_values.data(),
-    };
-
-    const vk::SubmitInfo submit_info = {
-        .pNext = &timeline_si,
-        .waitSemaphoreCount = num_wait_semaphores,
-        .pWaitSemaphores = wait_semaphores.data(),
-        .pWaitDstStageMask = wait_stage_masks.data(),
-        .commandBufferCount = 1u,
-        .pCommandBuffers = &cmdbuf,
-        .signalSemaphoreCount = num_signal_semaphores,
-        .pSignalSemaphores = signal_semaphores.data(),
-    };
-
-    try {
-        instance.GetGraphicsQueue().submit(submit_info);
-    } catch (vk::DeviceLostError& err) {
-        UNREACHABLE_MSG("Device lost during submit: {}", err.what());
-    }
-}
-
 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_master_semaphore.h
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h
@@ -46,10 +46,6 @@ public:
    /// Waits for a tick to be hit on the GPU
    void Wait(u64 tick);

-    /// Submits the provided command buffer for execution
-    void SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal,
-                    u64 signal_value);
-
 protected:
    const Instance& instance;
    vk::UniqueSemaphore semaphore;    ///< Timeline semaphore.
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -96,6 +96,13 @@ void Rasterizer::DispatchDirect() {
    cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z);
 }

+u64 Rasterizer::Flush() {
+    const u64 current_tick = scheduler.CurrentTick();
+    SubmitInfo info{};
+    scheduler.Flush(info);
+    return current_tick;
+}
+
 void Rasterizer::BeginRendering() {
    const auto& regs = liverpool->regs;
    RenderState state;
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -36,6 +36,8 @@ public:
    void ScopeMarkerBegin(const std::string& str);
    void ScopeMarkerEnd();

+    u64 Flush();
+
 private:
    u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset);
    void MapMemory(VAddr addr, size_t size);
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -2,12 +2,15 @@
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include <mutex>
+#include "common/assert.h"
 #include "common/debug.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"

 namespace Vulkan {

+std::mutex Scheduler::submit_mutex;
+
 Scheduler::Scheduler(const Instance& instance)
    : instance{instance}, master_semaphore{instance}, command_pool{instance, &master_semaphore} {
    profiler_scope = reinterpret_cast<tracy::VkCtxScope*>(std::malloc(sizeof(tracy::VkCtxScope)));
@@ -50,22 +53,24 @@ void Scheduler::EndRendering() {
    current_cmdbuf.endRendering();
 }

-void Scheduler::Flush(vk::Semaphore signal, vk::Semaphore wait) {
-    // When flushing, we only send data to the worker thread; no waiting is necessary.
-    SubmitExecution(signal, wait);
+void Scheduler::Flush(SubmitInfo& info) {
+    // When flushing, we only send data to the driver; no waiting is necessary.
+    SubmitExecution(info);
 }

-void Scheduler::Finish(vk::Semaphore signal, vk::Semaphore wait) {
+void Scheduler::Finish() {
    // When finishing, we need to wait for the submission to have executed on the device.
    const u64 presubmit_tick = CurrentTick();
-    SubmitExecution(signal, wait);
+    SubmitInfo info{};
+    SubmitExecution(info);
    Wait(presubmit_tick);
 }

 void Scheduler::Wait(u64 tick) {
    if (tick >= master_semaphore.CurrentTick()) {
        // Make sure we are not waiting for the current tick without signalling
-        Flush();
+        SubmitInfo info{};
+        Flush(info);
    }
    master_semaphore.Wait(tick);
 }
@@ -86,7 +91,7 @@ void Scheduler::AllocateWorkerCommandBuffers() {
    }
 }

-void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore) {
+void Scheduler::SubmitExecution(SubmitInfo& info) {
    std::scoped_lock lk{submit_mutex};
    const u64 signal_value = master_semaphore.NextTick();

@@ -97,7 +102,40 @@ void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wa
    }

    EndRendering();
-    master_semaphore.SubmitWork(current_cmdbuf, wait_semaphore, signal_semaphore, signal_value);
+    current_cmdbuf.end();
+
+    const vk::Semaphore timeline = master_semaphore.Handle();
+    info.AddSignal(timeline, signal_value);
+
+    static constexpr std::array<vk::PipelineStageFlags, 2> wait_stage_masks = {
+        vk::PipelineStageFlagBits::eAllCommands,
+        vk::PipelineStageFlagBits::eColorAttachmentOutput,
+    };
+
+    const vk::TimelineSemaphoreSubmitInfo timeline_si = {
+        .waitSemaphoreValueCount = static_cast<u32>(info.wait_ticks.size()),
+        .pWaitSemaphoreValues = info.wait_ticks.data(),
+        .signalSemaphoreValueCount = static_cast<u32>(info.signal_ticks.size()),
+        .pSignalSemaphoreValues = info.signal_ticks.data(),
+    };
+
+    const vk::SubmitInfo submit_info = {
+        .pNext = &timeline_si,
+        .waitSemaphoreCount = static_cast<u32>(info.wait_semas.size()),
+        .pWaitSemaphores = info.wait_semas.data(),
+        .pWaitDstStageMask = wait_stage_masks.data(),
+        .commandBufferCount = 1U,
+        .pCommandBuffers = &current_cmdbuf,
+        .signalSemaphoreCount = static_cast<u32>(info.signal_semas.size()),
+        .pSignalSemaphores = info.signal_semas.data(),
+    };
+
+    try {
+        instance.GetGraphicsQueue().submit(submit_info, info.fence);
+    } catch (vk::DeviceLostError& err) {
+        UNREACHABLE_MSG("Device lost during submit: {}", err.what());
+    }
+
    master_semaphore.Refresh();
    AllocateWorkerCommandBuffers();

--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -26,16 +26,39 @@ struct RenderState {
    }
 };

+struct SubmitInfo {
+    boost::container::static_vector<vk::Semaphore, 3> wait_semas;
+    boost::container::static_vector<u64, 3> wait_ticks;
+    boost::container::static_vector<vk::Semaphore, 3> signal_semas;
+    boost::container::static_vector<u64, 3> signal_ticks;
+    vk::Fence fence;
+
+    void AddWait(vk::Semaphore semaphore, u64 tick = 1) {
+        wait_semas.emplace_back(semaphore);
+        wait_ticks.emplace_back(tick);
+    }
+
+    void AddSignal(vk::Semaphore semaphore, u64 tick = 1) {
+        signal_semas.emplace_back(semaphore);
+        signal_ticks.emplace_back(tick);
+    }
+
+    void AddSignal(vk::Fence fence) {
+        this->fence = fence;
+    }
+};
+
 class Scheduler {
 public:
    explicit Scheduler(const Instance& instance);
    ~Scheduler();

-    /// Sends the current execution context to the GPU.
-    void Flush(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr);
+    /// Sends the current execution context to the GPU
+    /// and increments the scheduler timeline semaphore.
+    void Flush(SubmitInfo& info);

    /// Sends the current execution context to the GPU and waits for it to complete.
-    void Finish(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr);
+    void Finish();

    /// Waits for the given tick to trigger on the GPU.
    void Wait(u64 tick);
@@ -76,12 +99,12 @@ public:
        pending_ops.emplace(func, CurrentTick());
    }

-    std::mutex submit_mutex;
+    static std::mutex submit_mutex;

 private:
    void AllocateWorkerCommandBuffers();

-    void SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore);
+    void SubmitExecution(SubmitInfo& info);

 private:
    const Instance& instance;
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -55,7 +55,7 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) {
        .pQueueFamilyIndices = queue_family_indices.data(),
        .preTransform = transform,
        .compositeAlpha = composite_alpha,
-        .presentMode = vk::PresentModeKHR::eFifo,
+        .presentMode = vk::PresentModeKHR::eMailbox,
        .clipped = true,
        .oldSwapchain = nullptr,
    };