Query Cachge: Fully rework Vulkan's query cache

This commit is contained in:
Fernando Sahmkow 2023-08-04 03:32:30 +02:00
parent bdc01254a9
commit f1a2e36711
35 changed files with 1573 additions and 355 deletions

View file

@ -61,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo
if (device.IsExtTransformFeedbackSupported()) {
flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
}
if (device.IsExtConditionalRendering()) {
flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT;
}
const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,

View file

@ -12,6 +12,7 @@
#include "common/common_types.h"
#include "common/div_ceil.h"
#include "video_core/host_shaders/astc_decoder_comp_spv.h"
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
@ -302,6 +303,52 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
return {staging.buffer, staging.offset};
}
ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(const Device& device_,
Scheduler& scheduler_,
DescriptorPool& descriptor_pool_, ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS,
INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr,
RESOLVE_CONDITIONAL_RENDER_COMP_SPV),
scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
u32 src_offset, bool compare_to_zero) {
scheduler.RequestOutsideRenderPassOperationContext();
const size_t compare_size = compare_to_zero ? 8 : 24;
compute_pass_descriptor_queue.Acquire();
compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size);
compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32));
const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) {
static constexpr VkMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
};
static constexpr VkMemoryBarrier write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
};
const VkDescriptorSet set = descriptor_allocator.Commit();
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
cmdbuf.Dispatch(1, 1, 1);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier);
});
}
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,

View file

@ -82,6 +82,19 @@ private:
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class ConditionalRenderingResolvePass final : public ComputePass {
public:
explicit ConditionalRenderingResolvePass(
const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero);
private:
Scheduler& scheduler;
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class ASTCDecoderPass final : public ComputePass {
public:
explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,

View file

@ -8,6 +8,7 @@
#include "video_core/fence_manager.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
namespace Core {
class System;
@ -20,7 +21,6 @@ class RasterizerInterface;
namespace Vulkan {
class Device;
class QueryCache;
class Scheduler;
class InnerFence : public VideoCommon::FenceBase {

File diff suppressed because it is too large Load diff

View file

@ -1,101 +1,74 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
#include <cstddef>
#include <memory>
#include <utility>
#include <vector>
#include "common/common_types.h"
#include "video_core/query_cache.h"
#include "video_core/renderer_vulkan/vk_resource_pool.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/query_cache/query_cache_base.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
namespace VideoCore {
class RasterizerInterface;
}
namespace VideoCommon {
class StreamerInterface;
}
namespace Vulkan {
class CachedQuery;
class Device;
class HostCounter;
class QueryCache;
class Scheduler;
class StagingBufferPool;
using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
struct QueryCacheRuntimeImpl;
class QueryPool final : public ResourcePool {
class QueryCacheRuntime {
public:
explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type);
~QueryPool() override;
explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer,
Core::Memory::Memory& cpu_memory_,
Vulkan::BufferCache& buffer_cache_, const Device& device_,
const MemoryAllocator& memory_allocator_, Scheduler& scheduler_,
StagingBufferPool& staging_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue,
DescriptorPool& descriptor_pool);
~QueryCacheRuntime();
std::pair<VkQueryPool, u32> Commit();
template <typename SyncValuesType>
void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr);
void Reserve(std::pair<VkQueryPool, u32> query);
void Barriers(bool is_prebarrier);
protected:
void Allocate(std::size_t begin, std::size_t end) override;
void EndHostConditionalRendering();
void PauseHostConditionalRendering();
void ResumeHostConditionalRendering();
bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty);
bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1,
VideoCommon::LookupData object_2, bool qc_dirty, bool equal_check);
VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type);
void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d);
template <typename Func>
void View3DRegs(Func&& func);
private:
static constexpr std::size_t GROW_STEP = 512;
const Device& device;
const VideoCore::QueryType type;
std::vector<vk::QueryPool> pools;
std::vector<bool> usage;
void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal);
void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal);
friend struct QueryCacheRuntimeImpl;
std::unique_ptr<QueryCacheRuntimeImpl> impl;
};
class QueryCache final
: public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> {
public:
explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
Scheduler& scheduler_);
~QueryCache();
std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type);
void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query);
const Device& GetDevice() const noexcept {
return device;
}
Scheduler& GetScheduler() const noexcept {
return scheduler;
}
private:
const Device& device;
Scheduler& scheduler;
std::array<QueryPool, VideoCore::NumQueryTypes> query_pools;
struct QueryCacheParams {
using RuntimeType = Vulkan::QueryCacheRuntime;
};
class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
public:
explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
VideoCore::QueryType type_);
~HostCounter();
void EndQuery();
private:
u64 BlockingQuery(bool async = false) const override;
QueryCache& cache;
const VideoCore::QueryType type;
const std::pair<VkQueryPool, u32> query;
const u64 tick;
};
class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> {
public:
explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_)
: CachedQueryBase{cpu_addr_, host_ptr_} {}
};
using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>;
} // namespace Vulkan

View file

@ -24,6 +24,7 @@
#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
@ -170,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool),
buffer_cache(*this, cpu_memory_, buffer_cache_runtime),
query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler,
staging_pool, compute_pass_descriptor_queue, descriptor_pool),
query_cache(gpu, *this, cpu_memory_, query_cache_runtime),
pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue,
render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()),
query_cache{*this, cpu_memory_, device, scheduler},
accelerate_dma(buffer_cache, texture_cache, scheduler),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
wfi_event(device.GetLogical().CreateEvent()) {
@ -189,13 +192,15 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
FlushWork();
gpu_memory->FlushCaching();
query_cache.NotifySegment(true);
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
query_cache.UpdateCounters();
// query_cache.UpdateCounters();
}
#else
query_cache.UpdateCounters();
// query_cache.UpdateCounters();
#endif
GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()};
@ -207,13 +212,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
pipeline->SetEngine(maxwell3d, gpu_memory);
pipeline->Configure(is_indexed);
BeginTransformFeedback();
UpdateDynamicStates();
HandleTransformFeedback();
query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
maxwell3d->regs.zpass_pixel_count_enable);
draw_func();
EndTransformFeedback();
}
void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
@ -241,6 +245,14 @@ void RasterizerVulkan::DrawIndirect() {
const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer();
const auto& buffer = indirect_buffer.first;
const auto& offset = indirect_buffer.second;
if (params.is_byte_count) {
scheduler.Record([buffer_obj = buffer->Handle(), offset,
stride = params.stride](vk::CommandBuffer cmdbuf) {
cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0,
static_cast<u32>(stride));
});
return;
}
if (params.include_count) {
const auto count = buffer_cache.GetDrawIndirectCount();
const auto& draw_buffer = count.first;
@ -280,13 +292,15 @@ void RasterizerVulkan::DrawTexture() {
SCOPE_EXIT({ gpu.TickWork(); });
FlushWork();
query_cache.NotifySegment(true);
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
query_cache.UpdateCounters();
// query_cache.UpdateCounters();
}
#else
query_cache.UpdateCounters();
// query_cache.UpdateCounters();
#endif
texture_cache.SynchronizeGraphicsDescriptors();
@ -294,6 +308,8 @@ void RasterizerVulkan::DrawTexture() {
UpdateDynamicStates();
query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
maxwell3d->regs.zpass_pixel_count_enable);
const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState();
const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler);
const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture);
@ -319,12 +335,16 @@ void RasterizerVulkan::Clear(u32 layer_count) {
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
query_cache.UpdateCounters();
// query_cache.UpdateCounters();
}
#else
query_cache.UpdateCounters();
// query_cache.UpdateCounters();
#endif
query_cache.NotifySegment(true);
query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
maxwell3d->regs.zpass_pixel_count_enable);
auto& regs = maxwell3d->regs;
const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B ||
regs.clear_surface.A;
@ -482,13 +502,13 @@ void RasterizerVulkan::DispatchCompute() {
scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); });
}
void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) {
query_cache.ResetCounter(type);
void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) {
query_cache.CounterReset(type);
}
void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
std::optional<u64> timestamp) {
query_cache.Query(gpu_addr, type, timestamp);
void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
query_cache.CounterReport(gpu_addr, type, flags, payload, subreport);
}
void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@ -669,8 +689,8 @@ void RasterizerVulkan::SignalReference() {
fence_manager.SignalReference();
}
void RasterizerVulkan::ReleaseFences() {
fence_manager.WaitPendingFences();
void RasterizerVulkan::ReleaseFences(bool force) {
fence_manager.WaitPendingFences(force);
}
void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size,
@ -694,6 +714,8 @@ void RasterizerVulkan::WaitForIdle() {
flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT;
}
query_cache.NotifyWFI();
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) {
cmdbuf.SetEvent(event, flags);
@ -737,19 +759,7 @@ void RasterizerVulkan::TickFrame() {
bool RasterizerVulkan::AccelerateConditionalRendering() {
gpu_memory->FlushCaching();
if (Settings::IsGPULevelHigh()) {
// TODO(Blinkhawk): Reimplement Host conditional rendering.
return false;
}
// Medium / Low Hack: stub any checks on queries written into the buffer cache.
const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()};
Maxwell::ReportSemaphore::Compare cmp;
if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp),
VideoCommon::CacheType::BufferCache |
VideoCommon::CacheType::QueryCache)) {
return true;
}
return false;
return query_cache.AccelerateHostConditionalRendering();
}
bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
@ -795,6 +805,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
if (!image_view) {
return false;
}
query_cache.NotifySegment(false);
screen_info.image = image_view->ImageHandle();
screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D);
screen_info.width = image_view->size.width;
@ -933,31 +944,18 @@ void RasterizerVulkan::UpdateDynamicStates() {
}
}
void RasterizerVulkan::BeginTransformFeedback() {
void RasterizerVulkan::HandleTransformFeedback() {
const auto& regs = maxwell3d->regs;
if (regs.transform_feedback_enabled == 0) {
return;
}
if (!device.IsExtTransformFeedbackSupported()) {
LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
return;
}
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation));
scheduler.Record(
[](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); });
}
void RasterizerVulkan::EndTransformFeedback() {
const auto& regs = maxwell3d->regs;
if (regs.transform_feedback_enabled == 0) {
return;
query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount,
regs.transform_feedback_enabled);
if (regs.transform_feedback_enabled != 0) {
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation));
}
if (!device.IsExtTransformFeedbackSupported()) {
return;
}
scheduler.Record(
[](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
}
void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) {

View file

@ -84,8 +84,8 @@ public:
void DrawTexture() override;
void Clear(u32 layer_count) override;
void DispatchCompute() override;
void ResetCounter(VideoCore::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
void FlushAll() override;
@ -106,7 +106,7 @@ public:
void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override;
void SignalReference() override;
void ReleaseFences() override;
void ReleaseFences(bool force = true) override;
void FlushAndInvalidateRegion(
VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
void WaitForIdle() override;
@ -146,9 +146,7 @@ private:
void UpdateDynamicStates();
void BeginTransformFeedback();
void EndTransformFeedback();
void HandleTransformFeedback();
void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs);
@ -195,8 +193,9 @@ private:
TextureCache texture_cache;
BufferCacheRuntime buffer_cache_runtime;
BufferCache buffer_cache;
PipelineCache pipeline_cache;
QueryCacheRuntime query_cache_runtime;
QueryCache query_cache;
PipelineCache pipeline_cache;
AccelerateDMA accelerate_dma;
FenceManager fence_manager;

View file

@ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() {
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
query_cache->UpdateCounters();
query_cache->NotifySegment(true);
}
#else
query_cache->UpdateCounters();
query_cache->NotifySegment(true);
#endif
}
}
@ -261,11 +261,12 @@ void Scheduler::EndPendingOperations() {
#if ANDROID
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
query_cache->DisableStreams();
// query_cache->DisableStreams();
}
#else
query_cache->DisableStreams();
// query_cache->DisableStreams();
#endif
query_cache->NotifySegment(false);
EndRenderPass();
}

View file

@ -15,6 +15,7 @@
#include "common/common_types.h"
#include "common/polyfill_thread.h"
#include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
@ -24,7 +25,6 @@ class Device;
class Framebuffer;
class GraphicsPipeline;
class StateTracker;
class QueryCache;
/// The scheduler abstracts command buffer and fence management with an interface that's able to do
/// OpenGL-like operations on Vulkan command buffers.