video_core: Crucial buffer cache fixes + proper GPU clears (#414)

* translator: Use templates for stronger type guarantees

* spirv: Define buffer offsets upfront

* Saves a lot of shader instructions

* buffer_cache: Use dynamic vertex input when available

* Fixes issues when games like Dark Souls rebind vertex buffers with a different stride

* externals: Update boost

* spirv: Use runtime array for ssbos

* ssbos can be large and typically their size will vary, especially in generic copy/clear cs shaders

* fs: Lock when doing case insensitive search

* Dark Souls does fs lookups from different threads

* texture_cache: More precise invalidation from compute

* Fixes unrelated render targets being cleared

* texture_cache: Use hashes to protect GPU-modified images from reupload

* translator: Treat V_CNDMASK as float

* Sometimes it can have input modifiers. At worst this will cause some extra calls to uintBitsToFloat and its inverse, but most often this is used as a float anyway

* translator: Small optimization for V_SAD_U32

* Fix review

* clang format
This commit is contained in:
TheTurtle 2024-08-13 09:21:48 +03:00 committed by GitHub
parent dfcfd62d4f
commit 1fb0da9b89
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 372 additions and 346 deletions

View file

@ -87,6 +87,15 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
}
bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
boost::container::small_vector<vk::VertexInputAttributeDescription2EXT, 16> attributes;
boost::container::small_vector<vk::VertexInputBindingDescription2EXT, 16> bindings;
SCOPE_EXIT {
if (instance.IsVertexInputDynamicState()) {
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.setVertexInputEXT(bindings, attributes);
}
};
if (vs_info.vs_inputs.empty()) {
return false;
}
@ -122,6 +131,21 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
}
guest_buffers.emplace_back(buffer);
ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize());
attributes.push_back({
.location = input.binding,
.binding = input.binding,
.format =
Vulkan::LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()),
.offset = 0,
});
bindings.push_back({
.binding = input.binding,
.stride = buffer.GetStride(),
.inputRate = input.instance_step_rate == Shader::Info::VsInput::None
? vk::VertexInputRate::eVertex
: vk::VertexInputRate::eInstance,
.divisor = 1,
});
}
std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) {
@ -224,6 +248,19 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
return {&buffer, buffer.Offset(device_addr)};
}
/// Returns a buffer, together with an offset into it, from which the guest range
/// [gpu_addr, gpu_addr + size) can be read. No new cache entry is registered here.
std::pair<const Buffer*, u32> BufferCache::ObtainTempBuffer(VAddr gpu_addr, u32 size) {
    // Prefer the buffer already registered for the page containing gpu_addr, provided
    // IsInBounds accepts the requested range.
    if (const BufferId cached_id = page_table[gpu_addr >> CACHING_PAGEBITS]; cached_id) {
        const Buffer& cached = slot_buffers[cached_id];
        if (cached.IsInBounds(gpu_addr, size)) {
            return {&cached, cached.Offset(gpu_addr)};
        }
    }
    // Otherwise go through the staging buffer.
    // NOTE(review): the trailing 16 is presumably the required alignment — confirm against
    // the declaration of staging_buffer.Copy.
    const u32 staging_offset = staging_buffer.Copy(gpu_addr, size, 16);
    return {&staging_buffer, staging_offset};
}
bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
@ -248,6 +285,10 @@ bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
return memory_tracker.IsRegionCpuModified(addr, size);
}
/// Forwards the query for [addr, addr + size) to the memory tracker and returns its answer.
bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) {
    const bool gpu_modified = memory_tracker.IsRegionGpuModified(addr, size);
    return gpu_modified;
}
BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
if (device_addr == 0) {
return NULL_BUFFER_ID;

View file

@ -69,12 +69,18 @@ public:
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written);
/// Obtains a temporary buffer for usage in texture cache.
[[nodiscard]] std::pair<const Buffer*, u32> ObtainTempBuffer(VAddr gpu_addr, u32 size);
/// Return true when a region is registered on the cache
[[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
/// Return true when a CPU region is modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
private:
template <typename Func>
void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {

View file

@ -47,7 +47,7 @@ public:
Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute,
VAddr cpu_address, bool is_eop) {
const auto info = VideoCore::ImageInfo{attribute, cpu_address};
const auto image_id = texture_cache.FindImage(info, false);
const auto image_id = texture_cache.FindImage(info);
auto& image = texture_cache.GetImage(image_id);
return PrepareFrameInternal(image, is_eop);
}
@ -61,7 +61,7 @@ public:
const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address) {
vo_buffers_addr.emplace_back(cpu_address);
const auto info = VideoCore::ImageInfo{attribute, cpu_address};
const auto image_id = texture_cache.FindImage(info, false);
const auto image_id = texture_cache.FindImage(info);
return texture_cache.GetImage(image_id);
}

View file

@ -96,7 +96,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
Shader::PushData push_data{};
u32 binding{};
for (u32 i = 0; const auto& buffer : info.buffers) {
for (const auto& buffer : info.buffers) {
const auto vsharp = buffer.GetVsharp(info);
const VAddr address = vsharp.base_address;
// Most of the time when a metadata is updated with a shader it gets cleared. It means we
@ -115,7 +115,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
}
const u32 size = vsharp.GetSize();
if (buffer.is_written) {
texture_cache.InvalidateMemory(address, size);
texture_cache.InvalidateMemory(address, size, true);
}
const u32 alignment =
buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment();
@ -137,7 +137,6 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
: vk::DescriptorType::eUniformBuffer,
.pBufferInfo = &buffer_infos.back(),
});
i++;
}
for (const auto& image_desc : info.images) {

View file

@ -145,6 +145,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
dynamic_states.push_back(vk::DynamicState::eColorWriteEnableEXT);
dynamic_states.push_back(vk::DynamicState::eColorWriteMaskEXT);
}
if (instance.IsVertexInputDynamicState()) {
dynamic_states.push_back(vk::DynamicState::eVertexInputEXT);
}
const vk::PipelineDynamicStateCreateInfo dynamic_info = {
.dynamicStateCount = static_cast<u32>(dynamic_states.size()),

View file

@ -202,6 +202,8 @@ bool Instance::CreateDevice() {
add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME);
workgroup_memory_explicit_layout =
add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME);
vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME);
// The next two extensions are required to be available together in order to support write masks
color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME);
color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME);
@ -319,6 +321,9 @@ bool Instance::CreateDevice() {
vk::PhysicalDeviceSynchronization2Features{
.synchronization2 = true,
},
vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT{
.vertexInputDynamicState = true,
},
};
if (!color_write_en) {
@ -331,8 +336,8 @@ bool Instance::CreateDevice() {
} else {
device_chain.unlink<vk::PhysicalDeviceRobustness2FeaturesEXT>();
}
if (!has_sync2) {
device_chain.unlink<vk::PhysicalDeviceSynchronization2Features>();
if (!vertex_input_dynamic_state) {
device_chain.unlink<vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT>();
}
try {

View file

@ -132,6 +132,11 @@ public:
return color_write_en;
}
/// Reports whether VK_EXT_vertex_input_dynamic_state is supported, as recorded in the
/// vertex_input_dynamic_state flag.
bool IsVertexInputDynamicState() const {
    const bool supported = vertex_input_dynamic_state;
    return supported;
}
/// Returns the vendor ID of the physical device
u32 GetVendorID() const {
return properties.vendorID;
@ -257,6 +262,7 @@ private:
bool external_memory_host{};
bool workgroup_memory_explicit_layout{};
bool color_write_en{};
bool vertex_input_dynamic_state{};
u64 min_imported_host_pointer_alignment{};
u32 subgroup_size{};
bool tooling_info{};

View file

@ -209,6 +209,10 @@ void PipelineCache::RefreshGraphicsKey() {
continue;
}
const auto* bininfo = Liverpool::GetBinaryInfo(*pgm);
if (!bininfo->Valid()) {
key.stage_hashes[i] = 0;
continue;
}
key.stage_hashes[i] = bininfo->shader_hash;
}
}

View file

@ -117,6 +117,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
: instance{&instance_}, scheduler{&scheduler_}, info{info_},
image{instance->GetDevice(), instance->GetAllocator()}, cpu_addr{info.guest_address},
cpu_addr_end{cpu_addr + info.guest_size_bytes} {
mip_hashes.resize(info.resources.levels);
ASSERT(info.pixel_format != vk::Format::eUndefined);
// Here we force `eExtendedUsage` as we don't know all image usage cases beforehand. In the
// normal case the texture cache should re-create the resource with the usage requested

View file

@ -111,6 +111,7 @@ struct Image {
vk::Flags<vk::PipelineStageFlagBits> pl_stage = vk::PipelineStageFlagBits::eAllCommands;
vk::Flags<vk::AccessFlagBits> access_mask = vk::AccessFlagBits::eNone;
vk::ImageLayout layout = vk::ImageLayout::eUndefined;
boost::container::small_vector<u64, 14> mip_hashes;
};
} // namespace VideoCore

View file

@ -3,6 +3,7 @@
#include <xxhash.h>
#include "common/assert.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@ -11,13 +12,11 @@
namespace VideoCore {
static constexpr u64 StreamBufferSize = 512_MB;
static constexpr u64 PageShift = 12;
TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
BufferCache& buffer_cache_, PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_},
staging{instance, scheduler, MemoryUsage::Upload, StreamBufferSize},
tile_manager{instance, scheduler} {
ImageInfo info;
info.pixel_format = vk::Format::eR8G8B8A8Unorm;
@ -31,9 +30,12 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler&
TextureCache::~TextureCache() = default;
void TextureCache::InvalidateMemory(VAddr address, size_t size) {
void TextureCache::InvalidateMemory(VAddr address, size_t size, bool from_compute) {
std::unique_lock lock{mutex};
ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) {
if (from_compute && !image.Overlaps(address, size)) {
return;
}
// Ensure image is reuploaded when accessed again.
image.flags |= ImageFlagBits::CpuModified;
// Untrack image, so the range is unprotected and the guest can write freely.
@ -57,7 +59,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) {
}
}
ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) {
ImageId TextureCache::FindImage(const ImageInfo& info) {
if (info.guest_address == 0) [[unlikely]] {
return NULL_IMAGE_VIEW_ID;
}
@ -87,12 +89,6 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) {
image_id = image_ids[image_ids.size() > 1 ? 1 : 0];
}
Image& image = slot_images[image_id];
if (True(image.flags & ImageFlagBits::CpuModified) && refresh_on_create) {
RefreshImage(image);
TrackImage(image, image_id);
}
return image_id;
}
@ -119,6 +115,7 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
const ImageId image_id = FindImage(info);
UpdateImage(image_id);
Image& image = slot_images[image_id];
auto& usage = image.info.usage;
@ -165,7 +162,8 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info,
const ImageViewInfo& view_info) {
const ImageId image_id = FindImage(image_info);
Image& image = slot_images[image_id];
image.flags &= ~ImageFlagBits::CpuModified;
image.flags |= ImageFlagBits::GpuModified;
UpdateImage(image_id);
image.Transit(vk::ImageLayout::eColorAttachmentOptimal,
vk::AccessFlagBits::eColorAttachmentWrite |
@ -198,8 +196,9 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info,
ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info,
const ImageViewInfo& view_info) {
const ImageId image_id = FindImage(image_info, false);
const ImageId image_id = FindImage(image_info);
Image& image = slot_images[image_id];
image.flags |= ImageFlagBits::GpuModified;
image.flags &= ~ImageFlagBits::CpuModified;
const auto new_layout = view_info.is_storage ? vk::ImageLayout::eDepthStencilAttachmentOptimal
@ -228,22 +227,6 @@ void TextureCache::RefreshImage(Image& image) {
// Mark image as validated.
image.flags &= ~ImageFlagBits::CpuModified;
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite);
vk::Buffer buffer{staging.Handle()};
u32 offset{0};
auto upload_buffer = tile_manager.TryDetile(image);
if (upload_buffer) {
buffer = *upload_buffer;
} else {
// Upload data to the staging buffer.
offset = staging.Copy(image.info.guest_address, image.info.guest_size_bytes, 16);
}
const auto& num_layers = image.info.resources.layers;
const auto& num_mips = image.info.resources.levels;
ASSERT(num_mips == image.info.mips_layout.size());
@ -254,12 +237,23 @@ void TextureCache::RefreshImage(Image& image) {
const u32 height = std::max(image.info.size.height >> m, 1u);
const u32 depth =
image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
const auto& [_, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
// Protect GPU modified resources from accidental reuploads.
if (True(image.flags & ImageFlagBits::GpuModified) &&
!buffer_cache.IsRegionGpuModified(image.info.guest_address + mip_ofs, mip_size)) {
const u8* addr = std::bit_cast<u8*>(image.info.guest_address);
const u64 hash = XXH3_64bits(addr + mip_ofs, mip_size);
if (image.mip_hashes[m] == hash) {
continue;
}
image.mip_hashes[m] = hash;
}
image_copy.push_back({
.bufferOffset = offset + mip_ofs * num_layers,
.bufferRowLength = static_cast<uint32_t>(mip_pitch),
.bufferImageHeight = static_cast<uint32_t>(mip_height),
.bufferOffset = mip_ofs * num_layers,
.bufferRowLength = static_cast<u32>(mip_pitch),
.bufferImageHeight = static_cast<u32>(mip_height),
.imageSubresource{
.aspectMask = vk::ImageAspectFlagBits::eColor,
.mipLevel = m,
@ -271,6 +265,30 @@ void TextureCache::RefreshImage(Image& image) {
});
}
if (image_copy.empty()) {
return;
}
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite, cmdbuf);
const VAddr image_addr = image.info.guest_address;
const size_t image_size = image.info.guest_size_bytes;
vk::Buffer buffer{};
u32 offset{};
if (auto upload_buffer = tile_manager.TryDetile(image); upload_buffer) {
buffer = *upload_buffer;
} else {
const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size);
buffer = vk_buffer->Handle();
offset = buf_offset;
}
for (auto& copy : image_copy) {
copy.bufferOffset += offset;
}
cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy);
}

View file

@ -38,13 +38,13 @@ public:
~TextureCache();
/// Invalidates any image in the logical page range.
void InvalidateMemory(VAddr address, size_t size);
void InvalidateMemory(VAddr address, size_t size, bool from_compute = false);
/// Evicts any images that overlap the unmapped range.
void UnmapMemory(VAddr cpu_addr, size_t size);
/// Retrieves the image handle of the image with the provided attributes.
[[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true);
[[nodiscard]] ImageId FindImage(const ImageInfo& info);
/// Retrieves an image view with the properties of the specified image descriptor.
[[nodiscard]] ImageView& FindTexture(const ImageInfo& image_info,
@ -58,6 +58,16 @@ public:
[[nodiscard]] ImageView& FindDepthTarget(const ImageInfo& image_info,
const ImageViewInfo& view_info);
/// Re-uploads the image identified by image_id when it carries the CpuModified flag;
/// does nothing otherwise.
void UpdateImage(ImageId image_id) {
    Image& target = slot_images[image_id];
    const bool is_up_to_date = False(target.flags & ImageFlagBits::CpuModified);
    if (!is_up_to_date) {
        // Refresh the contents first, then track the image again.
        RefreshImage(target);
        TrackImage(target, image_id);
    }
}
/// Reuploads image contents.
void RefreshImage(Image& image);
@ -170,7 +180,6 @@ private:
Vulkan::Scheduler& scheduler;
BufferCache& buffer_cache;
PageManager& tracker;
StreamBuffer staging;
TileManager tile_manager;
Common::SlotVector<Image> slot_images;
Common::SlotVector<ImageView> slot_image_views;

View file

@ -5,7 +5,6 @@
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/texture_cache/image_view.h"
#include "video_core/texture_cache/texture_cache.h"
#include "video_core/texture_cache/tile_manager.h"
#include "video_core/host_shaders/detile_m32x1_comp.h"