video_core: Implement DMA. (#2819)

* Import memory

* 64K pages and fix memory mapping

* Queue coverage

* Buffer syncing, faulted readback and BDA in Buffer

* Base DMA implementation

* Preparations for implementing SPV DMA access

* Base impl (pending 16K pages and getbuffersize)

* 16K pages and stack overflow fix

* clang-format

* clang-format but for real this time

* Try to fix macOS build

* Correct decltype

* Add testing log

* Fix stride and patch phi node blocks

* No need to check if it is a deleted buffer

* Clang format once more

* Offset in bytes

* Removed host buffers (may do it in another PR)

Also some random barrier fixes

* Add IR dumping from my read-const branch

* clang-format

* Correct size instead of end

* Fix incorrect assert

* Possible fix for NieR deadlock

* Copy to avoid deadlock

* Use 2 mutexes instead of copy

* Attempt to range sync error

* Revert "Attempt to range sync error"

This reverts commit dd287b48682b50f215680bb0956e39c2809bf3fe.

* Fix size truncated when syncing range

And memory barrier

* Some fixes (and async testing (doesn't work))

* Use compute to parse fault buffer

* Process faults on submit

* Only sync the first time we see a readconst

This is partially wrong. We need to save the state into the submission context itself, not the rasterizer, since we can yield and process another submission (if I'm not mistaken).

* Use spec const and 32 bit atomic

* 32 bit counter

* Fix store_index

* Better sync (WIP, breaks PR now)

* Fixes for better sync

* Better sync

* Remove memory coverage logic

* Point sirit to upstream

* Less waiting and barriers

* Correctly checkout moltenvk

* Bring back applying pending operations in wait

* Sync the whole buffer instead of only the range

* Implement recursive shared/scoped locks

* Iterators

* Faster syncing with ranges

* Some alignment fixes

* Fixed clang-format

* Fix clang-format again

* Port page_manager from readbacks-poc

* clang-format

* Defer memory protect

* Remove RENDERER_TRACE

* Experiment: only sync on first readconst

* Added profiling (will be removed)

* Don't sync entire buffers

* Added logging for testing

* Updated temporary workaround to use 4K pages

* clang-format

* Cleanup part 1

* Make ReadConst a SPIR-V function

---------

Co-authored-by: georgemoralis <giorgosmrls@gmail.com>
Lander Gallastegi, 2025-05-22 20:00:15 +02:00 (committed by GitHub)
parent 37887e8fde
commit f9bbde9c79
40 changed files with 1641 additions and 311 deletions


@@ -7,6 +7,7 @@
#include "shader_recompiler/frontend/fetch_shader.h"
#include "shader_recompiler/runtime_info.h"
#include "video_core/amdgpu/types.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include <boost/container/static_vector.hpp>
#include <fmt/format.h>
@@ -70,6 +71,12 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
Bindings& binding_)
: Sirit::Module(profile_.supported_spirv), info{info_}, runtime_info{runtime_info_},
profile{profile_}, stage{info.stage}, l_stage{info.l_stage}, binding{binding_} {
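// DMA (physical pointer) access requires the PhysicalStorageBuffer64
// addressing model; shaders without DMA keep the Logical model.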
if (info.dma_types != IR::Type::Void) {
SetMemoryModel(spv::AddressingModel::PhysicalStorageBuffer64, spv::MemoryModel::GLSL450);
} else {
SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450);
}
AddCapability(spv::Capability::Shader);
DefineArithmeticTypes();
DefineInterfaces();
@@ -137,9 +144,13 @@ void EmitContext::DefineArithmeticTypes() {
true_value = ConstantTrue(U1[1]);
false_value = ConstantFalse(U1[1]);
u8_one_value = Constant(U8, 1U);
u8_zero_value = Constant(U8, 0U);
u32_one_value = ConstU32(1U);
u32_zero_value = ConstU32(0U);
f32_zero_value = ConstF32(0.0f);
u64_one_value = Constant(U64, 1ULL);
u64_zero_value = Constant(U64, 0ULL);
pi_x2 = ConstF32(2.0f * float{std::numbers::pi});
@@ -157,6 +168,35 @@
if (info.uses_fp64) {
frexp_result_f64 = Name(TypeStruct(F64[1], S32[1]), "frexp_result_f64");
}
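// Declare a PhysicalStorageBuffer pointer type only for each value type
// the shader actually accesses through DMA.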
if (True(info.dma_types & IR::Type::F64)) {
physical_pointer_types[PointerType::F64] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F64[1]);
}
if (True(info.dma_types & IR::Type::U64)) {
physical_pointer_types[PointerType::U64] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U64);
}
if (True(info.dma_types & IR::Type::F32)) {
physical_pointer_types[PointerType::F32] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F32[1]);
}
if (True(info.dma_types & IR::Type::U32)) {
physical_pointer_types[PointerType::U32] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]);
}
if (True(info.dma_types & IR::Type::F16)) {
physical_pointer_types[PointerType::F16] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F16[1]);
}
if (True(info.dma_types & IR::Type::U16)) {
physical_pointer_types[PointerType::U16] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U16);
}
if (True(info.dma_types & IR::Type::U8)) {
physical_pointer_types[PointerType::U8] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U8);
}
}
void EmitContext::DefineInterfaces() {
@@ -195,9 +235,10 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f
}
Id EmitContext::GetBufferSize(const u32 sharp_idx) {
-const auto& srt_flatbuf = buffers.back();
-ASSERT(srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
-const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
+// Can this be done with memory access? Like we do now with ReadConst
+const auto& srt_flatbuf = buffers[flatbuf_index];
+ASSERT(srt_flatbuf.buffer_type == BufferType::Flatbuf);
+const auto [id, pointer_type] = srt_flatbuf[PointerType::U32];
const auto rsrc1{
OpLoad(U32[1], OpAccessChain(pointer_type, id, u32_zero_value, ConstU32(sharp_idx + 1)))};
@@ -690,8 +731,14 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
case Shader::BufferType::GdsBuffer:
Name(id, "gds_buffer");
break;
-case Shader::BufferType::ReadConstUbo:
-Name(id, "srt_flatbuf_ubo");
+case Shader::BufferType::Flatbuf:
+Name(id, "srt_flatbuf");
break;
case Shader::BufferType::BdaPagetable:
Name(id, "bda_pagetable");
break;
case Shader::BufferType::FaultBuffer:
Name(id, "fault_buffer");
break;
case Shader::BufferType::SharedMemory:
Name(id, "ssbo_shmem");
@@ -705,35 +752,53 @@
};
void EmitContext::DefineBuffers() {
-if (!profile.supports_robust_buffer_access && !info.has_readconst) {
-// In case ReadConstUbo has not already been bound by IR and is needed
+if (!profile.supports_robust_buffer_access &&
+info.readconst_types == Info::ReadConstType::None) {
+// In case Flatbuf has not already been bound by IR and is needed
// to query buffer sizes, bind it now.
info.buffers.push_back({
.used_types = IR::Type::U32,
-.inline_cbuf = AmdGpu::Buffer::Null(),
-.buffer_type = BufferType::ReadConstUbo,
+// We can't guarantee that flatbuf will not grow past UBO
+// limit if there are a lot of ReadConsts. (We could specialize)
+.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
+.buffer_type = BufferType::Flatbuf,
});
// In the future we may want to read buffer sizes from GPU memory if available.
// info.readconst_types |= Info::ReadConstType::Immediate;
}
for (const auto& desc : info.buffers) {
const auto buf_sharp = desc.GetSharp(info);
const bool is_storage = desc.IsStorage(buf_sharp, profile);
// Set indexes for special buffers.
if (desc.buffer_type == BufferType::Flatbuf) {
flatbuf_index = buffers.size();
} else if (desc.buffer_type == BufferType::BdaPagetable) {
bda_pagetable_index = buffers.size();
} else if (desc.buffer_type == BufferType::FaultBuffer) {
fault_buffer_index = buffers.size();
}
// Define aliases depending on the shader usage.
auto& spv_buffer = buffers.emplace_back(binding.buffer++, desc.buffer_type);
if (True(desc.used_types & IR::Type::U64)) {
spv_buffer[PointerType::U64] =
DefineBuffer(is_storage, desc.is_written, 3, desc.buffer_type, U64);
}
if (True(desc.used_types & IR::Type::U32)) {
-spv_buffer[BufferAlias::U32] =
+spv_buffer[PointerType::U32] =
DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, U32[1]);
}
if (True(desc.used_types & IR::Type::F32)) {
-spv_buffer[BufferAlias::F32] =
+spv_buffer[PointerType::F32] =
DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, F32[1]);
}
if (True(desc.used_types & IR::Type::U16)) {
-spv_buffer[BufferAlias::U16] =
+spv_buffer[PointerType::U16] =
DefineBuffer(is_storage, desc.is_written, 1, desc.buffer_type, U16);
}
if (True(desc.used_types & IR::Type::U8)) {
-spv_buffer[BufferAlias::U8] =
+spv_buffer[PointerType::U8] =
DefineBuffer(is_storage, desc.is_written, 0, desc.buffer_type, U8);
}
++binding.unified;
@@ -1003,6 +1068,101 @@ Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_vie
return func;
}
Id EmitContext::DefineGetBdaPointer() {
const auto caching_pagebits{
Constant(U64, static_cast<u64>(VideoCore::BufferCache::CACHING_PAGEBITS))};
const auto caching_pagemask{Constant(U64, VideoCore::BufferCache::CACHING_PAGESIZE - 1)};
const auto func_type{TypeFunction(U64, U64)};
const auto func{OpFunction(U64, spv::FunctionControlMask::MaskNone, func_type)};
const auto address{OpFunctionParameter(U64)};
Name(func, "get_bda_pointer");
AddLabel();
const auto fault_label{OpLabel()};
const auto available_label{OpLabel()};
const auto merge_label{OpLabel()};
// Get page BDA
const auto page{OpShiftRightLogical(U64, address, caching_pagebits)};
const auto page32{OpUConvert(U32[1], page)};
const auto& bda_buffer{buffers[bda_pagetable_index]};
const auto [bda_buffer_id, bda_pointer_type] = bda_buffer[PointerType::U64];
const auto bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)};
const auto bda{OpLoad(U64, bda_ptr)};
// Check if page is GPU cached
const auto is_fault{OpIEqual(U1[1], bda, u64_zero_value)};
OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(is_fault, fault_label, available_label);
// First time access, mark as fault
AddLabel(fault_label);
const auto& fault_buffer{buffers[fault_buffer_index]};
const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
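// The fault buffer is a per-page bitmap: byte index = page / 8, bit index = page % 8.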
const auto page_div8{OpShiftRightLogical(U32[1], page32, ConstU32(3U))};
const auto page_mod8{OpBitwiseAnd(U32[1], page32, ConstU32(7U))};
const auto page_mask{OpShiftLeftLogical(U8, u8_one_value, page_mod8)};
const auto fault_ptr{
OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div8)};
const auto fault_value{OpLoad(U8, fault_ptr)};
const auto fault_value_masked{OpBitwiseOr(U8, fault_value, page_mask)};
OpStore(fault_ptr, fault_value_masked);
// Return null pointer
const auto fallback_result{u64_zero_value};
OpBranch(merge_label);
// Value is available, compute address
AddLabel(available_label);
const auto offset_in_bda{OpBitwiseAnd(U64, address, caching_pagemask)};
const auto addr{OpIAdd(U64, bda, offset_in_bda)};
OpBranch(merge_label);
// Merge
AddLabel(merge_label);
const auto result{OpPhi(U64, addr, available_label, fallback_result, fault_label)};
OpReturnValue(result);
OpFunctionEnd();
return func;
}
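The fault bits set above are consumed later; per the changelog, a compute pass parses the fault buffer and faults are processed on submit. As a rough host-side illustration only (not the PR's actual compute path), parsing such a bitmap could look like the sketch below; ProcessFaultBuffer, CreateAndMapPage, and kPageBits are hypothetical stand-ins.

#include <bit>
#include <cstdint>
#include <span>

// Hypothetical stand-in for the buffer cache routine that backs a page
// and fills in its BDA pagetable entry.
void CreateAndMapPage(uint64_t vaddr);

// Assumed page shift; the real value is VideoCore::BufferCache::CACHING_PAGEBITS.
constexpr uint64_t kPageBits = 12;

void ProcessFaultBuffer(std::span<const uint8_t> fault_bitmap) {
    for (size_t byte = 0; byte < fault_bitmap.size(); ++byte) {
        uint8_t bits = fault_bitmap[byte];
        while (bits != 0) {
            const int bit = std::countr_zero(bits); // lowest faulted page in this byte
            bits &= bits - 1;                       // clear that bit
            const uint64_t page = byte * 8 + bit;
            CreateAndMapPage(page << kPageBits); // make the page's BDA entry non-null
        }
    }
}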
Id EmitContext::DefineReadConst(bool dynamic) {
const auto func_type{!dynamic ? TypeFunction(U32[1], U32[2], U32[1], U32[1])
: TypeFunction(U32[1], U32[2], U32[1])};
const auto func{OpFunction(U32[1], spv::FunctionControlMask::MaskNone, func_type)};
const auto base{OpFunctionParameter(U32[2])};
const auto offset{OpFunctionParameter(U32[1])};
const auto flatbuf_offset{!dynamic ? OpFunctionParameter(U32[1]) : Id{}};
Name(func, dynamic ? "read_const_dynamic" : "read_const");
AddLabel();
const auto base_lo{OpUConvert(U64, OpCompositeExtract(U32[1], base, 0))};
const auto base_hi{OpUConvert(U64, OpCompositeExtract(U32[1], base, 1))};
const auto base_shift{OpShiftLeftLogical(U64, base_hi, ConstU32(32U))};
const auto base_addr{OpBitwiseOr(U64, base_lo, base_shift)};
const auto offset_bytes{OpShiftLeftLogical(U32[1], offset, ConstU32(2U))};
const auto addr{OpIAdd(U64, base_addr, OpUConvert(U64, offset_bytes))};
const auto result = EmitMemoryRead(U32[1], addr, [&]() {
if (dynamic) {
return u32_zero_value;
} else {
const auto& flatbuf_buffer{buffers[flatbuf_index]};
ASSERT(flatbuf_buffer.binding >= 0 &&
flatbuf_buffer.buffer_type == BufferType::Flatbuf);
const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value,
flatbuf_offset)};
return OpLoad(U32[1], ptr);
}
});
OpReturnValue(result);
OpFunctionEnd();
return func;
}
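For intuition, here is a hedged C++ analogue of the emitted immediate read_const (not the PR's code; it assumes EmitMemoryRead dereferences the BDA pointer when one is available and otherwise evaluates the supplied fallback):

#include <cstdint>

uint64_t get_bda_pointer(uint64_t addr); // as defined above; returns 0 on fault

uint32_t read_const(uint32_t base_lo, uint32_t base_hi, uint32_t dword_offset,
                    uint32_t flatbuf_offset, const uint32_t* flatbuf) {
    const uint64_t base = (uint64_t(base_hi) << 32) | base_lo;
    const uint64_t addr = base + (uint64_t(dword_offset) << 2); // offset is in dwords
    if (const uint64_t ptr = get_bda_pointer(addr)) {
        return *reinterpret_cast<const uint32_t*>(ptr); // direct read via the BDA mapping
    }
    return flatbuf[flatbuf_offset]; // fallback: value recorded in the srt_flatbuf copy
}

The dynamic variant takes no flatbuf_offset and simply returns zero on the fault path.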
void EmitContext::DefineFunctions() {
if (info.uses_pack_10_11_11) {
f32_to_uf11 = DefineFloat32ToUfloatM5(6, "f32_to_uf11");
@@ -1012,6 +1172,18 @@ void EmitContext::DefineFunctions() {
uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32");
uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32");
}
if (info.dma_types != IR::Type::Void) {
get_bda_pointer = DefineGetBdaPointer();
}
if (True(info.readconst_types & Info::ReadConstType::Immediate)) {
LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses immediate ReadConst", info.pgm_hash);
read_const = DefineReadConst(false);
}
if (True(info.readconst_types & Info::ReadConstType::Dynamic)) {
LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses dynamic ReadConst", info.pgm_hash);
read_const_dynamic = DefineReadConst(true);
}
}
} // namespace Shader::Backend::SPIRV