From a49b13fe66603c387e01fa6eb9cbd85b6193b99c Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 26 Jun 2025 12:14:36 +0300 Subject: [PATCH] shader_recompiler: Optimize general case of buffer addressing (#3159) * shader_recompiler: Simplify dma types Only U32 is needed for S_LOAD_DWORD * shader_recompiler: Perform address shift at IR level Buffer instructions now expect the address in the data unit they work on. Doing the shift at the IR level allows us to optimize some operations away in the common case * shader_recompiler: Optimize common buffer access pattern * emit_spirv: Use 32-bit integer ops for fault buffer Not many GPUs have 8-bit bitwise OR operations, so emulating them would likely add driver overhead * resource_tracking_pass: Fix texel buffer shift --- .../backend/spirv/emit_spirv.cpp | 2 +- .../backend/spirv/emit_spirv_atomic.cpp | 52 +++--- .../spirv/emit_spirv_context_get_set.cpp | 111 ++++++------- .../backend/spirv/spirv_emit_context.cpp | 155 +++++++++--------- .../backend/spirv/spirv_emit_context.h | 90 ++++------ .../frontend/translate/scalar_alu.cpp | 1 - src/shader_recompiler/info.h | 2 +- .../ir/passes/resource_tracking_pass.cpp | 64 +++++++- .../ir/passes/shader_info_collection_pass.cpp | 15 +- src/shader_recompiler/profile.h | 2 +- .../renderer_vulkan/vk_pipeline_cache.cpp | 1 + .../renderer_vulkan/vk_rasterizer.cpp | 9 +- 12 files changed, 271 insertions(+), 233 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 02f290140..b5b18eed1 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -300,7 +300,7 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct if (stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) { ctx.AddCapability(spv::Capability::Tessellation); } - if (info.dma_types != IR::Type::Void) { + if (info.uses_dma) { ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses); ctx.AddExtension("SPV_KHR_physical_storage_buffer"); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index 97e455ff8..3c833b87d 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -7,7 +7,11 @@ #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { + namespace { +using PointerType = EmitContext::PointerType; +using PointerSize = EmitContext::PointerSize; + std::pair AtomicArgs(EmitContext& ctx) { const Id scope{ctx.ConstU32(static_cast(spv::Scope::Device))}; const Id semantics{ctx.u32_zero_value}; @@ -61,14 +65,13 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id return ctx.U32[1]; } }(); - if (Sirit::ValidId(buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); - const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; - const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [id, pointer_type] = buffer.Alias(PointerType::U32); + const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address); const 
auto [scope, semantics]{AtomicArgs(ctx)}; - return AccessBoundsCheck<32, 1, is_float>(ctx, index, buffer.size_dwords, [&] { + return AccessBoundsCheck<32, 1, is_float>(ctx, address, buffer.Size(PointerSize::B32), [&] { return (ctx.*atomic_func)(type, ptr, scope, semantics, value); }); } @@ -76,14 +79,13 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id Id BufferAtomicU32IncDec(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { const auto& buffer = ctx.buffers[handle]; - if (Sirit::ValidId(buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); - const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; - const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [id, pointer_type] = buffer.Alias(PointerType::U32); + const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address); const auto [scope, semantics]{AtomicArgs(ctx)}; - return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { + return AccessBoundsCheck<32>(ctx, address, buffer.Size(PointerSize::B32), [&] { return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics); }); } @@ -92,14 +94,13 @@ Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre Id cmp_value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id, Id, Id)) { const auto& buffer = ctx.buffers[handle]; - if (Sirit::ValidId(buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); - const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; - const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [id, pointer_type] = buffer.Alias(PointerType::U32); + const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address); const auto [scope, semantics]{AtomicArgs(ctx)}; - return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { + return AccessBoundsCheck<32>(ctx, address, buffer.Size(PointerSize::B32), [&] { return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, semantics, value, cmp_value); }); } @@ -107,14 +108,13 @@ Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre Id BufferAtomicU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& buffer = ctx.buffers[handle]; - if (Sirit::ValidId(buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + if (const Id offset = buffer.Offset(PointerSize::B64); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); - const auto [id, pointer_type] = buffer[EmitContext::PointerType::U64]; - const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [id, pointer_type] = buffer.Alias(PointerType::U64); + const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address); const auto [scope, semantics]{AtomicArgs(ctx)}; 
- return AccessBoundsCheck<64>(ctx, index, buffer.size_qwords, [&] { + return AccessBoundsCheck<64>(ctx, address, buffer.Size(PointerSize::B64), [&] { return (ctx.*atomic_func)(ctx.U64, ptr, scope, semantics, value); }); } @@ -360,7 +360,7 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) { const auto& buffer = ctx.buffers[binding]; - const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; + const auto [id, pointer_type] = buffer.Alias(PointerType::U32); const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr)); const auto [scope, semantics]{AtomicArgs(ctx)}; return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics); @@ -368,7 +368,7 @@ Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) { Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) { const auto& buffer = ctx.buffers[binding]; - const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; + const auto [id, pointer_type] = buffer.Alias(PointerType::U32); const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr)); const auto [scope, semantics]{AtomicArgs(ctx)}; return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index ccbe54d0a..564fb3f80 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -3,6 +3,7 @@ #include "common/assert.h" #include "common/logging/log.h" +#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" #include "shader_recompiler/ir/attribute.h" @@ -11,8 +12,6 @@ #include -#include "emit_spirv_bounds.h" - namespace Shader::Backend::SPIRV { namespace { @@ -164,6 +163,7 @@ void EmitGetGotoVariable(EmitContext&) { } using PointerType = EmitContext::PointerType; +using PointerSize = EmitContext::PointerSize; Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { const u32 flatbuf_off_dw = inst->Flags(); @@ -179,14 +179,15 @@ Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { template Id ReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { const auto& buffer = ctx.buffers[handle]; - index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords); - const auto [id, pointer_type] = buffer[type]; + if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { + index = ctx.OpIAdd(ctx.U32[1], index, offset); + } + const auto [id, pointer_type] = buffer.Alias(type); const auto value_type = type == PointerType::U32 ? 
ctx.U32[1] : ctx.F32[1]; const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; const Id result{ctx.OpLoad(value_type, ptr)}; - - if (Sirit::ValidId(buffer.size_dwords)) { - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer.size_dwords); + if (const Id size = buffer.Size(PointerSize::B32); Sirit::ValidId(size)) { + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, size); return ctx.OpSelect(value_type, in_bounds, result, ctx.u32_zero_value); } return result; @@ -419,25 +420,24 @@ void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value) { template static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + constexpr bool is_float = alias == PointerType::F32; const auto flags = inst->Flags(); const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32; - const auto [id, pointer_type] = spv_buffer[alias]; + const auto [id, pointer_type] = spv_buffer.Alias(alias); boost::container::static_vector ids; for (u32 i = 0; i < N; i++) { - const Id index_i = i == 0 ? index : ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); + const Id index_i = i == 0 ? address : ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i)); const Id ptr_i = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i); const Id result_i = ctx.OpLoad(data_types[1], ptr_i); if (!flags.typed) { // Untyped loads have bounds checking per-component. - ids.push_back(LoadAccessBoundsCheck < 32, 1, - alias == - PointerType::F32 > (ctx, index_i, spv_buffer.size_dwords, result_i)); + ids.push_back(LoadAccessBoundsCheck<32, 1, is_float>( + ctx, index_i, spv_buffer.Size(PointerSize::B32), result_i)); } else { ids.push_back(result_i); } @@ -446,33 +446,32 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a const Id result = N == 1 ? ids[0] : ctx.OpCompositeConstruct(data_types[N], ids); if (flags.typed) { // Typed loads have single bounds check for the whole load. 
- return LoadAccessBoundsCheck < 32, N, - alias == PointerType::F32 > (ctx, index, spv_buffer.size_dwords, result); + return LoadAccessBoundsCheck<32, N, is_float>(ctx, address, + spv_buffer.Size(PointerSize::B32), result); } return result; } Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B8); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const auto [id, pointer_type] = spv_buffer[PointerType::U8]; + const auto [id, pointer_type] = spv_buffer.Alias(PointerType::U8); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; const Id result{ctx.OpLoad(ctx.U8, ptr)}; - return LoadAccessBoundsCheck<8>(ctx, address, spv_buffer.size, result); + return LoadAccessBoundsCheck<8>(ctx, address, spv_buffer.Size(PointerSize::B8), result); } Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B16); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const auto [id, pointer_type] = spv_buffer[PointerType::U16]; - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); - const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; + const auto [id, pointer_type] = spv_buffer.Alias(PointerType::U16); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; const Id result{ctx.OpLoad(ctx.U16, ptr)}; - return LoadAccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, result); + return LoadAccessBoundsCheck<16>(ctx, address, spv_buffer.Size(PointerSize::B16), result); } Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -493,14 +492,13 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B64); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const auto [id, pointer_type] = spv_buffer[PointerType::U64]; - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); - const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; + const auto [id, pointer_type] = spv_buffer.Alias(PointerType::U64); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, address)}; const Id result{ctx.OpLoad(ctx.U64, ptr)}; - return LoadAccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, result); + return LoadAccessBoundsCheck<64>(ctx, address, spv_buffer.Size(PointerSize::B64), result); } Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -526,18 +524,18 @@ Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addr template static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + constexpr bool is_float = alias == PointerType::F32; const auto flags = inst->Flags(); 
const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32; - const auto [id, pointer_type] = spv_buffer[alias]; + const auto [id, pointer_type] = spv_buffer.Alias(alias); auto store = [&] { for (u32 i = 0; i < N; i++) { - const Id index_i = i == 0 ? index : ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); + const Id index_i = i == 0 ? address : ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i)); const Id ptr_i = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i); const Id value_i = N == 1 ? value : ctx.OpCompositeExtract(data_types[1], value, i); auto store_i = [&] { @@ -546,8 +544,8 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I }; if (!flags.typed) { // Untyped stores have bounds checking per-component. - AccessBoundsCheck<32, 1, alias == PointerType::F32>( - ctx, index_i, spv_buffer.size_dwords, store_i); + AccessBoundsCheck<32, 1, is_float>(ctx, index_i, spv_buffer.Size(PointerSize::B32), + store_i); } else { store_i(); } @@ -557,8 +555,7 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I if (flags.typed) { // Typed stores have single bounds check for the whole store. - AccessBoundsCheck<32, N, alias == PointerType::F32>(ctx, index, spv_buffer.size_dwords, - store); + AccessBoundsCheck<32, N, is_float>(ctx, address, spv_buffer.Size(PointerSize::B32), store); } else { store(); } @@ -566,12 +563,12 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B8); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const auto [id, pointer_type] = spv_buffer[PointerType::U8]; + const auto [id, pointer_type] = spv_buffer.Alias(PointerType::U8); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; - AccessBoundsCheck<8>(ctx, address, spv_buffer.size, [&] { + AccessBoundsCheck<8>(ctx, address, spv_buffer.Size(PointerSize::B8), [&] { ctx.OpStore(ptr, value); return Id{}; }); @@ -579,13 +576,12 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B16); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const auto [id, pointer_type] = spv_buffer[PointerType::U16]; - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); - const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - AccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, [&] { + const auto [id, pointer_type] = spv_buffer.Alias(PointerType::U16); + const Id ptr{ctx.OpAccessChain(pointer_type, id, 
ctx.u32_zero_value, address)}; + AccessBoundsCheck<16>(ctx, address, spv_buffer.Size(PointerSize::B16), [&] { ctx.OpStore(ptr, value); return Id{}; }); @@ -609,13 +605,12 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre void EmitStoreBufferU64(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { const auto& spv_buffer = ctx.buffers[handle]; - if (Sirit::ValidId(spv_buffer.offset)) { - address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + if (const Id offset = spv_buffer.Offset(PointerSize::B64); Sirit::ValidId(offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, offset); } - const auto [id, pointer_type] = spv_buffer[PointerType::U64]; - const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); - const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; - AccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, [&] { + const auto [id, pointer_type] = spv_buffer.Alias(PointerType::U64); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, address)}; + AccessBoundsCheck<64>(ctx, address, spv_buffer.Size(PointerSize::B64), [&] { ctx.OpStore(ptr, value); return Id{}; }); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 567c059ae..524914ad4 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -71,7 +71,7 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf Bindings& binding_) : Sirit::Module(profile_.supported_spirv), info{info_}, runtime_info{runtime_info_}, profile{profile_}, stage{info.stage}, l_stage{info.l_stage}, binding{binding_} { - if (info.dma_types != IR::Type::Void) { + if (info.uses_dma) { SetMemoryModel(spv::AddressingModel::PhysicalStorageBuffer64, spv::MemoryModel::GLSL450); } else { SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450); @@ -169,34 +169,8 @@ void EmitContext::DefineArithmeticTypes() { if (info.uses_fp64) { frexp_result_f64 = Name(TypeStruct(F64[1], S32[1]), "frexp_result_f64"); } - - if (True(info.dma_types & IR::Type::F64)) { - physical_pointer_types[PointerType::F64] = - TypePointer(spv::StorageClass::PhysicalStorageBuffer, F64[1]); - } - if (True(info.dma_types & IR::Type::U64)) { - physical_pointer_types[PointerType::U64] = - TypePointer(spv::StorageClass::PhysicalStorageBuffer, U64); - } - if (True(info.dma_types & IR::Type::F32)) { - physical_pointer_types[PointerType::F32] = - TypePointer(spv::StorageClass::PhysicalStorageBuffer, F32[1]); - } - if (True(info.dma_types & IR::Type::U32)) { - physical_pointer_types[PointerType::U32] = - TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]); - } - if (True(info.dma_types & IR::Type::F16)) { - physical_pointer_types[PointerType::F16] = - TypePointer(spv::StorageClass::PhysicalStorageBuffer, F16[1]); - } - if (True(info.dma_types & IR::Type::U16)) { - physical_pointer_types[PointerType::U16] = - TypePointer(spv::StorageClass::PhysicalStorageBuffer, U16); - } - if (True(info.dma_types & IR::Type::U8)) { - physical_pointer_types[PointerType::U8] = - TypePointer(spv::StorageClass::PhysicalStorageBuffer, U8); + if (info.uses_dma) { + physical_pointer_type_u32 = TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]); } } @@ -239,7 +213,7 @@ Id EmitContext::GetBufferSize(const u32 sharp_idx) { // Can this be done with memory access? 
Like we do now with ReadConst const auto& srt_flatbuf = buffers[flatbuf_index]; ASSERT(srt_flatbuf.buffer_type == BufferType::Flatbuf); - const auto [id, pointer_type] = srt_flatbuf[PointerType::U32]; + const auto [id, pointer_type] = srt_flatbuf.Alias(PointerType::U32); const auto rsrc1{ OpLoad(U32[1], OpAccessChain(pointer_type, id, u32_zero_value, ConstU32(sharp_idx + 1)))}; @@ -255,39 +229,70 @@ Id EmitContext::GetBufferSize(const u32 sharp_idx) { } void EmitContext::DefineBufferProperties() { + if (!profile.needs_buffer_offsets && profile.supports_robust_buffer_access) { + return; + } for (u32 i = 0; i < buffers.size(); i++) { - BufferDefinition& buffer = buffers[i]; + auto& buffer = buffers[i]; + const auto& desc = info.buffers[i]; + const u32 binding = buffer.binding; if (buffer.buffer_type != BufferType::Guest) { continue; } - const u32 binding = buffer.binding; - const u32 half = PushData::BufOffsetIndex + (binding >> 4); - const u32 comp = (binding & 0xf) >> 2; - const u32 offset = (binding & 0x3) << 3; - const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), - push_data_block, ConstU32(half), ConstU32(comp))}; - const Id value{OpLoad(U32[1], ptr)}; - buffer.offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); - Name(buffer.offset, fmt::format("buf{}_off", binding)); - buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U)); - Name(buffer.offset_dwords, fmt::format("buf{}_dword_off", binding)); - // Only need to load size if performing bounds checks and the buffer is both guest and not - // inline. - if (!profile.supports_robust_buffer_access && buffer.buffer_type == BufferType::Guest) { - const BufferResource& desc = info.buffers[i]; - if (desc.sharp_idx == std::numeric_limits::max()) { - buffer.size = ConstU32(desc.inline_cbuf.GetSize()); - } else { - buffer.size = GetBufferSize(desc.sharp_idx); + // Only load and apply buffer offsets if host GPU alignment is larger than guest. + if (profile.needs_buffer_offsets) { + const u32 half = PushData::BufOffsetIndex + (binding >> 4); + const u32 comp = (binding & 0xf) >> 2; + const u32 offset = (binding & 0x3) << 3; + const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), + push_data_block, ConstU32(half), ConstU32(comp))}; + const Id value{OpLoad(U32[1], ptr)}; + + const Id buf_offset{OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U))}; + Name(buf_offset, fmt::format("buf{}_off", binding)); + buffer.Offset(PointerSize::B8) = buf_offset; + + if (True(desc.used_types & IR::Type::U16)) { + const Id buf_word_offset{OpShiftRightLogical(U32[1], buf_offset, ConstU32(1U))}; + Name(buf_word_offset, fmt::format("buf{}_word_off", binding)); + buffer.Offset(PointerSize::B16) = buf_word_offset; + } + if (True(desc.used_types & IR::Type::U32)) { + const Id buf_dword_offset{OpShiftRightLogical(U32[1], buf_offset, ConstU32(2U))}; + Name(buf_dword_offset, fmt::format("buf{}_dword_off", binding)); + buffer.Offset(PointerSize::B32) = buf_dword_offset; + } + if (True(desc.used_types & IR::Type::U64)) { + const Id buf_qword_offset{OpShiftRightLogical(U32[1], buf_offset, ConstU32(3U))}; + Name(buf_qword_offset, fmt::format("buf{}_qword_off", binding)); + buffer.Offset(PointerSize::B64) = buf_qword_offset; + } + } + + // Only load size if performing bounds checks. + if (!profile.supports_robust_buffer_access) { + const Id buf_size{desc.sharp_idx == std::numeric_limits::max() + ? 
ConstU32(desc.inline_cbuf.GetSize()) + : GetBufferSize(desc.sharp_idx)}; + Name(buf_size, fmt::format("buf{}_size", binding)); + buffer.Size(PointerSize::B8) = buf_size; + + if (True(desc.used_types & IR::Type::U16)) { + const Id buf_word_size{OpShiftRightLogical(U32[1], buf_size, ConstU32(1U))}; + Name(buf_word_size, fmt::format("buf{}_short_size", binding)); + buffer.Size(PointerSize::B16) = buf_word_size; + } + if (True(desc.used_types & IR::Type::U32)) { + const Id buf_dword_size{OpShiftRightLogical(U32[1], buf_size, ConstU32(2U))}; + Name(buf_dword_size, fmt::format("buf{}_dword_size", binding)); + buffer.Size(PointerSize::B32) = buf_dword_size; + } + if (True(desc.used_types & IR::Type::U64)) { + const Id buf_qword_size{OpShiftRightLogical(U32[1], buf_size, ConstU32(3U))}; + Name(buf_qword_size, fmt::format("buf{}_qword_size", binding)); + buffer.Size(PointerSize::B64) = buf_qword_size; } - Name(buffer.size, fmt::format("buf{}_size", binding)); - buffer.size_shorts = OpShiftRightLogical(U32[1], buffer.size, ConstU32(1U)); - Name(buffer.size_shorts, fmt::format("buf{}_short_size", binding)); - buffer.size_dwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(2U)); - Name(buffer.size_dwords, fmt::format("buf{}_dword_size", binding)); - buffer.size_qwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(3U)); - Name(buffer.size_qwords, fmt::format("buf{}_qword_size", binding)); } } } @@ -779,8 +784,7 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte }; void EmitContext::DefineBuffers() { - if (!profile.supports_robust_buffer_access && - info.readconst_types == Info::ReadConstType::None) { + if (!profile.supports_robust_buffer_access && !info.uses_dma) { // In case Flatbuf has not already been bound by IR and is needed // to query buffer sizes, bind it now. info.buffers.push_back({ @@ -809,23 +813,23 @@ void EmitContext::DefineBuffers() { // Define aliases depending on the shader usage. 
auto& spv_buffer = buffers.emplace_back(binding.buffer++, desc.buffer_type); if (True(desc.used_types & IR::Type::U64)) { - spv_buffer[PointerType::U64] = + spv_buffer.Alias(PointerType::U64) = DefineBuffer(is_storage, desc.is_written, 3, desc.buffer_type, U64); } if (True(desc.used_types & IR::Type::U32)) { - spv_buffer[PointerType::U32] = + spv_buffer.Alias(PointerType::U32) = DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, U32[1]); } if (True(desc.used_types & IR::Type::F32)) { - spv_buffer[PointerType::F32] = + spv_buffer.Alias(PointerType::F32) = DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, F32[1]); } if (True(desc.used_types & IR::Type::U16)) { - spv_buffer[PointerType::U16] = + spv_buffer.Alias(PointerType::U16) = DefineBuffer(is_storage, desc.is_written, 1, desc.buffer_type, U16); } if (True(desc.used_types & IR::Type::U8)) { - spv_buffer[PointerType::U8] = + spv_buffer.Alias(PointerType::U8) = DefineBuffer(is_storage, desc.is_written, 0, desc.buffer_type, U8); } ++binding.unified; @@ -1154,7 +1158,7 @@ Id EmitContext::DefineGetBdaPointer() { const auto page{OpShiftRightLogical(U64, address, caching_pagebits)}; const auto page32{OpUConvert(U32[1], page)}; const auto& bda_buffer{buffers[bda_pagetable_index]}; - const auto [bda_buffer_id, bda_pointer_type] = bda_buffer[PointerType::U64]; + const auto [bda_buffer_id, bda_pointer_type] = bda_buffer.Alias(PointerType::U64); const auto bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)}; const auto bda{OpLoad(U64, bda_ptr)}; @@ -1166,14 +1170,14 @@ Id EmitContext::DefineGetBdaPointer() { // First time acces, mark as fault AddLabel(fault_label); const auto& fault_buffer{buffers[fault_buffer_index]}; - const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8]; - const auto page_div8{OpShiftRightLogical(U32[1], page32, ConstU32(3U))}; - const auto page_mod8{OpBitwiseAnd(U32[1], page32, ConstU32(7U))}; - const auto page_mask{OpShiftLeftLogical(U8, u8_one_value, page_mod8)}; + const auto [fault_buffer_id, fault_pointer_type] = fault_buffer.Alias(PointerType::U32); + const auto page_div32{OpShiftRightLogical(U32[1], page32, ConstU32(5U))}; + const auto page_mod32{OpBitwiseAnd(U32[1], page32, ConstU32(31U))}; + const auto page_mask{OpShiftLeftLogical(U32[1], u32_one_value, page_mod32)}; const auto fault_ptr{ - OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div8)}; - const auto fault_value{OpLoad(U8, fault_ptr)}; - const auto fault_value_masked{OpBitwiseOr(U8, fault_value, page_mask)}; + OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div32)}; + const auto fault_value{OpLoad(U32[1], fault_ptr)}; + const auto fault_value_masked{OpBitwiseOr(U32[1], fault_value, page_mask)}; OpStore(fault_ptr, fault_value_masked); // Return null pointer @@ -1211,14 +1215,15 @@ Id EmitContext::DefineReadConst(bool dynamic) { const auto offset_bytes{OpShiftLeftLogical(U32[1], offset, ConstU32(2U))}; const auto addr{OpIAdd(U64, base_addr, OpUConvert(U64, offset_bytes))}; - const auto result = EmitMemoryRead(U32[1], addr, [&]() { + const auto result = EmitDwordMemoryRead(addr, [&]() { if (dynamic) { return u32_zero_value; } else { const auto& flatbuf_buffer{buffers[flatbuf_index]}; ASSERT(flatbuf_buffer.binding >= 0 && flatbuf_buffer.buffer_type == BufferType::Flatbuf); - const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32]; + const auto [flatbuf_buffer_id, flatbuf_pointer_type] = + 
flatbuf_buffer.Alias(PointerType::U32); const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value, flatbuf_offset)}; return OpLoad(U32[1], ptr); @@ -1239,7 +1244,7 @@ void EmitContext::DefineFunctions() { uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32"); uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32"); } - if (info.dma_types != IR::Type::Void) { + if (info.uses_dma) { get_bda_pointer = DefineGetBdaPointer(); } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 1eb7d05c6..f8c6416e8 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -42,17 +42,6 @@ public: Bindings& binding); ~EmitContext(); - enum class PointerType : u32 { - U8, - U16, - F16, - U32, - F32, - U64, - F64, - NumAlias, - }; - Id Def(const IR::Value& value); void DefineBufferProperties(); @@ -155,25 +144,7 @@ public: return last_label; } - PointerType PointerTypeFromType(Id type) { - if (type.value == U8.value) - return PointerType::U8; - if (type.value == U16.value) - return PointerType::U16; - if (type.value == F16[1].value) - return PointerType::F16; - if (type.value == U32[1].value) - return PointerType::U32; - if (type.value == F32[1].value) - return PointerType::F32; - if (type.value == U64.value) - return PointerType::U64; - if (type.value == F64[1].value) - return PointerType::F64; - UNREACHABLE_MSG("Unknown type for pointer"); - } - - Id EmitMemoryRead(Id type, Id address, auto&& fallback) { + Id EmitDwordMemoryRead(Id address, auto&& fallback) { const Id available_label = OpLabel(); const Id fallback_label = OpLabel(); const Id merge_label = OpLabel(); @@ -185,10 +156,8 @@ public: // Available AddLabel(available_label); - const auto pointer_type = PointerTypeFromType(type); - const Id pointer_type_id = physical_pointer_types[pointer_type]; - const Id addr_ptr = OpConvertUToPtr(pointer_type_id, addr); - const Id result = OpLoad(type, addr_ptr, spv::MemoryAccessMask::Aligned, 4u); + const Id addr_ptr = OpConvertUToPtr(physical_pointer_type_u32, addr); + const Id result = OpLoad(U32[1], addr_ptr, spv::MemoryAccessMask::Aligned, 4u); OpBranch(merge_label); // Fallback @@ -199,7 +168,7 @@ public: // Merge AddLabel(merge_label); const Id final_result = - OpPhi(type, fallback_result, fallback_label, result, available_label); + OpPhi(U32[1], fallback_result, fallback_label, result, available_label); return final_result; } @@ -314,6 +283,24 @@ public: bool is_storage = false; }; + enum class PointerType : u32 { + U8, + U16, + U32, + F32, + U64, + F64, + NumAlias, + }; + + enum class PointerSize : u32 { + B8, + B16, + B32, + B64, + NumClass, + }; + struct BufferSpv { Id id; Id pointer_type; @@ -322,32 +309,23 @@ public: struct BufferDefinition { u32 binding; BufferType buffer_type; - Id offset; - Id offset_dwords; - Id size; - Id size_shorts; - Id size_dwords; - Id size_qwords; + std::array offsets; + std::array sizes; std::array aliases; - const BufferSpv& operator[](PointerType alias) const { - return aliases[u32(alias)]; + template + auto& Alias(this Self& self, PointerType alias) { + return self.aliases[u32(alias)]; } - BufferSpv& operator[](PointerType alias) { - return aliases[u32(alias)]; - } - }; - - struct PhysicalPointerTypes { - std::array types; - - const Id& operator[](PointerType type) const { - return types[u32(type)]; + template + auto& Offset(this Self& self, PointerSize size) { + return 
self.offsets[u32(size)]; } - Id& operator[](PointerType type) { - return types[u32(type)]; + template + auto& Size(this Self& self, PointerSize size) { + return self.sizes[u32(size)]; } }; @@ -356,12 +334,12 @@ public: boost::container::small_vector buffers; boost::container::small_vector images; boost::container::small_vector samplers; - PhysicalPointerTypes physical_pointer_types; std::unordered_map first_to_last_label_map; size_t flatbuf_index{}; size_t bda_pagetable_index{}; size_t fault_buffer_index{}; + Id physical_pointer_type_u32; Id sampler_type{}; Id sampler_pointer_type{}; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 7beb594c3..48f977f49 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -1,7 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include #include "common/assert.h" #include "shader_recompiler/frontend/translate/translate.h" diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 6777c4769..b2b03bbbf 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -238,7 +238,7 @@ struct Info { Dynamic = 1 << 1, }; ReadConstType readconst_types{}; - IR::Type dma_types{IR::Type::Void}; + bool uses_dma{false}; explicit Info(Stage stage_, LogicalStage l_stage_, ShaderParams params) : stage{stage_}, l_stage{l_stage_}, pgm_hash{params.hash}, pgm_base{params.Base()}, diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 2e9b78f0e..f758d8e7b 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -105,6 +105,49 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { } } +u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) { + switch (inst.GetOpcode()) { + case IR::Opcode::LoadBufferU8: + case IR::Opcode::StoreBufferU8: + return 0; + case IR::Opcode::LoadBufferU16: + case IR::Opcode::StoreBufferU16: + return 1; + case IR::Opcode::LoadBufferU64: + case IR::Opcode::StoreBufferU64: + case IR::Opcode::BufferAtomicIAdd64: + return 3; + case IR::Opcode::LoadBufferFormatF32: + case IR::Opcode::StoreBufferFormatF32: { + switch (data_format) { + case AmdGpu::DataFormat::Format8: + return 0; + case AmdGpu::DataFormat::Format8_8: + case AmdGpu::DataFormat::Format16: + return 1; + case AmdGpu::DataFormat::Format8_8_8_8: + case AmdGpu::DataFormat::Format16_16: + case AmdGpu::DataFormat::Format10_11_11: + case AmdGpu::DataFormat::Format2_10_10_10: + case AmdGpu::DataFormat::Format16_16_16_16: + case AmdGpu::DataFormat::Format32: + case AmdGpu::DataFormat::Format32_32: + case AmdGpu::DataFormat::Format32_32_32: + case AmdGpu::DataFormat::Format32_32_32_32: + return 2; + default: + return 0; + } + break; + } + case IR::Opcode::ReadConstBuffer: + // Provided address is already in dwords + return 0; + default: + return 2; + } +} + bool IsImageAtomicInstruction(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::ImageAtomicIAdd32: @@ -496,6 +539,22 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info, const AmdGpu::Buffer& buffer, u32 stride) { const auto inst_info = 
inst.Flags(); + const u32 inst_offset = inst_info.inst_offset.Value(); + const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid; + const auto data_format = is_inst_typed + ? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value()) + : buffer.GetDataFmt(); + const u32 shift = BufferAddressShift(inst, data_format); + const u32 mask = (1 << shift) - 1; + + // If address calculation is of the form "index * const_stride + offset" with offset constant + // and both const_stride and offset are divisible with the element size, apply shift directly. + if (inst_info.index_enable && !inst_info.offset_enable && !buffer.swizzle_enable && + !buffer.add_tid_enable && (stride & mask) == 0 && (inst_offset & mask) == 0) { + // buffer_offset = index * (const_stride >> shift) + (inst_offset >> shift) + const IR::U32 index = IR::U32{inst.Arg(1)}; + return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)), ir.Imm32(inst_offset >> shift)); + } // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0) IR::U32 index = ir.Imm32(0U); @@ -512,7 +571,7 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In index = ir.IAdd(index, thread_id); } // offset = (inst_offen ? vgpr_offset : 0) + inst_offset - IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value()); + IR::U32 offset = ir.Imm32(inst_offset); if (inst_info.offset_enable) { const IR::U32 vgpr_offset = inst_info.index_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)} @@ -545,6 +604,9 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In // buffer_offset = index * const_stride + offset buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset); } + if (shift != 0) { + buffer_offset = ir.ShiftRightLogical(buffer_offset, ir.Imm32(shift)); + } return buffer_offset; } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index b3b4ac36a..797d8bb4a 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -102,7 +102,7 @@ void Visit(Info& info, const IR::Inst& inst) { info.uses_lane_id = true; break; case IR::Opcode::ReadConst: - if (info.readconst_types == Info::ReadConstType::None) { + if (!info.uses_dma) { info.buffers.push_back({ .used_types = IR::Type::U32, // We can't guarantee that flatbuf will not grow past UBO @@ -116,7 +116,7 @@ void Visit(Info& info, const IR::Inst& inst) { } else { info.readconst_types |= Info::ReadConstType::Dynamic; } - info.dma_types |= IR::Type::U32; + info.uses_dma = true; break; case IR::Opcode::PackUfloat10_11_11: info.uses_pack_10_11_11 = true; @@ -130,21 +130,22 @@ void Visit(Info& info, const IR::Inst& inst) { } void CollectShaderInfoPass(IR::Program& program) { + auto& info = program.info; for (IR::Block* const block : program.post_order_blocks) { for (IR::Inst& inst : block->Instructions()) { - Visit(program.info, inst); + Visit(info, inst); } } - if (program.info.dma_types != IR::Type::Void) { - program.info.buffers.push_back({ + if (info.uses_dma) { + info.buffers.push_back({ .used_types = IR::Type::U64, .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE), .buffer_type = BufferType::BdaPagetable, .is_written = true, }); - program.info.buffers.push_back({ - .used_types = IR::Type::U8, + info.buffers.push_back({ + .used_types = IR::Type::U32, .inline_cbuf = 
AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE), .buffer_type = BufferType::FaultBuffer, .is_written = true, diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index bcdf86962..d7eb307b6 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -35,7 +35,7 @@ struct Profile { bool lower_left_origin_mode{}; bool needs_manual_interpolation{}; bool needs_lds_barriers{}; - u64 min_ssbo_alignment{}; + bool needs_buffer_offsets{}; u64 max_ubo_size{}; u32 max_viewport_width{}; u32 max_viewport_height{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 1d8ac4823..831995339 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -225,6 +225,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, instance.GetDriverID() == vk::DriverId::eNvidiaProprietary, .needs_lds_barriers = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary || instance.GetDriverID() == vk::DriverId::eMoltenvk, + .needs_buffer_offsets = instance.StorageMinAlignment() > 4, // When binding a UBO, we calculate its size considering the offset in the larger buffer // cache underlying resource. In some cases, it may produce sizes exceeding the system // maximum allowed UBO range, so we need to reduce the threshold to prevent issues. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 9dea5ceea..fbeaaf9dc 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -468,17 +468,12 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { stage->PushUd(binding, push_data); BindBuffers(*stage, binding, push_data); BindTextures(*stage, binding); - - uses_dma |= stage->dma_types != Shader::IR::Type::Void; + uses_dma |= stage->uses_dma; } - pipeline->BindResources(set_writes, buffer_barriers, push_data); - if (uses_dma && !fault_process_pending) { // We only use fault buffer for DMA right now. { - // TODO: GPU might have written to memory (for example with EVENT_WRITE_EOP) - // we need to account for that and synchronize. Common::RecursiveSharedLock lock{mapped_ranges_mutex}; for (auto& range : mapped_ranges) { buffer_cache.SynchronizeBuffersInRange(range.lower(), @@ -490,6 +485,8 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { fault_process_pending |= uses_dma; + pipeline->BindResources(set_writes, buffer_barriers, push_data); + return true; }