Merge branch 'main' into filesystem_win

Commit a3e998b84d by georgemoralis, 2025-05-30 09:56:07 +03:00 (committed via GitHub)
82 changed files with 2400 additions and 557 deletions


@ -203,7 +203,7 @@ execute_process(
# Set Version
set(EMULATOR_VERSION_MAJOR "0")
set(EMULATOR_VERSION_MINOR "8")
set(EMULATOR_VERSION_MINOR "9")
set(EMULATOR_VERSION_PATCH "1")
set_source_files_properties(src/shadps4.rc PROPERTIES COMPILE_DEFINITIONS "EMULATOR_VERSION_MAJOR=${EMULATOR_VERSION_MAJOR};EMULATOR_VERSION_MINOR=${EMULATOR_VERSION_MINOR};EMULATOR_VERSION_PATCH=${EMULATOR_VERSION_PATCH}")
@ -227,7 +227,7 @@ find_package(SDL3 3.1.2 CONFIG)
find_package(stb MODULE)
find_package(toml11 4.2.0 CONFIG)
find_package(tsl-robin-map 1.3.0 CONFIG)
find_package(VulkanHeaders 1.4.309 CONFIG)
find_package(VulkanHeaders 1.4.314 CONFIG)
find_package(VulkanMemoryAllocator 3.1.0 CONFIG)
find_package(xbyak 7.07 CONFIG)
find_package(xxHash 0.8.2 MODULE)
@ -603,6 +603,8 @@ set(CAMERA_LIBS src/core/libraries/camera/camera.cpp
set(COMPANION_LIBS src/core/libraries/companion/companion_httpd.cpp
src/core/libraries/companion/companion_httpd.h
src/core/libraries/companion/companion_util.cpp
src/core/libraries/companion/companion_util.h
src/core/libraries/companion/companion_error.h
)
set(DEV_TOOLS src/core/devtools/layer.cpp
@ -622,6 +624,8 @@ set(DEV_TOOLS src/core/devtools/layer.cpp
src/core/devtools/widget/imgui_memory_editor.h
src/core/devtools/widget/memory_map.cpp
src/core/devtools/widget/memory_map.h
src/core/devtools/widget/module_list.cpp
src/core/devtools/widget/module_list.h
src/core/devtools/widget/reg_popup.cpp
src/core/devtools/widget/reg_popup.h
src/core/devtools/widget/reg_view.cpp
@ -674,6 +678,8 @@ set(COMMON src/common/logging/backend.cpp
src/common/polyfill_thread.h
src/common/rdtsc.cpp
src/common/rdtsc.h
src/common/recursive_lock.cpp
src/common/recursive_lock.h
src/common/sha1.h
src/common/signal_context.h
src/common/signal_context.cpp
@ -864,6 +870,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
src/shader_recompiler/ir/abstract_syntax_list.cpp
src/shader_recompiler/ir/abstract_syntax_list.h
src/shader_recompiler/ir/attribute.cpp
src/shader_recompiler/ir/attribute.h


@ -37,7 +37,10 @@
<category translate="no">Game</category>
</categories>
<releases>
<release version="0.8.0" date="2025-05-23">
<release version="0.9.0" date="2025-05-22">
<url>https://github.com/shadps4-emu/shadPS4/releases/tag/v.0.9.0</url>
</release>
<release version="0.8.0" date="2025-04-23">
<url>https://github.com/shadps4-emu/shadPS4/releases/tag/v.0.8.0</url>
</release>
<release version="0.7.0" date="2025-03-23">

@ -1 +1 @@
Subproject commit 87a8e8b13d4ad8835367fea1ebad1896d0460946
Subproject commit 3a0b07a24a4a681ffe70b461b1f4333b2729e2ef

@ -1 +1 @@
Subproject commit 7918775748c5e2f5c40d9918ce68825035b5a1e1
Subproject commit 969e75f7cc0718774231d029f9d52fa87d4ae1b2

externals/sirit (vendored)

@ -1 +1 @@
Subproject commit 09a1416ab1b59ddfebd2618412f118f2004f3b2c
Subproject commit 6b450704f6fedb9413d0c89a9eb59d028eb1e6c0

@ -1 +1 @@
Subproject commit 5ceb9ed481e58e705d0d9b5326537daedd06b97d
Subproject commit 9c77de5c3dd216f28e407eec65ed9c0a296c1f74


@ -154,7 +154,7 @@ bool GetLoadGameSizeEnabled() {
std::filesystem::path GetSaveDataPath() {
if (save_data_path.empty()) {
return Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir);
return Common::FS::GetUserPath(Common::FS::PathType::UserDir) / "savedata";
}
return save_data_path;
}
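A minimal sketch, not part of the diff, of how callers are expected to build per-title save paths now that the dedicated SaveDataDir PathType is gone; the user slot "1" and the title directory are placeholders:

    std::filesystem::path ExampleTitleSavePath() {
        // User slot "1" and the title id are illustrative only.
        return Config::GetSaveDataPath() / "1" / "CUSA00000";
    }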


@ -71,6 +71,7 @@ class ElfInfo {
PSFAttributes psf_attributes{};
std::filesystem::path splash_path{};
std::filesystem::path game_folder{};
public:
static constexpr u32 FW_15 = 0x1500000;
@ -123,6 +124,10 @@ public:
[[nodiscard]] const std::filesystem::path& GetSplashPath() const {
return splash_path;
}
[[nodiscard]] const std::filesystem::path& GetGameFolder() const {
return game_folder;
}
};
} // namespace Common


@ -140,6 +140,7 @@ bool ParseFilterRule(Filter& instance, Iterator begin, Iterator end) {
SUB(Lib, SigninDialog) \
SUB(Lib, Camera) \
SUB(Lib, CompanionHttpd) \
SUB(Lib, CompanionUtil) \
CLS(Frontend) \
CLS(Render) \
SUB(Render, Vulkan) \


@ -107,6 +107,7 @@ enum class Class : u8 {
Lib_SigninDialog, ///< The LibSigninDialog implementation.
Lib_Camera, ///< The LibCamera implementation.
Lib_CompanionHttpd, ///< The LibCompanionHttpd implementation.
Lib_CompanionUtil, ///< The LibCompanionUtil implementation.
Frontend, ///< Emulator UI
Render, ///< Video Core
Render_Vulkan, ///< Vulkan backend


@ -128,7 +128,6 @@ static auto UserPaths = [] {
create_path(PathType::LogDir, user_dir / LOG_DIR);
create_path(PathType::ScreenshotsDir, user_dir / SCREENSHOTS_DIR);
create_path(PathType::ShaderDir, user_dir / SHADER_DIR);
create_path(PathType::SaveDataDir, user_dir / SAVEDATA_DIR);
create_path(PathType::GameDataDir, user_dir / GAMEDATA_DIR);
create_path(PathType::TempDataDir, user_dir / TEMPDATA_DIR);
create_path(PathType::SysModuleDir, user_dir / SYSMODULES_DIR);


@ -18,7 +18,6 @@ enum class PathType {
LogDir, // Where log files are stored.
ScreenshotsDir, // Where screenshots are stored.
ShaderDir, // Where shaders are stored.
SaveDataDir, // Where guest save data is stored.
TempDataDir, // Where game temp data is stored.
GameDataDir, // Where game data is stored.
SysModuleDir, // Where system modules are stored.
@ -36,7 +35,6 @@ constexpr auto PORTABLE_DIR = "user";
constexpr auto LOG_DIR = "log";
constexpr auto SCREENSHOTS_DIR = "screenshots";
constexpr auto SHADER_DIR = "shader";
constexpr auto SAVEDATA_DIR = "savedata";
constexpr auto GAMEDATA_DIR = "data";
constexpr auto TEMPDATA_DIR = "temp";
constexpr auto SYSMODULES_DIR = "sys_modules";


@ -0,0 +1,37 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <unordered_map>
#include "common/assert.h"
#include "common/recursive_lock.h"
namespace Common::Detail {
struct RecursiveLockState {
RecursiveLockType type;
int count;
};
thread_local std::unordered_map<void*, RecursiveLockState> g_recursive_locks;
bool IncrementRecursiveLock(void* mutex, RecursiveLockType type) {
auto& state = g_recursive_locks[mutex];
if (state.count == 0) {
ASSERT(state.type == RecursiveLockType::None);
state.type = type;
}
ASSERT(state.type == type);
return state.count++ == 0;
}
bool DecrementRecursiveLock(void* mutex, RecursiveLockType type) {
auto& state = g_recursive_locks[mutex];
ASSERT(state.type == type && state.count > 0);
if (--state.count == 0) {
g_recursive_locks.erase(mutex);
return true;
}
return false;
}
} // namespace Common::Detail


@ -0,0 +1,67 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <mutex>
#include <optional>
#include <shared_mutex>
namespace Common {
namespace Detail {
enum class RecursiveLockType { None, Shared, Exclusive };
bool IncrementRecursiveLock(void* mutex, RecursiveLockType type);
bool DecrementRecursiveLock(void* mutex, RecursiveLockType type);
} // namespace Detail
template <typename MutexType>
class RecursiveScopedLock {
public:
explicit RecursiveScopedLock(MutexType& mutex) : m_mutex(mutex), m_locked(false) {
if (Detail::IncrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Exclusive)) {
m_locked = true;
m_lock.emplace(m_mutex);
}
}
~RecursiveScopedLock() {
Detail::DecrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Exclusive);
if (m_locked) {
m_lock.reset();
}
}
private:
MutexType& m_mutex;
std::optional<std::unique_lock<MutexType>> m_lock;
bool m_locked = false;
};
template <typename MutexType>
class RecursiveSharedLock {
public:
explicit RecursiveSharedLock(MutexType& mutex) : m_mutex(mutex), m_locked(false) {
if (Detail::IncrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Shared)) {
m_locked = true;
m_lock.emplace(m_mutex);
}
}
~RecursiveSharedLock() {
Detail::DecrementRecursiveLock(&m_mutex, Detail::RecursiveLockType::Shared);
if (m_locked) {
m_lock.reset();
}
}
private:
MutexType& m_mutex;
std::optional<std::shared_lock<MutexType>> m_lock;
bool m_locked = false;
};
} // namespace Common
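A brief usage sketch, not part of the commit, of what the thread-local counting buys: both guards below reference the same std::shared_mutex on one thread, the outer guard takes the real lock, and the nested one only bumps the per-thread count, so the re-entrant call does not deadlock. The mutex and function names are hypothetical.

    #include <shared_mutex>
    #include "common/recursive_lock.h"

    static std::shared_mutex g_state_mutex; // hypothetical guard for some shared state

    void ReadNested() {
        // Same thread, same mutex: count goes 1 -> 2, no std::shared_lock is constructed.
        Common::RecursiveSharedLock<std::shared_mutex> inner{g_state_mutex};
        // ... read shared state ...
    }

    void ReadOuter() {
        // Count goes 0 -> 1, so this guard actually acquires the shared lock.
        Common::RecursiveSharedLock<std::shared_mutex> outer{g_state_mutex};
        ReadNested();
    } // released here, once the count drops back to zero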


@ -14,6 +14,9 @@ namespace Common {
struct SlotId {
static constexpr u32 INVALID_INDEX = std::numeric_limits<u32>::max();
SlotId() noexcept = default;
constexpr SlotId(u32 index) noexcept : index(index) {}
constexpr auto operator<=>(const SlotId&) const noexcept = default;
constexpr explicit operator bool() const noexcept {
@ -28,6 +31,63 @@ class SlotVector {
constexpr static std::size_t InitialCapacity = 2048;
public:
template <typename ValueType, typename Pointer, typename Reference>
class Iterator {
public:
using iterator_category = std::forward_iterator_tag;
using value_type = ValueType;
using difference_type = std::ptrdiff_t;
using pointer = Pointer;
using reference = Reference;
Iterator(SlotVector& vector_, SlotId index_) : vector(vector_), slot(index_) {
AdvanceToValid();
}
reference operator*() const {
return vector[slot];
}
pointer operator->() const {
return &vector[slot];
}
Iterator& operator++() {
++slot.index;
AdvanceToValid();
return *this;
}
Iterator operator++(int) {
Iterator temp = *this;
++(*this);
return temp;
}
bool operator==(const Iterator& other) const {
return slot == other.slot;
}
bool operator!=(const Iterator& other) const {
return !(*this == other);
}
private:
void AdvanceToValid() {
while (slot < vector.values_capacity && !vector.ReadStorageBit(slot.index)) {
++slot.index;
}
}
SlotVector& vector;
SlotId slot;
};
using iterator = Iterator<T, T*, T&>;
using const_iterator = Iterator<const T, const T*, const T&>;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
SlotVector() {
Reserve(InitialCapacity);
}
@ -60,7 +120,7 @@ public:
}
template <typename... Args>
[[nodiscard]] SlotId insert(Args&&... args) noexcept {
SlotId insert(Args&&... args) noexcept {
const u32 index = FreeValueIndex();
new (&values[index].object) T(std::forward<Args>(args)...);
SetStorageBit(index);
@ -78,6 +138,54 @@ public:
return values_capacity - free_list.size();
}
iterator begin() noexcept {
return iterator(*this, 0);
}
const_iterator begin() const noexcept {
return const_iterator(*this, 0);
}
const_iterator cbegin() const noexcept {
return begin();
}
iterator end() noexcept {
return iterator(*this, values_capacity);
}
const_iterator end() const noexcept {
return const_iterator(*this, values_capacity);
}
const_iterator cend() const noexcept {
return end();
}
reverse_iterator rbegin() noexcept {
return reverse_iterator(end());
}
const_reverse_iterator rbegin() const noexcept {
return const_reverse_iterator(end());
}
const_reverse_iterator crbegin() const noexcept {
return rbegin();
}
reverse_iterator rend() noexcept {
return reverse_iterator(begin());
}
const_reverse_iterator rend() const noexcept {
return const_reverse_iterator(begin());
}
const_reverse_iterator crend() const noexcept {
return rend();
}
private:
struct NonTrivialDummy {
NonTrivialDummy() noexcept {}
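A hedged sketch of what the new iterator support enables (the Texture type, its fields, and the header path are assumptions): insert() still hands back SlotIds, and the added begin()/end() pair lets a range-for visit only the occupied slots, skipping holes left by erased entries.

    #include "common/slot_vector.h"

    struct Texture {
        u32 width{};
        u32 height{};
    };

    void ListTextures() {
        Common::SlotVector<Texture> textures;
        const Common::SlotId small_id = textures.insert(Texture{640u, 480u});
        const Common::SlotId large_id = textures.insert(Texture{1280u, 720u});
        for (const Texture& tex : textures) { // iterates live slots only, in index order
            // ... use tex.width / tex.height ...
        }
    }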


@ -17,6 +17,7 @@
#include "widget/frame_dump.h"
#include "widget/frame_graph.h"
#include "widget/memory_map.h"
#include "widget/module_list.h"
#include "widget/shader_list.h"
extern std::unique_ptr<Vulkan::Presenter> presenter;
@ -40,6 +41,7 @@ static bool just_opened_options = false;
static Widget::MemoryMapViewer memory_map;
static Widget::ShaderList shader_list;
static Widget::ModuleList module_list;
// clang-format off
static std::string help_text =
@ -108,6 +110,9 @@ void L::DrawMenuBar() {
if (MenuItem("Memory map")) {
memory_map.open = true;
}
if (MenuItem("Module list")) {
module_list.open = true;
}
ImGui::EndMenu();
}
@ -256,6 +261,9 @@ void L::DrawAdvanced() {
if (shader_list.open) {
shader_list.Draw();
}
if (module_list.open) {
module_list.Draw();
}
}
void L::DrawSimple() {


@ -0,0 +1,55 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "module_list.h"
#include <imgui.h>
#include "common.h"
#include "core/debug_state.h"
#include "imgui/imgui_std.h"
using namespace ImGui;
namespace Core::Devtools::Widget {
void ModuleList::Draw() {
SetNextWindowSize({550.0f, 600.0f}, ImGuiCond_FirstUseEver);
if (!Begin("Module List", &open)) {
End();
return;
}
if (BeginTable("ModuleTable", 3,
ImGuiTableFlags_Borders | ImGuiTableFlags_Resizable | ImGuiTableFlags_Sortable |
ImGuiTableFlags_RowBg)) {
TableSetupColumn("Module name", ImGuiTableColumnFlags_WidthStretch);
TableHeadersRow();
std::scoped_lock lock(modules_mutex);
for (const auto& module : modules) {
TableNextRow();
TableSetColumnIndex(0);
TextUnformatted(module.name.c_str());
TableSetColumnIndex(1);
if (module.is_sys_module) {
TextColored({0.2f, 0.6f, 0.8f, 1.0f}, "System Module");
} else {
TextColored({0.8f, 0.4f, 0.2f, 1.0f}, "Game Module");
}
TableSetColumnIndex(2);
if (module.is_lle) {
TextColored({0.4f, 0.7f, 0.4f, 1.0f}, "LLE");
} else {
TextColored({0.7f, 0.4f, 0.5f, 1.0f}, "HLE");
}
}
EndTable();
}
End();
}
} // namespace Core::Devtools::Widget


@ -0,0 +1,82 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <algorithm>
#include <filesystem>
#include <mutex>
#include <string>
#include <vector>
#include "common/elf_info.h"
#include "common/path_util.h"
namespace Core::Devtools::Widget {
class ModuleList {
public:
ModuleList() = default;
~ModuleList() = default;
void Draw();
bool open = false;
static bool IsSystemModule(const std::filesystem::path& path) {
const auto sys_modules_path = Common::FS::GetUserPath(Common::FS::PathType::SysModuleDir);
const auto abs_path = std::filesystem::absolute(path).lexically_normal();
const auto abs_sys_path = std::filesystem::absolute(sys_modules_path).lexically_normal();
const auto path_str = abs_path.string();
const auto sys_path_str = abs_sys_path.string();
return path_str.starts_with(sys_path_str);
}
static bool IsSystemModule(const std::string& name) {
const auto game_modules_path = Common::ElfInfo::Instance().GetGameFolder() / "sce_module";
const auto prx_path = game_modules_path / name;
if (!std::filesystem::exists(prx_path)) {
return true;
}
return false;
}
static void AddModule(const std::string& name, std::filesystem::path path) {
if (name == "eboot.bin") {
return;
}
std::scoped_lock lock(modules_mutex);
modules.push_back({name, IsSystemModule(path), true});
}
static void AddModule(std::string name) {
name = name + ".prx";
std::scoped_lock lock(modules_mutex);
bool is_sys_module = IsSystemModule(name);
bool is_lle = false;
auto it = std::find_if(modules.begin(), modules.end(),
[&name, is_sys_module, is_lle](const ModuleInfo& entry) {
return entry.name == name && !entry.is_lle;
});
if (it == modules.end()) {
modules.push_back({name, is_sys_module, is_lle});
}
}
private:
struct ModuleInfo {
std::string name;
bool is_sys_module;
bool is_lle;
};
static inline std::mutex modules_mutex;
static inline std::vector<ModuleInfo> modules;
};
} // namespace Core::Devtools::Widget
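Illustrative calls (the module names and path are made up) showing how the two AddModule overloads divide the work: the linker registers PRX files it actually loads as LLE via the (name, path) overload, while HLE symbol resolution uses the name-only overload, which appends ".prx", classifies the module, and de-duplicates entries.

    #include "core/devtools/widget/module_list.h"

    void RegisterExampleModules() {
        using Core::Devtools::Widget::ModuleList;
        // Loaded from disk by the linker: recorded as an LLE module.
        ModuleList::AddModule("libSceNpTrophy.prx", "/game/sce_module/libSceNpTrophy.prx");
        // Resolved against the emulator's built-in implementation: recorded as HLE
        // (stored internally as "libSceSaveData.prx").
        ModuleList::AddModule("libSceSaveData");
    }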


@ -3,6 +3,8 @@
#pragma once
#include "common/types.h"
// companion_httpd error codes
constexpr int ORBIS_COMPANION_HTTPD_ERROR_UNKNOWN = 0x80E40001;
constexpr int ORBIS_COMPANION_HTTPD_ERROR_FATAL = 0x80E40002;
@ -18,3 +20,8 @@ constexpr int ORBIS_COMPANION_HTTPD_ERROR_NOT_STARTED = 0x80E4000B;
constexpr int ORBIS_COMPANION_HTTPD_ERROR_ALREADY_REGISTERED = 0x80E4000C;
constexpr int ORBIS_COMPANION_HTTPD_ERROR_NOT_CONNECTED = 0x80E4000D;
constexpr int ORBIS_COMPANION_HTTPD_ERROR_USER_NOT_FOUND = 0x80E4000E;
// companion_util error codes
constexpr u32 ORBIS_COMPANION_UTIL_INVALID_ARGUMENT = 0x80AD0004;
constexpr u32 ORBIS_COMPANION_UTIL_INVALID_POINTER = 0x80AD0006;
constexpr u32 ORBIS_COMPANION_UTIL_NO_EVENT = 0x80AD0008;


@ -0,0 +1,72 @@
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/logging/log.h"
#include "companion_error.h"
#include "core/libraries/companion/companion_util.h"
#include "core/libraries/error_codes.h"
#include "core/libraries/libs.h"
namespace Libraries::CompanionUtil {
u32 PS4_SYSV_ABI getEvent(sceCompanionUtilContext* ctx, sceCompanionUtilEvent* outEvent,
s32 param_3) {
if (outEvent == 0) {
return ORBIS_COMPANION_UTIL_INVALID_ARGUMENT;
}
if (ctx == nullptr) {
return ORBIS_COMPANION_UTIL_INVALID_POINTER;
}
uint8_t* base = ctx->blob;
int flag = *reinterpret_cast<int*>(base + 0x178);
if (flag == 0) {
return ORBIS_COMPANION_UTIL_NO_EVENT;
}
return ORBIS_COMPANION_UTIL_OK;
}
s32 PS4_SYSV_ABI sceCompanionUtilGetEvent(sceCompanionUtilEvent* outEvent) {
sceCompanionUtilContext* ctx = nullptr;
u32 ret = getEvent(ctx, outEvent, 1);
LOG_DEBUG(Lib_CompanionUtil, "(STUBBED) called ret: {}", ret);
return ret;
}
s32 PS4_SYSV_ABI sceCompanionUtilGetRemoteOskEvent() {
LOG_ERROR(Lib_CompanionUtil, "(STUBBED) called");
return ORBIS_OK;
}
s32 PS4_SYSV_ABI sceCompanionUtilInitialize() {
LOG_ERROR(Lib_CompanionUtil, "(STUBBED) called");
return ORBIS_OK;
}
s32 PS4_SYSV_ABI sceCompanionUtilOptParamInitialize() {
LOG_ERROR(Lib_CompanionUtil, "(STUBBED) called");
return ORBIS_OK;
}
s32 PS4_SYSV_ABI sceCompanionUtilTerminate() {
LOG_ERROR(Lib_CompanionUtil, "(STUBBED) called");
return ORBIS_OK;
}
void RegisterlibSceCompanionUtil(Core::Loader::SymbolsResolver* sym) {
LIB_FUNCTION("cE5Msy11WhU", "libSceCompanionUtil", 1, "libSceCompanionUtil", 1, 1,
sceCompanionUtilGetEvent);
LIB_FUNCTION("MaVrz79mT5o", "libSceCompanionUtil", 1, "libSceCompanionUtil", 1, 1,
sceCompanionUtilGetRemoteOskEvent);
LIB_FUNCTION("xb1xlIhf0QY", "libSceCompanionUtil", 1, "libSceCompanionUtil", 1, 1,
sceCompanionUtilInitialize);
LIB_FUNCTION("IPN-FRSrafk", "libSceCompanionUtil", 1, "libSceCompanionUtil", 1, 1,
sceCompanionUtilOptParamInitialize);
LIB_FUNCTION("H1fYQd5lFAI", "libSceCompanionUtil", 1, "libSceCompanionUtil", 1, 1,
sceCompanionUtilTerminate);
};
} // namespace Libraries::CompanionUtil


@ -0,0 +1,33 @@
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include "common/types.h"
namespace Core::Loader {
class SymbolsResolver;
}
namespace Libraries::CompanionUtil {
constexpr u32 ORBIS_COMPANION_UTIL_OK = 0;
struct sceCompanionUtilEvent {
std::uint8_t blob[0x104]{}; /// 0x104 bytes of data, don't know what it is exactly
};
struct sceCompanionUtilContext {
std::uint8_t blob[0x27B]{}; /// 0x27B bytes of data, don't know what it is exactly
};
u32 PS4_SYSV_ABI getEvent(sceCompanionUtilContext* ctx, sceCompanionUtilEvent* outEvent,
s32 param_3);
s32 PS4_SYSV_ABI sceCompanionUtilGetEvent(sceCompanionUtilEvent* outEvent);
s32 PS4_SYSV_ABI sceCompanionUtilGetRemoteOskEvent();
s32 PS4_SYSV_ABI sceCompanionUtilInitialize();
s32 PS4_SYSV_ABI sceCompanionUtilOptParamInitialize();
s32 PS4_SYSV_ABI sceCompanionUtilTerminate();
void RegisterlibSceCompanionUtil(Core::Loader::SymbolsResolver* sym);
} // namespace Libraries::CompanionUtil
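A hedged sketch of how guest code would poll this stub (the local variable and wrapper function are placeholders). Because no context object is ever created yet, getEvent() is reached with a null context and the call currently reports ORBIS_COMPANION_UTIL_INVALID_POINTER rather than delivering an event:

    void PollCompanionEvents() {
        Libraries::CompanionUtil::sceCompanionUtilEvent event{};
        const s32 rc = Libraries::CompanionUtil::sceCompanionUtilGetEvent(&event);
        if (rc != Libraries::CompanionUtil::ORBIS_COMPANION_UTIL_OK) {
            // No companion event to handle this frame.
        }
    }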


@ -98,6 +98,11 @@ bool EqueueInternal::RemoveEvent(u64 id, s16 filter) {
}
int EqueueInternal::WaitForEvents(SceKernelEvent* ev, int num, u32 micros) {
if (HasSmallTimer()) {
// If a small timer is set, just wait for it to expire.
return WaitForSmallTimer(ev, num, micros);
}
int count = 0;
const auto predicate = [&] {
@ -187,7 +192,8 @@ int EqueueInternal::WaitForSmallTimer(SceKernelEvent* ev, int num, u32 micros) {
ASSERT(num == 1);
auto curr_clock = std::chrono::steady_clock::now();
const auto wait_end_us = curr_clock + std::chrono::microseconds{micros};
const auto wait_end_us = (micros == 0) ? std::chrono::steady_clock::time_point::max()
: curr_clock + std::chrono::microseconds{micros};
do {
curr_clock = std::chrono::steady_clock::now();
@ -266,24 +272,15 @@ int PS4_SYSV_ABI sceKernelWaitEqueue(SceKernelEqueue eq, SceKernelEvent* ev, int
return ORBIS_KERNEL_ERROR_EINVAL;
}
if (eq->HasSmallTimer()) {
ASSERT(timo && *timo);
*out = eq->WaitForSmallTimer(ev, num, *timo);
if (timo == nullptr) {
// When the timeout is nullptr, we wait indefinitely
*out = eq->WaitForEvents(ev, num, 0);
} else if (*timo == 0) {
// Only events that have already arrived at the time of this function call can be received
*out = eq->GetTriggeredEvents(ev, num);
} else {
if (timo == nullptr) { // wait until an event arrives without timing out
*out = eq->WaitForEvents(ev, num, 0);
}
if (timo != nullptr) {
// Only events that have already arrived at the time of this function call can be
// received
if (*timo == 0) {
*out = eq->GetTriggeredEvents(ev, num);
} else {
// Wait until an event arrives with timing out
*out = eq->WaitForEvents(ev, num, *timo);
}
}
// Wait for up to the specified timeout value
*out = eq->WaitForEvents(ev, num, *timo);
}
if (*out == 0) {
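Taken together, the rewritten sceKernelWaitEqueue distinguishes three timeout modes. A hedged sketch of the corresponding call sites (eq, the event buffer, and the exact useconds typedef are placeholders; the argument order follows the hunk above):

    void WaitExamples(SceKernelEqueue eq) {
        SceKernelEvent events[4]{};
        int out = 0;
        u32 poll_now = 0;
        u32 timeout_us = 5000;
        sceKernelWaitEqueue(eq, events, 4, &out, nullptr);      // timo == nullptr: wait indefinitely
        sceKernelWaitEqueue(eq, events, 4, &out, &poll_now);    // *timo == 0: only events already triggered
        sceKernelWaitEqueue(eq, events, 4, &out, &timeout_us);  // otherwise: wait for up to ~5 ms
    }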


@ -8,7 +8,6 @@
#include "common/logging/log.h"
#include "common/scope_exit.h"
#include "common/singleton.h"
#include "core/file_sys/fs.h"
#include "core/libraries/kernel/kernel.h"
#include "core/libraries/kernel/memory.h"
#include "core/libraries/kernel/orbis_error.h"
@ -152,7 +151,8 @@ s32 PS4_SYSV_ABI sceKernelReserveVirtualRange(void** addr, u64 len, int flags, u
const VAddr in_addr = reinterpret_cast<VAddr>(*addr);
const auto map_flags = static_cast<Core::MemoryMapFlags>(flags);
s32 result = memory->Reserve(addr, in_addr, len, map_flags, alignment);
s32 result = memory->MapMemory(addr, in_addr, len, Core::MemoryProt::NoAccess, map_flags,
Core::VMAType::Reserved, "anon", false, -1, alignment);
if (result == 0) {
LOG_INFO(Kernel_Vmm, "out_addr = {}", fmt::ptr(*addr));
}
@ -263,13 +263,22 @@ int PS4_SYSV_ABI sceKernelQueryMemoryProtection(void* addr, void** start, void**
return memory->QueryProtection(std::bit_cast<VAddr>(addr), start, end, prot);
}
int PS4_SYSV_ABI sceKernelMProtect(const void* addr, size_t size, int prot) {
s32 PS4_SYSV_ABI sceKernelMprotect(const void* addr, u64 size, s32 prot) {
Core::MemoryManager* memory_manager = Core::Memory::Instance();
Core::MemoryProt protection_flags = static_cast<Core::MemoryProt>(prot);
return memory_manager->Protect(std::bit_cast<VAddr>(addr), size, protection_flags);
}
int PS4_SYSV_ABI sceKernelMTypeProtect(const void* addr, size_t size, int mtype, int prot) {
s32 PS4_SYSV_ABI posix_mprotect(const void* addr, u64 size, s32 prot) {
s32 result = sceKernelMprotect(addr, size, prot);
if (result < 0) {
ErrSceToPosix(result);
return -1;
}
return result;
}
s32 PS4_SYSV_ABI sceKernelMtypeprotect(const void* addr, u64 size, s32 mtype, s32 prot) {
Core::MemoryManager* memory_manager = Core::Memory::Instance();
Core::MemoryProt protection_flags = static_cast<Core::MemoryProt>(prot);
return memory_manager->Protect(std::bit_cast<VAddr>(addr), size, protection_flags);
@ -344,7 +353,7 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn
break;
}
case MemoryOpTypes::ORBIS_KERNEL_MAP_OP_PROTECT: {
result = sceKernelMProtect(entries[i].start, entries[i].length, entries[i].protection);
result = sceKernelMprotect(entries[i].start, entries[i].length, entries[i].protection);
LOG_INFO(Kernel_Vmm, "entry = {}, operation = {}, len = {:#x}, result = {}", i,
entries[i].operation, entries[i].length, result);
break;
@ -359,7 +368,7 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn
break;
}
case MemoryOpTypes::ORBIS_KERNEL_MAP_OP_TYPE_PROTECT: {
result = sceKernelMTypeProtect(entries[i].start, entries[i].length, entries[i].type,
result = sceKernelMtypeprotect(entries[i].start, entries[i].length, entries[i].type,
entries[i].protection);
LOG_INFO(Kernel_Vmm, "entry = {}, operation = {}, len = {:#x}, result = {}", i,
entries[i].operation, entries[i].length, result);
@ -380,7 +389,7 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn
return result;
}
s32 PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, size_t len, const char* name) {
s32 PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, u64 len, const char* name) {
if (name == nullptr) {
LOG_ERROR(Kernel_Vmm, "name is invalid!");
return ORBIS_KERNEL_ERROR_EFAULT;
@ -396,8 +405,8 @@ s32 PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, size_t len, cons
return ORBIS_OK;
}
s32 PS4_SYSV_ABI sceKernelMemoryPoolExpand(u64 searchStart, u64 searchEnd, size_t len,
size_t alignment, u64* physAddrOut) {
s32 PS4_SYSV_ABI sceKernelMemoryPoolExpand(u64 searchStart, u64 searchEnd, u64 len, u64 alignment,
u64* physAddrOut) {
if (searchStart < 0 || searchEnd <= searchStart) {
LOG_ERROR(Kernel_Vmm, "Provided address range is invalid!");
return ORBIS_KERNEL_ERROR_EINVAL;
@ -439,10 +448,10 @@ s32 PS4_SYSV_ABI sceKernelMemoryPoolExpand(u64 searchStart, u64 searchEnd, size_
return ORBIS_OK;
}
s32 PS4_SYSV_ABI sceKernelMemoryPoolReserve(void* addrIn, size_t len, size_t alignment, int flags,
void** addrOut) {
LOG_INFO(Kernel_Vmm, "addrIn = {}, len = {:#x}, alignment = {:#x}, flags = {:#x}",
fmt::ptr(addrIn), len, alignment, flags);
s32 PS4_SYSV_ABI sceKernelMemoryPoolReserve(void* addr_in, u64 len, u64 alignment, s32 flags,
void** addr_out) {
LOG_INFO(Kernel_Vmm, "addr_in = {}, len = {:#x}, alignment = {:#x}, flags = {:#x}",
fmt::ptr(addr_in), len, alignment, flags);
if (len == 0 || !Common::Is2MBAligned(len)) {
LOG_ERROR(Kernel_Vmm, "Map size is either zero or not 2MB aligned!");
@ -456,14 +465,16 @@ s32 PS4_SYSV_ABI sceKernelMemoryPoolReserve(void* addrIn, size_t len, size_t ali
}
auto* memory = Core::Memory::Instance();
const VAddr in_addr = reinterpret_cast<VAddr>(addrIn);
const VAddr in_addr = reinterpret_cast<VAddr>(addr_in);
const auto map_flags = static_cast<Core::MemoryMapFlags>(flags);
memory->PoolReserve(addrOut, in_addr, len, map_flags, alignment);
u64 map_alignment = alignment == 0 ? 2_MB : alignment;
return ORBIS_OK;
return memory->MapMemory(addr_out, std::bit_cast<VAddr>(addr_in), len,
Core::MemoryProt::NoAccess, map_flags, Core::VMAType::PoolReserved,
"anon", false, -1, map_alignment);
}
s32 PS4_SYSV_ABI sceKernelMemoryPoolCommit(void* addr, size_t len, int type, int prot, int flags) {
s32 PS4_SYSV_ABI sceKernelMemoryPoolCommit(void* addr, u64 len, s32 type, s32 prot, s32 flags) {
if (addr == nullptr) {
LOG_ERROR(Kernel_Vmm, "Address is invalid!");
return ORBIS_KERNEL_ERROR_EINVAL;
@ -482,7 +493,7 @@ s32 PS4_SYSV_ABI sceKernelMemoryPoolCommit(void* addr, size_t len, int type, int
return memory->PoolCommit(in_addr, len, mem_prot);
}
s32 PS4_SYSV_ABI sceKernelMemoryPoolDecommit(void* addr, size_t len, int flags) {
s32 PS4_SYSV_ABI sceKernelMemoryPoolDecommit(void* addr, u64 len, s32 flags) {
if (addr == nullptr) {
LOG_ERROR(Kernel_Vmm, "Address is invalid!");
return ORBIS_KERNEL_ERROR_EINVAL;
@ -523,12 +534,12 @@ s32 PS4_SYSV_ABI sceKernelMemoryPoolBatch(const OrbisKernelMemoryPoolBatchEntry*
break;
}
case OrbisKernelMemoryPoolOpcode::Protect: {
result = sceKernelMProtect(entry.protect_params.addr, entry.protect_params.len,
result = sceKernelMprotect(entry.protect_params.addr, entry.protect_params.len,
entry.protect_params.prot);
break;
}
case OrbisKernelMemoryPoolOpcode::TypeProtect: {
result = sceKernelMTypeProtect(
result = sceKernelMtypeprotect(
entry.type_protect_params.addr, entry.type_protect_params.len,
entry.type_protect_params.type, entry.type_protect_params.prot);
break;
@ -553,30 +564,48 @@ s32 PS4_SYSV_ABI sceKernelMemoryPoolBatch(const OrbisKernelMemoryPoolBatchEntry*
return result;
}
int PS4_SYSV_ABI sceKernelMmap(void* addr, u64 len, int prot, int flags, int fd, size_t offset,
void** res) {
LOG_INFO(Kernel_Vmm, "called addr = {}, len = {}, prot = {}, flags = {}, fd = {}, offset = {}",
fmt::ptr(addr), len, prot, flags, fd, offset);
auto* h = Common::Singleton<Core::FileSys::HandleTable>::Instance();
void* PS4_SYSV_ABI posix_mmap(void* addr, u64 len, s32 prot, s32 flags, s32 fd, s64 phys_addr) {
LOG_INFO(Kernel_Vmm,
"called addr = {}, len = {}, prot = {}, flags = {}, fd = {}, phys_addr = {}",
fmt::ptr(addr), len, prot, flags, fd, phys_addr);
void* addr_out;
auto* memory = Core::Memory::Instance();
const auto mem_prot = static_cast<Core::MemoryProt>(prot);
const auto mem_flags = static_cast<Core::MemoryMapFlags>(flags);
s32 result = ORBIS_OK;
if (fd == -1) {
return memory->MapMemory(res, std::bit_cast<VAddr>(addr), len, mem_prot, mem_flags,
Core::VMAType::Flexible);
result = memory->MapMemory(&addr_out, std::bit_cast<VAddr>(addr), len, mem_prot, mem_flags,
Core::VMAType::Flexible);
} else {
const uintptr_t handle = h->GetFile(fd)->f.GetFileMapping();
return memory->MapFile(res, std::bit_cast<VAddr>(addr), len, mem_prot, mem_flags, handle,
offset);
result = memory->MapFile(&addr_out, std::bit_cast<VAddr>(addr), len, mem_prot, mem_flags,
fd, phys_addr);
}
if (result != ORBIS_OK) {
// If the memory mapping fails, mmap sets errno to the appropriate error code,
// then returns (void*)-1;
ErrSceToPosix(result);
return reinterpret_cast<void*>(-1);
}
return addr_out;
}
void* PS4_SYSV_ABI posix_mmap(void* addr, u64 len, int prot, int flags, int fd, u64 offset) {
void* ptr;
LOG_INFO(Kernel_Vmm, "posix mmap redirect to sceKernelMmap");
int result = sceKernelMmap(addr, len, prot, flags, fd, offset, &ptr);
ASSERT(result == 0);
return ptr;
s32 PS4_SYSV_ABI sceKernelMmap(void* addr, u64 len, s32 prot, s32 flags, s32 fd, s64 phys_addr,
void** res) {
void* addr_out = posix_mmap(addr, len, prot, flags, fd, phys_addr);
if (addr_out == reinterpret_cast<void*>(-1)) {
// posix_mmap failed; derive the appropriate kernel error code from errno and return it.
LOG_ERROR(Kernel_Fs, "error = {}", *__Error());
return ErrnoToSceKernelError(*__Error());
}
// Set the outputted address
*res = addr_out;
return ORBIS_OK;
}
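A minimal sketch, not from the commit, contrasting the two entry points after the rework; the prot/flags values are illustrative stand-ins for the guest's PROT_*/MAP_* constants, and fd == -1 requests an anonymous (flexible) mapping.

    void MapExamples() {
        const s32 prot = 0x3;     // read | write (illustrative)
        const s32 flags = 0x1002; // private | anonymous (illustrative)

        // POSIX convention: (void*)-1 plus errno on failure.
        void* p = posix_mmap(nullptr, 0x10000, prot, flags, -1, 0);

        // Kernel convention: the address comes back through the out parameter and the
        // return value is an ORBIS_* code translated from errno if the mapping fails.
        void* mapped = nullptr;
        const s32 rc = sceKernelMmap(nullptr, 0x10000, prot, flags, -1, 0, &mapped);
    }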
s32 PS4_SYSV_ABI sceKernelConfiguredFlexibleMemorySize(u64* sizeOut) {
@ -678,8 +707,9 @@ void RegisterMemory(Core::Loader::SymbolsResolver* sym) {
LIB_FUNCTION("n1-v6FgU7MQ", "libkernel", 1, "libkernel", 1, 1,
sceKernelConfiguredFlexibleMemorySize);
LIB_FUNCTION("9bfdLIyuwCY", "libkernel", 1, "libkernel", 1, 1, sceKernelMTypeProtect);
LIB_FUNCTION("vSMAm3cxYTY", "libkernel", 1, "libkernel", 1, 1, sceKernelMProtect);
LIB_FUNCTION("vSMAm3cxYTY", "libkernel", 1, "libkernel", 1, 1, sceKernelMprotect);
LIB_FUNCTION("YQOfxL4QfeU", "libScePosix", 1, "libkernel", 1, 1, posix_mprotect);
LIB_FUNCTION("9bfdLIyuwCY", "libkernel", 1, "libkernel", 1, 1, sceKernelMtypeprotect);
// Memory pool
LIB_FUNCTION("qCSfqDILlns", "libkernel", 1, "libkernel", 1, 1, sceKernelMemoryPoolExpand);


@ -147,9 +147,9 @@ s32 PS4_SYSV_ABI sceKernelMapFlexibleMemory(void** addr_in_out, std::size_t len,
int flags);
int PS4_SYSV_ABI sceKernelQueryMemoryProtection(void* addr, void** start, void** end, u32* prot);
int PS4_SYSV_ABI sceKernelMProtect(const void* addr, size_t size, int prot);
s32 PS4_SYSV_ABI sceKernelMprotect(const void* addr, u64 size, s32 prot);
int PS4_SYSV_ABI sceKernelMTypeProtect(const void* addr, size_t size, int mtype, int prot);
s32 PS4_SYSV_ABI sceKernelMtypeprotect(const void* addr, u64 size, s32 mtype, s32 prot);
int PS4_SYSV_ABI sceKernelDirectMemoryQuery(u64 offset, int flags, OrbisQueryInfo* query_info,
size_t infoSize);
@ -165,14 +165,14 @@ s32 PS4_SYSV_ABI sceKernelBatchMap(OrbisKernelBatchMapEntry* entries, int numEnt
s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEntries,
int* numEntriesOut, int flags);
s32 PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, size_t len, const char* name);
s32 PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, u64 len, const char* name);
s32 PS4_SYSV_ABI sceKernelMemoryPoolExpand(u64 searchStart, u64 searchEnd, size_t len,
size_t alignment, u64* physAddrOut);
s32 PS4_SYSV_ABI sceKernelMemoryPoolReserve(void* addrIn, size_t len, size_t alignment, int flags,
void** addrOut);
s32 PS4_SYSV_ABI sceKernelMemoryPoolCommit(void* addr, size_t len, int type, int prot, int flags);
s32 PS4_SYSV_ABI sceKernelMemoryPoolDecommit(void* addr, size_t len, int flags);
s32 PS4_SYSV_ABI sceKernelMemoryPoolExpand(u64 searchStart, u64 searchEnd, u64 len, u64 alignment,
u64* physAddrOut);
s32 PS4_SYSV_ABI sceKernelMemoryPoolReserve(void* addr_in, u64 len, u64 alignment, s32 flags,
void** addr_out);
s32 PS4_SYSV_ABI sceKernelMemoryPoolCommit(void* addr, u64 len, s32 type, s32 prot, s32 flags);
s32 PS4_SYSV_ABI sceKernelMemoryPoolDecommit(void* addr, u64 len, s32 flags);
s32 PS4_SYSV_ABI sceKernelMemoryPoolBatch(const OrbisKernelMemoryPoolBatchEntry* entries, s32 count,
s32* num_processed, s32 flags);


@ -10,6 +10,7 @@
#include "core/libraries/avplayer/avplayer.h"
#include "core/libraries/camera/camera.h"
#include "core/libraries/companion/companion_httpd.h"
#include "core/libraries/companion/companion_util.h"
#include "core/libraries/disc_map/disc_map.h"
#include "core/libraries/game_live_streaming/gamelivestreaming.h"
#include "core/libraries/gnmdriver/gnmdriver.h"
@ -126,6 +127,7 @@ void InitHLELibs(Core::Loader::SymbolsResolver* sym) {
Libraries::SigninDialog::RegisterlibSceSigninDialog(sym);
Libraries::Camera::RegisterlibSceCamera(sym);
Libraries::CompanionHttpd::RegisterlibSceCompanionHttpd(sym);
Libraries::CompanionUtil::RegisterlibSceCompanionUtil(sym);
}
} // namespace Libraries


@ -955,16 +955,148 @@ u16 PS4_SYSV_ABI sceNetHtons(u16 host16) {
return htons(host16);
}
const char* PS4_SYSV_ABI sceNetInetNtop(int af, const void* src, char* dst, u32 size) {
#ifdef WIN32
const char* res = InetNtopA(af, src, dst, size);
#else
const char* res = inet_ntop(af, src, dst, size);
#endif
if (res == nullptr) {
UNREACHABLE();
// Windows does not provide strlcpy, so implement one here
u64 strlcpy(char* dst, const char* src, u64 size) {
u64 src_len = strlen(src);
if (size > 0) {
u64 copy_len = (src_len >= size) ? (size - 1) : src_len;
memcpy(dst, src, copy_len);
dst[copy_len] = '\0';
}
return dst;
return src_len;
}
#endif
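A quick worked example of the shim's contract (the wrapper function is illustrative): unlike strncpy it always NUL-terminates, and it returns the length of the source string, so truncation is detected by comparing the return value against the buffer size.

    void CopyExample() {
        char buf[8];
        const u64 needed = strlcpy(buf, "0123456789", sizeof(buf)); // buf == "0123456", needed == 10
        if (needed >= sizeof(buf)) {
            // The source did not fit; the copy was truncated.
        }
    }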
const char* freebsd_inet_ntop4(const char* src, char* dst, u64 size) {
static const char fmt[] = "%u.%u.%u.%u";
char tmp[sizeof "255.255.255.255"];
int l;
// The address bytes must be treated as unsigned; plain char may be signed on the host.
l = snprintf(tmp, sizeof(tmp), fmt, static_cast<u8>(src[0]), static_cast<u8>(src[1]), static_cast<u8>(src[2]), static_cast<u8>(src[3]));
if (l <= 0 || (socklen_t)l >= size) {
return nullptr;
}
strlcpy(dst, tmp, size);
return (dst);
}
const char* freebsd_inet_ntop6(const char* src, char* dst, u64 size) {
/*
* Note that int32_t and int16_t need only be "at least" large enough
* to contain a value of the specified size. On some systems, like
* Crays, there is no such thing as an integer variable with 16 bits.
* Keep this in mind if you think this function should have been coded
* to use pointer overlays. All the world's not a VAX.
*/
char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
struct {
int base, len;
} best, cur;
#define NS_IN6ADDRSZ 16
#define NS_INT16SZ 2
u_int words[NS_IN6ADDRSZ / NS_INT16SZ];
int i;
/*
* Preprocess:
* Copy the input (bytewise) array into a wordwise array.
* Find the longest run of 0x00's in src[] for :: shorthanding.
*/
memset(words, '\0', sizeof words);
for (i = 0; i < NS_IN6ADDRSZ; i++)
words[i / 2] |= (static_cast<u8>(src[i]) << ((1 - (i % 2)) << 3));
best.base = -1;
best.len = 0;
cur.base = -1;
cur.len = 0;
for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++) {
if (words[i] == 0) {
if (cur.base == -1)
cur.base = i, cur.len = 1;
else
cur.len++;
} else {
if (cur.base != -1) {
if (best.base == -1 || cur.len > best.len)
best = cur;
cur.base = -1;
}
}
}
if (cur.base != -1) {
if (best.base == -1 || cur.len > best.len)
best = cur;
}
if (best.base != -1 && best.len < 2)
best.base = -1;
/*
* Format the result.
*/
tp = tmp;
for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++) {
/* Are we inside the best run of 0x00's? */
if (best.base != -1 && i >= best.base && i < (best.base + best.len)) {
if (i == best.base)
*tp++ = ':';
continue;
}
/* Are we following an initial run of 0x00s or any real hex? */
if (i != 0)
*tp++ = ':';
/* Is this address an encapsulated IPv4? */
if (i == 6 && best.base == 0 &&
(best.len == 6 || (best.len == 7 && words[7] != 0x0001) ||
(best.len == 5 && words[5] == 0xffff))) {
if (!freebsd_inet_ntop4(src + 12, tp, sizeof tmp - (tp - tmp)))
return nullptr;
tp += strlen(tp);
break;
}
tp += sprintf(tp, "%x", words[i]);
}
/* Was it a trailing run of 0x00's? */
if (best.base != -1 && (best.base + best.len) == (NS_IN6ADDRSZ / NS_INT16SZ))
*tp++ = ':';
*tp++ = '\0';
/*
* Check for overflow, copy, and we're done.
*/
if ((u64)(tp - tmp) > size) {
return nullptr;
}
strcpy(dst, tmp);
return (dst);
}
const char* PS4_SYSV_ABI sceNetInetNtop(int af, const void* src, char* dst, u32 size) {
if (!(src && dst)) {
*sceNetErrnoLoc() = ORBIS_NET_ENOSPC;
LOG_ERROR(Lib_Net, "returned ORBIS_NET_ENOSPC");
return nullptr;
}
const char* returnvalue = nullptr;
switch (af) {
case ORBIS_NET_AF_INET:
returnvalue = freebsd_inet_ntop4((const char*)src, dst, size);
break;
case ORBIS_NET_AF_INET6:
returnvalue = freebsd_inet_ntop6((const char*)src, dst, size);
break;
default:
*sceNetErrnoLoc() = ORBIS_NET_EAFNOSUPPORT;
LOG_ERROR(Lib_Net, "returned ORBIS_NET_EAFNOSUPPORT");
return nullptr;
}
if (returnvalue == nullptr) {
*sceNetErrnoLoc() = ORBIS_NET_ENOSPC;
LOG_ERROR(Lib_Net, "returned ORBIS_NET_ENOSPC");
}
return returnvalue;
}
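An illustrative call (the address bytes and wrapper function are made up): the FreeBSD-derived formatters are now used on every platform instead of the host inet_ntop/InetNtopA.

    void FormatExample() {
        const unsigned char ipv4[4] = {192, 168, 0, 1};
        char text[16]{};
        const char* res = sceNetInetNtop(ORBIS_NET_AF_INET, ipv4, text, sizeof(text));
        // On success res == text and text holds "192.168.0.1"; on failure res is nullptr
        // and the net errno is set to ORBIS_NET_ENOSPC or ORBIS_NET_EAFNOSUPPORT.
    }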
int PS4_SYSV_ABI sceNetInetNtopWithScopeId() {


@ -20,6 +20,10 @@ class SymbolsResolver;
namespace Libraries::Net {
enum OrbisNetFamily : u32 {
ORBIS_NET_AF_INET = 2,
ORBIS_NET_AF_INET6 = 28,
};
enum OrbisNetSocketType : u32 {
ORBIS_NET_SOCK_STREAM = 1,
ORBIS_NET_SOCK_DGRAM = 2,


@ -206,6 +206,10 @@ s32 PS4_SYSV_ABI sceNpTrophyDestroyHandle(OrbisNpTrophyHandle handle) {
if (handle == ORBIS_NP_TROPHY_INVALID_HANDLE)
return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
if (handle >= trophy_handles.size()) {
LOG_ERROR(Lib_NpTrophy, "Invalid handle {}", handle);
return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
}
if (!trophy_handles.is_allocated({static_cast<u32>(handle)})) {
return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
}


@ -8,6 +8,7 @@
#include <magic_enum/magic_enum.hpp>
#include "common/assert.h"
#include "common/config.h"
#include "common/cstring.h"
#include "common/elf_info.h"
#include "common/enum.h"
@ -438,7 +439,7 @@ static Error saveDataMount(const OrbisSaveDataMount2* mount_info,
LOG_INFO(Lib_SaveData, "called with invalid block size");
}
const auto root_save = Common::FS::GetUserPath(Common::FS::PathType::SaveDataDir);
const auto root_save = Config::GetSaveDataPath();
fs::create_directories(root_save);
const auto available = fs::space(root_save).available;


@ -12,6 +12,7 @@
#include "common/thread.h"
#include "core/aerolib/aerolib.h"
#include "core/aerolib/stubs.h"
#include "core/devtools/widget/module_list.h"
#include "core/libraries/kernel/memory.h"
#include "core/libraries/kernel/threads.h"
#include "core/linker.h"
@ -147,6 +148,9 @@ s32 Linker::LoadModule(const std::filesystem::path& elf_name, bool is_dynamic) {
num_static_modules += !is_dynamic;
m_modules.emplace_back(std::move(module));
Core::Devtools::Widget::ModuleList::AddModule(elf_name.filename().string(), elf_name);
return m_modules.size() - 1;
}
@ -325,6 +329,9 @@ bool Linker::Resolve(const std::string& name, Loader::SymbolType sym_type, Modul
}
if (record) {
*return_info = *record;
Core::Devtools::Widget::ModuleList::AddModule(sr.library);
return true;
}


@ -5,6 +5,7 @@
#include "common/assert.h"
#include "common/config.h"
#include "common/debug.h"
#include "core/file_sys/fs.h"
#include "core/libraries/kernel/memory.h"
#include "core/libraries/kernel/orbis_error.h"
#include "core/libraries/kernel/process.h"
@ -181,6 +182,7 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size,
auto& area = CarveDmemArea(mapping_start, size)->second;
area.memory_type = memory_type;
area.is_free = false;
MergeAdjacent(dmem_map, dmem_area);
return mapping_start;
}
@ -214,90 +216,6 @@ void MemoryManager::Free(PAddr phys_addr, size_t size) {
MergeAdjacent(dmem_map, dmem_area);
}
int MemoryManager::PoolReserve(void** out_addr, VAddr virtual_addr, size_t size,
MemoryMapFlags flags, u64 alignment) {
std::scoped_lock lk{mutex};
alignment = alignment > 0 ? alignment : 2_MB;
VAddr min_address = Common::AlignUp(impl.SystemManagedVirtualBase(), alignment);
VAddr mapped_addr = Common::AlignUp(virtual_addr, alignment);
// Fixed mapping means the virtual address must exactly match the provided one.
if (True(flags & MemoryMapFlags::Fixed)) {
// Make sure we're mapping to a valid address
mapped_addr = mapped_addr > min_address ? mapped_addr : min_address;
auto vma = FindVMA(mapped_addr)->second;
size_t remaining_size = vma.base + vma.size - mapped_addr;
// If the VMA is mapped or there's not enough space, unmap the region first.
if (vma.IsMapped() || remaining_size < size) {
UnmapMemoryImpl(mapped_addr, size);
vma = FindVMA(mapped_addr)->second;
}
}
if (False(flags & MemoryMapFlags::Fixed)) {
// When MemoryMapFlags::Fixed is not specified, and mapped_addr is 0,
// search from address 0x200000000 instead.
mapped_addr = mapped_addr == 0 ? 0x200000000 : mapped_addr;
mapped_addr = SearchFree(mapped_addr, size, alignment);
if (mapped_addr == -1) {
// No suitable memory areas to map to
return ORBIS_KERNEL_ERROR_ENOMEM;
}
}
// Add virtual memory area
const auto new_vma_handle = CarveVMA(mapped_addr, size);
auto& new_vma = new_vma_handle->second;
new_vma.disallow_merge = True(flags & MemoryMapFlags::NoCoalesce);
new_vma.prot = MemoryProt::NoAccess;
new_vma.name = "anon";
new_vma.type = VMAType::PoolReserved;
*out_addr = std::bit_cast<void*>(mapped_addr);
return ORBIS_OK;
}
int MemoryManager::Reserve(void** out_addr, VAddr virtual_addr, size_t size, MemoryMapFlags flags,
u64 alignment) {
std::scoped_lock lk{mutex};
virtual_addr = (virtual_addr == 0) ? impl.SystemManagedVirtualBase() : virtual_addr;
alignment = alignment > 0 ? alignment : 16_KB;
VAddr mapped_addr = alignment > 0 ? Common::AlignUp(virtual_addr, alignment) : virtual_addr;
// Fixed mapping means the virtual address must exactly match the provided one.
if (True(flags & MemoryMapFlags::Fixed)) {
auto vma = FindVMA(mapped_addr)->second;
size_t remaining_size = vma.base + vma.size - mapped_addr;
// If the VMA is mapped or there's not enough space, unmap the region first.
if (vma.IsMapped() || remaining_size < size) {
UnmapMemoryImpl(mapped_addr, size);
vma = FindVMA(mapped_addr)->second;
}
}
// Find the first free area starting with provided virtual address.
if (False(flags & MemoryMapFlags::Fixed)) {
mapped_addr = SearchFree(mapped_addr, size, alignment);
if (mapped_addr == -1) {
// No suitable memory areas to map to
return ORBIS_KERNEL_ERROR_ENOMEM;
}
}
// Add virtual memory area
const auto new_vma_handle = CarveVMA(mapped_addr, size);
auto& new_vma = new_vma_handle->second;
new_vma.disallow_merge = True(flags & MemoryMapFlags::NoCoalesce);
new_vma.prot = MemoryProt::NoAccess;
new_vma.name = "anon";
new_vma.type = VMAType::Reserved;
MergeAdjacent(vma_map, new_vma_handle);
*out_addr = std::bit_cast<void*>(mapped_addr);
return ORBIS_OK;
}
int MemoryManager::PoolCommit(VAddr virtual_addr, size_t size, MemoryProt prot) {
std::scoped_lock lk{mutex};
@ -344,14 +262,17 @@ int MemoryManager::PoolCommit(VAddr virtual_addr, size_t size, MemoryProt prot)
void* out_addr = impl.Map(mapped_addr, size, alignment, -1, false);
TRACK_ALLOC(out_addr, size, "VMEM");
if (IsValidGpuMapping(mapped_addr, size)) {
if (prot >= MemoryProt::GpuRead) {
// PS4s only map to GPU memory when the protection includes GPU access.
// If the address to map to is too high, PS4s throw a page fault and crash.
ASSERT_MSG(IsValidGpuMapping(mapped_addr, size), "Invalid address for GPU mapping");
rasterizer->MapMemory(mapped_addr, size);
}
return ORBIS_OK;
}
int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, MemoryProt prot,
s32 MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, u64 size, MemoryProt prot,
MemoryMapFlags flags, VMAType type, std::string_view name,
bool is_exec, PAddr phys_addr, u64 alignment) {
std::scoped_lock lk{mutex};
@ -366,17 +287,18 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M
VAddr mapped_addr = (virtual_addr == 0) ? impl.SystemManagedVirtualBase() : virtual_addr;
// Fixed mapping means the virtual address must exactly match the provided one.
if (True(flags & MemoryMapFlags::Fixed)) {
// On a PS4, the Fixed flag is ignored if address 0 is provided.
if (True(flags & MemoryMapFlags::Fixed) && virtual_addr != 0) {
auto vma = FindVMA(mapped_addr)->second;
size_t remaining_size = vma.base + vma.size - mapped_addr;
// There's a possible edge case where we're mapping to a partially reserved range.
// To account for this, unmap any reserved areas within this mapping range first.
auto unmap_addr = mapped_addr;
auto unmap_size = size;
// If flag NoOverwrite is provided, don't overwrite mapped VMAs.
// When it isn't provided, VMAs can be overwritten regardless of if they're mapped.
while ((False(flags & MemoryMapFlags::NoOverwrite) || !vma.IsMapped()) &&
unmap_addr < mapped_addr + size && remaining_size < size) {
unmap_addr < mapped_addr + size) {
auto unmapped = UnmapBytesFromEntry(unmap_addr, vma, unmap_size);
unmap_addr += unmapped;
unmap_size -= unmapped;
@ -384,51 +306,69 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M
}
vma = FindVMA(mapped_addr)->second;
remaining_size = vma.base + vma.size - mapped_addr;
auto remaining_size = vma.base + vma.size - mapped_addr;
if (vma.IsMapped() || remaining_size < size) {
LOG_ERROR(Kernel_Vmm, "Unable to map {:#x} bytes at address {:#x}", size, mapped_addr);
return ORBIS_KERNEL_ERROR_ENOMEM;
}
}
// Find the first free area starting with provided virtual address.
if (False(flags & MemoryMapFlags::Fixed)) {
// Provided address needs to be aligned before we can map.
} else {
// When MemoryMapFlags::Fixed is not specified, and mapped_addr is 0,
// search from address 0x200000000 instead.
alignment = alignment > 0 ? alignment : 16_KB;
mapped_addr = SearchFree(Common::AlignUp(mapped_addr, alignment), size, alignment);
mapped_addr = virtual_addr == 0 ? 0x200000000 : mapped_addr;
mapped_addr = SearchFree(mapped_addr, size, alignment);
if (mapped_addr == -1) {
// No suitable memory areas to map to
return ORBIS_KERNEL_ERROR_ENOMEM;
}
}
// Perform the mapping.
*out_addr = impl.Map(mapped_addr, size, alignment, phys_addr, is_exec);
TRACK_ALLOC(*out_addr, size, "VMEM");
// Create a memory area representing this mapping.
const auto new_vma_handle = CarveVMA(mapped_addr, size);
auto& new_vma = new_vma_handle->second;
auto& new_vma = CarveVMA(mapped_addr, size)->second;
new_vma.disallow_merge = True(flags & MemoryMapFlags::NoCoalesce);
new_vma.prot = prot;
new_vma.name = name;
new_vma.type = type;
new_vma.is_exec = is_exec;
if (type == VMAType::Direct) {
new_vma.phys_base = phys_addr;
}
// If type is Flexible, we need to track how much flexible memory is used here.
if (type == VMAType::Flexible) {
flexible_usage += size;
}
if (IsValidGpuMapping(mapped_addr, size)) {
new_vma.disallow_merge = True(flags & MemoryMapFlags::NoCoalesce);
new_vma.prot = prot;
new_vma.name = name;
new_vma.type = type;
new_vma.phys_base = phys_addr == -1 ? 0 : phys_addr;
new_vma.is_exec = is_exec;
if (type == VMAType::Reserved) {
// Technically this should be done for direct and flexible mappings too,
// but some Windows-specific limitations make that hard to accomplish.
MergeAdjacent(vma_map, new_vma_handle);
}
if (prot >= MemoryProt::GpuRead) {
// PS4s only map to GPU memory when the protection includes GPU access.
// If the address to map to is too high, PS4s throw a page fault and crash.
ASSERT_MSG(IsValidGpuMapping(mapped_addr, size), "Invalid address for GPU mapping");
rasterizer->MapMemory(mapped_addr, size);
}
if (type == VMAType::Reserved || type == VMAType::PoolReserved) {
// For Reserved/PoolReserved mappings, we don't perform any address space allocations.
// Just set out_addr to mapped_addr instead.
*out_addr = std::bit_cast<void*>(mapped_addr);
} else {
// Type is either Direct, Flexible, or Code, these need to be mapped in our address space.
*out_addr = impl.Map(mapped_addr, size, alignment, phys_addr, is_exec);
}
TRACK_ALLOC(*out_addr, size, "VMEM");
return ORBIS_OK;
}
int MemoryManager::MapFile(void** out_addr, VAddr virtual_addr, size_t size, MemoryProt prot,
MemoryMapFlags flags, uintptr_t fd, size_t offset) {
s32 MemoryManager::MapFile(void** out_addr, VAddr virtual_addr, u64 size, MemoryProt prot,
MemoryMapFlags flags, s32 fd, s64 phys_addr) {
auto* h = Common::Singleton<Core::FileSys::HandleTable>::Instance();
VAddr mapped_addr = (virtual_addr == 0) ? impl.SystemManagedVirtualBase() : virtual_addr;
const size_t size_aligned = Common::AlignUp(size, 16_KB);
@ -449,8 +389,19 @@ int MemoryManager::MapFile(void** out_addr, VAddr virtual_addr, size_t size, Mem
vma.base, vma.base + vma.size, virtual_addr, virtual_addr + size);
}
// Map the file.
impl.MapFile(mapped_addr, size_aligned, offset, std::bit_cast<u32>(prot), fd);
// Get the file to map
auto file = h->GetFile(fd);
if (file == nullptr) {
return ORBIS_KERNEL_ERROR_EBADF;
}
const auto handle = file->f.GetFileMapping();
impl.MapFile(mapped_addr, size_aligned, phys_addr, std::bit_cast<u32>(prot), handle);
if (prot >= MemoryProt::GpuRead) {
ASSERT_MSG(false, "Files cannot be mapped to GPU memory");
}
// Add virtual memory area
auto& new_vma = CarveVMA(mapped_addr, size_aligned)->second;
@ -478,6 +429,7 @@ s32 MemoryManager::PoolDecommit(VAddr virtual_addr, size_t size) {
const bool is_exec = vma_base.is_exec;
const auto start_in_vma = virtual_addr - vma_base_addr;
const auto type = vma_base.type;
const auto prot = vma_base.prot;
if (type != VMAType::PoolReserved && type != VMAType::Pooled) {
LOG_ERROR(Kernel_Vmm, "Attempting to decommit non-pooled memory!");
@ -489,7 +441,8 @@ s32 MemoryManager::PoolDecommit(VAddr virtual_addr, size_t size) {
pool_budget += size;
}
if (IsValidGpuMapping(virtual_addr, size)) {
if (prot >= MemoryProt::GpuRead) {
// If this mapping has GPU access, unmap from GPU.
rasterizer->UnmapMemory(virtual_addr, size);
}
@ -528,6 +481,7 @@ u64 MemoryManager::UnmapBytesFromEntry(VAddr virtual_addr, VirtualMemoryArea vma
const auto adjusted_size =
vma_base_size - start_in_vma < size ? vma_base_size - start_in_vma : size;
const bool has_backing = type == VMAType::Direct || type == VMAType::File;
const auto prot = vma_base.prot;
if (type == VMAType::Free) {
return adjusted_size;
@ -536,8 +490,9 @@ u64 MemoryManager::UnmapBytesFromEntry(VAddr virtual_addr, VirtualMemoryArea vma
flexible_usage -= adjusted_size;
}
if (IsValidGpuMapping(virtual_addr, adjusted_size)) {
rasterizer->UnmapMemory(virtual_addr, adjusted_size);
if (prot >= MemoryProt::GpuRead) {
// If this mapping has GPU access, unmap from GPU.
rasterizer->UnmapMemory(virtual_addr, size);
}
// Mark region as free and attempt to coalesce it with neighbours.
@ -605,8 +560,8 @@ s64 MemoryManager::ProtectBytes(VAddr addr, VirtualMemoryArea vma_base, size_t s
vma_base.size - start_in_vma < size ? vma_base.size - start_in_vma : size;
if (vma_base.type == VMAType::Free) {
LOG_ERROR(Kernel_Vmm, "Cannot change protection on free memory region");
return ORBIS_KERNEL_ERROR_EINVAL;
// On PS4, protecting freed memory does nothing.
return adjusted_size;
}
// Validate protection flags
@ -621,6 +576,18 @@ s64 MemoryManager::ProtectBytes(VAddr addr, VirtualMemoryArea vma_base, size_t s
return ORBIS_KERNEL_ERROR_EINVAL;
}
if (vma_base.prot < MemoryProt::GpuRead && prot >= MemoryProt::GpuRead) {
// New protection will give the GPU access to this VMA, perform a rasterizer map
ASSERT_MSG(IsValidGpuMapping(addr, size), "Invalid address for GPU mapping");
rasterizer->MapMemory(addr, size);
}
if (vma_base.prot >= MemoryProt::GpuRead && prot < MemoryProt::GpuRead) {
// New protection will remove the GPU's access to this VMA, perform a rasterizer unmap
ASSERT_MSG(IsValidGpuMapping(addr, size), "Invalid address for GPU unmap");
rasterizer->UnmapMemory(addr, size);
}
// Change protection
vma_base.prot = prot;
@ -798,12 +765,31 @@ s32 MemoryManager::SetDirectMemoryType(s64 phys_addr, s32 memory_type) {
return ORBIS_OK;
}
void MemoryManager::NameVirtualRange(VAddr virtual_addr, size_t size, std::string_view name) {
auto it = FindVMA(virtual_addr);
void MemoryManager::NameVirtualRange(VAddr virtual_addr, u64 size, std::string_view name) {
// Sizes are aligned up to the nearest 16_KB
auto aligned_size = Common::AlignUp(size, 16_KB);
// Addresses are aligned down to the nearest 16_KB
auto aligned_addr = Common::AlignDown(virtual_addr, 16_KB);
ASSERT_MSG(it->second.Contains(virtual_addr, size),
"Range provided is not fully contained in vma");
it->second.name = name;
auto it = FindVMA(aligned_addr);
s64 remaining_size = aligned_size;
auto current_addr = aligned_addr;
while (remaining_size > 0) {
// Nothing needs to be done for free VMAs
if (!it->second.IsFree()) {
if (remaining_size < it->second.size) {
// We should split VMAs here, but this could cause trouble for Windows.
// Instead log a warning and name the whole VMA.
// it = CarveVMA(current_addr, remaining_size);
LOG_WARNING(Kernel_Vmm, "Trying to partially name a range");
}
auto& vma = it->second;
vma.name = name;
}
remaining_size -= it->second.size;
current_addr += it->second.size;
it = FindVMA(current_addr);
}
}
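A worked example of the new alignment behaviour (the addresses and helper function are made up): naming 0x1234 bytes starting at 0x1004200 touches every 16 KiB page underneath the request, since the size is aligned up and the address is aligned down before walking the VMAs.

    void NameExample() {
        //   aligned_addr = AlignDown(0x1004200, 16_KB) = 0x1004000
        //   aligned_size = AlignUp(0x1234, 16_KB)      = 0x4000
        // so every non-free VMA overlapping [0x1004000, 0x1008000) receives the name.
        auto* memory = Core::Memory::Instance();
        memory->NameVirtualRange(0x1004200, 0x1234, "example_range");
    }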
void MemoryManager::InvalidateMemory(const VAddr addr, const u64 size) const {
@ -824,6 +810,8 @@ VAddr MemoryManager::SearchFree(VAddr virtual_addr, size_t size, u32 alignment)
ASSERT_MSG(virtual_addr <= max_search_address, "Input address {:#x} is out of bounds",
virtual_addr);
// Align up the virtual_addr first.
virtual_addr = Common::AlignUp(virtual_addr, alignment);
auto it = FindVMA(virtual_addr);
// If the VMA is free and contains the requested mapping we are done.


@ -183,20 +183,14 @@ public:
void Free(PAddr phys_addr, size_t size);
int PoolReserve(void** out_addr, VAddr virtual_addr, size_t size, MemoryMapFlags flags,
u64 alignment = 0);
int Reserve(void** out_addr, VAddr virtual_addr, size_t size, MemoryMapFlags flags,
u64 alignment = 0);
int PoolCommit(VAddr virtual_addr, size_t size, MemoryProt prot);
int MapMemory(void** out_addr, VAddr virtual_addr, size_t size, MemoryProt prot,
s32 MapMemory(void** out_addr, VAddr virtual_addr, u64 size, MemoryProt prot,
MemoryMapFlags flags, VMAType type, std::string_view name = "anon",
bool is_exec = false, PAddr phys_addr = -1, u64 alignment = 0);
int MapFile(void** out_addr, VAddr virtual_addr, size_t size, MemoryProt prot,
MemoryMapFlags flags, uintptr_t fd, size_t offset);
s32 MapFile(void** out_addr, VAddr virtual_addr, u64 size, MemoryProt prot,
MemoryMapFlags flags, s32 fd, s64 phys_addr);
s32 PoolDecommit(VAddr virtual_addr, size_t size);
@ -221,7 +215,7 @@ public:
s32 SetDirectMemoryType(s64 phys_addr, s32 memory_type);
void NameVirtualRange(VAddr virtual_addr, size_t size, std::string_view name);
void NameVirtualRange(VAddr virtual_addr, u64 size, std::string_view name);
void InvalidateMemory(VAddr addr, u64 size) const;


@ -25,6 +25,7 @@
#include "common/polyfill_thread.h"
#include "common/scm_rev.h"
#include "common/singleton.h"
#include "core/devtools/widget/module_list.h"
#include "core/file_format/psf.h"
#include "core/file_format/trp.h"
#include "core/file_sys/fs.h"
@ -188,6 +189,8 @@ void Emulator::Run(const std::filesystem::path& file, const std::vector<std::str
game_info.splash_path = pic1_path;
}
game_info.game_folder = game_folder;
std::string game_title = fmt::format("{} - {} <{}>", id, title, app_version);
std::string window_title = "";
std::string remote_url(Common::g_scm_remote_url);


@ -156,11 +156,9 @@ public:
}
if (selected == openSaveDataFolder) {
QString userPath;
Common::FS::PathToQString(userPath,
Common::FS::GetUserPath(Common::FS::PathType::UserDir));
QString saveDataPath =
userPath + "/savedata/1/" + QString::fromStdString(m_games[itemID].save_dir);
QString saveDataPath;
Common::FS::PathToQString(saveDataPath,
Config::GetSaveDataPath() / "1" / m_games[itemID].save_dir);
QDir(saveDataPath).mkpath(saveDataPath);
QDesktopServices::openUrl(QUrl::fromLocalFile(saveDataPath));
}
@ -485,8 +483,7 @@ public:
dlc_path, Config::getAddonInstallDir() /
Common::FS::PathFromQString(folder_path).parent_path().filename());
Common::FS::PathToQString(save_data_path,
Common::FS::GetUserPath(Common::FS::PathType::UserDir) /
"savedata/1" / m_games[itemID].serial);
Config::GetSaveDataPath() / "1" / m_games[itemID].save_dir);
Common::FS::PathToQString(trophy_data_path,
Common::FS::GetUserPath(Common::FS::PathType::MetaDataDir) /


@ -26,7 +26,7 @@
</message>
<message>
<source>Cheats/Patches are experimental.\nUse with caution.\n\nDownload cheats individually by selecting the repository and clicking the download button.\nIn the Patches tab, you can download all patches at once, choose which ones you want to use, and save your selection.\n\nSince we do not develop the Cheats/Patches,\nplease report issues to the cheat author.\n\nCreated a new cheat? Visit:\n</source>
<translation>Los cheats/patches son experimentales.\nÚselos con precaución.\n\nDescargue los cheats individualmente seleccionando el repositorio y haciendo clic en el botón de descarga.\nEn la pestaña Patches, puede descargar todos los patches a la vez, elegir cuáles desea usar y guardar la selección.\n\nComo no desarrollamos los Cheats/Patches,\npor favor informe los problemas al autor del cheat.\n\n¿Creaste un nuevo cheat? Visita:\n</translation>
<translation>Los trucos/parches son experimentales.\nÚselos con precaución.\n\nPuede descargar cada truco seleccionando el repositorio y haciendo clic en el botón de descarga.\nEn la pestaña Parches podrá descargar todos los parches a la vez, elegir cuáles desea usar y guardar la selección.\n\nComo no desarrollamos los trucos/parches,\ndebe informar de cualquier problema a sus autores correspondientes.\n\n¿Creaste un truco nuevo? Visita:\n</translation>
</message>
<message>
<source>No Image Available</source>
@ -2048,7 +2048,7 @@
</message>
<message>
<source> * Unsupported Vulkan Version</source>
<translation type="unfinished"> * Unsupported Vulkan Version</translation>
<translation> * Versión de Vulkan no soportada</translation>
</message>
</context>
<context>

View file

@ -2048,7 +2048,7 @@
</message>
<message>
<source> * Unsupported Vulkan Version</source>
<translation type="unfinished"> * Unsupported Vulkan Version</translation>
<translation> * Desteklenmeyen Vulkan Sürümü</translation>
</message>
</context>
<context>

View file

@ -154,6 +154,7 @@ void Traverse(EmitContext& ctx, const IR::Program& program) {
for (IR::Inst& inst : node.data.block->Instructions()) {
EmitInst(ctx, &inst);
}
ctx.first_to_last_label_map[label.value] = ctx.last_label;
break;
}
case IR::AbstractSyntaxNode::Type::If: {
@ -298,6 +299,10 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
if (stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) {
ctx.AddCapability(spv::Capability::Tessellation);
}
if (info.dma_types != IR::Type::Void) {
ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
ctx.AddExtension("SPV_KHR_physical_storage_buffer");
}
}
void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) {
@ -387,7 +392,7 @@ void SetupFloatMode(EmitContext& ctx, const Profile& profile, const RuntimeInfo&
void PatchPhiNodes(const IR::Program& program, EmitContext& ctx) {
auto inst{program.blocks.front()->begin()};
size_t block_index{0};
ctx.PatchDeferredPhi([&](size_t phi_arg) {
ctx.PatchDeferredPhi([&](u32 phi_arg, Id first_parent) {
if (phi_arg == 0) {
++inst;
if (inst == program.blocks[block_index]->end() ||
@ -398,7 +403,9 @@ void PatchPhiNodes(const IR::Program& program, EmitContext& ctx) {
} while (inst->GetOpcode() != IR::Opcode::Phi);
}
}
return ctx.Def(inst->Arg(phi_arg));
const Id arg = ctx.Def(inst->Arg(phi_arg));
const Id parent = ctx.first_to_last_label_map[first_parent.value];
return std::make_pair(arg, parent);
});
}
} // Anonymous namespace

View file

@ -60,7 +60,7 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id
address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
}
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
const auto [scope, semantics]{AtomicArgs(ctx)};
return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] {
@ -257,7 +257,7 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co
Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
const auto& buffer = ctx.buffers[binding];
const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicIIncrement(ctx.U32[1], ptr, scope, semantics);
@ -265,7 +265,7 @@ Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding) {
Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding) {
const auto& buffer = ctx.buffers[binding];
const auto [id, pointer_type] = buffer[EmitContext::BufferAlias::U32];
const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(gds_addr));
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicIDecrement(ctx.U32[1], ptr, scope, semantics);

View file

@ -161,33 +161,37 @@ void EmitGetGotoVariable(EmitContext&) {
UNREACHABLE_MSG("Unreachable instruction");
}
using BufferAlias = EmitContext::BufferAlias;
using PointerType = EmitContext::PointerType;
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) {
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
const u32 flatbuf_off_dw = inst->Flags<u32>();
const auto& srt_flatbuf = ctx.buffers.back();
ASSERT(srt_flatbuf.binding >= 0 && flatbuf_off_dw > 0 &&
srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
LOG_DEBUG(Render_Recompiler, "ReadConst from flatbuf dword {}", flatbuf_off_dw);
const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
const Id ptr{
ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, ctx.ConstU32(flatbuf_off_dw))};
return ctx.OpLoad(ctx.U32[1], ptr);
// We can only provide a fallback for immediate offsets.
if (flatbuf_off_dw == 0) {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
} else {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
ctx.ConstU32(flatbuf_off_dw));
}
}
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
template <PointerType type>
Id ReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
const auto& buffer = ctx.buffers[handle];
index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords);
const auto [id, pointer_type] = buffer[BufferAlias::U32];
const auto [id, pointer_type] = buffer[type];
const auto value_type = type == PointerType::U32 ? ctx.U32[1] : ctx.F32[1];
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
const Id result{ctx.OpLoad(ctx.U32[1], ptr)};
const Id result{ctx.OpLoad(value_type, ptr)};
if (Sirit::ValidId(buffer.size_dwords)) {
const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer.size_dwords);
return ctx.OpSelect(ctx.U32[1], in_bounds, result, ctx.u32_zero_value);
} else {
return result;
return ctx.OpSelect(value_type, in_bounds, result, ctx.u32_zero_value);
}
return result;
}
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
return ReadConstBuffer<PointerType::U32>(ctx, handle, index);
}
Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
@ -246,7 +250,7 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
ctx.OpUDiv(ctx.U32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id), step_rate),
ctx.ConstU32(param.num_components)),
ctx.ConstU32(comp));
return EmitReadConstBuffer(ctx, param.buffer_handle, offset);
return ReadConstBuffer<PointerType::F32>(ctx, param.buffer_handle, offset);
}
Id result;
@ -432,7 +436,7 @@ static Id EmitLoadBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size,
return result;
}
template <u32 N, BufferAlias alias>
template <u32 N, PointerType alias>
static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
const auto flags = inst->Flags<IR::BufferInstInfo>();
const auto& spv_buffer = ctx.buffers[handle];
@ -440,7 +444,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32;
const auto [id, pointer_type] = spv_buffer[alias];
boost::container::static_vector<Id, N> ids;
@ -451,7 +455,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
if (!flags.typed) {
// Untyped loads have bounds checking per-component.
ids.push_back(EmitLoadBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords,
result_i, alias == BufferAlias::F32));
result_i, alias == PointerType::F32));
} else {
ids.push_back(result_i);
}
@ -461,7 +465,7 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
if (flags.typed) {
// Typed loads have single bounds check for the whole load.
return EmitLoadBufferBoundsCheck<N>(ctx, index, spv_buffer.size_dwords, result,
alias == BufferAlias::F32);
alias == PointerType::F32);
}
return result;
}
@ -471,7 +475,7 @@ Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
const auto [id, pointer_type] = spv_buffer[PointerType::U8];
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr))};
return EmitLoadBufferBoundsCheck<1>(ctx, address, spv_buffer.size, result, false);
@ -482,7 +486,7 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
const auto [id, pointer_type] = spv_buffer[PointerType::U16];
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr))};
@ -490,35 +494,35 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
}
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<1, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<1, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<2, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<2, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<3, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<3, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<4, BufferAlias::U32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<1, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<2, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<2, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<3, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<3, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
return EmitLoadBufferB32xN<4, BufferAlias::F32>(ctx, inst, handle, address);
return EmitLoadBufferB32xN<4, PointerType::F32>(ctx, inst, handle, address);
}
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
@ -548,7 +552,7 @@ void EmitStoreBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto
emit_func();
}
template <u32 N, BufferAlias alias>
template <u32 N, PointerType alias>
static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
Id value) {
const auto flags = inst->Flags<IR::BufferInstInfo>();
@ -557,7 +561,7 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
const auto& data_types = alias == BufferAlias::U32 ? ctx.U32 : ctx.F32;
const auto& data_types = alias == PointerType::U32 ? ctx.U32 : ctx.F32;
const auto [id, pointer_type] = spv_buffer[alias];
auto store = [&] {
@ -588,7 +592,7 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U8];
const auto [id, pointer_type] = spv_buffer[PointerType::U8];
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
const Id result{ctx.OpUConvert(ctx.U8, value)};
EmitStoreBufferBoundsCheck<1>(ctx, address, spv_buffer.size, [&] { ctx.OpStore(ptr, result); });
@ -599,7 +603,7 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id
if (Sirit::ValidId(spv_buffer.offset)) {
address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
}
const auto [id, pointer_type] = spv_buffer[BufferAlias::U16];
const auto [id, pointer_type] = spv_buffer[PointerType::U16];
const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
const Id result{ctx.OpUConvert(ctx.U16, value)};
@ -608,35 +612,35 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id
}
void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<1, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<1, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<2, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<2, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<3, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<3, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<4, BufferAlias::U32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<1, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<2, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<2, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<3, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<3, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
EmitStoreBufferB32xN<4, BufferAlias::F32>(ctx, inst, handle, address, value);
EmitStoreBufferB32xN<4, PointerType::F32>(ctx, inst, handle, address, value);
}
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {

View file

@ -154,7 +154,7 @@ Id EmitFPRecip32(EmitContext& ctx, Id value) {
}
Id EmitFPRecip64(EmitContext& ctx, Id value) {
return ctx.OpFDiv(ctx.F64[1], ctx.Constant(ctx.F64[1], 1.0f), value);
return ctx.OpFDiv(ctx.F64[1], ctx.Constant(ctx.F64[1], f64{1.0}), value);
}
Id EmitFPRecipSqrt32(EmitContext& ctx, Id value) {

View file

@ -61,7 +61,7 @@ void EmitSetVectorRegister(EmitContext& ctx);
void EmitSetGotoVariable(EmitContext& ctx);
void EmitGetGotoVariable(EmitContext& ctx);
void EmitSetScc(EmitContext& ctx);
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst);
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset);
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);

View file

@ -7,6 +7,7 @@
#include "shader_recompiler/frontend/fetch_shader.h"
#include "shader_recompiler/runtime_info.h"
#include "video_core/amdgpu/types.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include <boost/container/static_vector.hpp>
#include <fmt/format.h>
@ -70,6 +71,12 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
Bindings& binding_)
: Sirit::Module(profile_.supported_spirv), info{info_}, runtime_info{runtime_info_},
profile{profile_}, stage{info.stage}, l_stage{info.l_stage}, binding{binding_} {
if (info.dma_types != IR::Type::Void) {
SetMemoryModel(spv::AddressingModel::PhysicalStorageBuffer64, spv::MemoryModel::GLSL450);
} else {
SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450);
}
AddCapability(spv::Capability::Shader);
DefineArithmeticTypes();
DefineInterfaces();
@ -137,9 +144,13 @@ void EmitContext::DefineArithmeticTypes() {
true_value = ConstantTrue(U1[1]);
false_value = ConstantFalse(U1[1]);
u8_one_value = Constant(U8, 1U);
u8_zero_value = Constant(U8, 0U);
u32_one_value = ConstU32(1U);
u32_zero_value = ConstU32(0U);
f32_zero_value = ConstF32(0.0f);
u64_one_value = Constant(U64, 1ULL);
u64_zero_value = Constant(U64, 0ULL);
pi_x2 = ConstF32(2.0f * float{std::numbers::pi});
@ -157,6 +168,35 @@ void EmitContext::DefineArithmeticTypes() {
if (info.uses_fp64) {
frexp_result_f64 = Name(TypeStruct(F64[1], S32[1]), "frexp_result_f64");
}
if (True(info.dma_types & IR::Type::F64)) {
physical_pointer_types[PointerType::F64] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F64[1]);
}
if (True(info.dma_types & IR::Type::U64)) {
physical_pointer_types[PointerType::U64] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U64);
}
if (True(info.dma_types & IR::Type::F32)) {
physical_pointer_types[PointerType::F32] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F32[1]);
}
if (True(info.dma_types & IR::Type::U32)) {
physical_pointer_types[PointerType::U32] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U32[1]);
}
if (True(info.dma_types & IR::Type::F16)) {
physical_pointer_types[PointerType::F16] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, F16[1]);
}
if (True(info.dma_types & IR::Type::U16)) {
physical_pointer_types[PointerType::U16] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U16);
}
if (True(info.dma_types & IR::Type::U8)) {
physical_pointer_types[PointerType::U8] =
TypePointer(spv::StorageClass::PhysicalStorageBuffer, U8);
}
}
void EmitContext::DefineInterfaces() {
@ -195,9 +235,10 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f
}
Id EmitContext::GetBufferSize(const u32 sharp_idx) {
const auto& srt_flatbuf = buffers.back();
ASSERT(srt_flatbuf.buffer_type == BufferType::ReadConstUbo);
const auto [id, pointer_type] = srt_flatbuf[BufferAlias::U32];
// Can this be done with memory access? Like we do now with ReadConst
const auto& srt_flatbuf = buffers[flatbuf_index];
ASSERT(srt_flatbuf.buffer_type == BufferType::Flatbuf);
const auto [id, pointer_type] = srt_flatbuf[PointerType::U32];
const auto rsrc1{
OpLoad(U32[1], OpAccessChain(pointer_type, id, u32_zero_value, ConstU32(sharp_idx + 1)))};
@ -690,8 +731,14 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
case Shader::BufferType::GdsBuffer:
Name(id, "gds_buffer");
break;
case Shader::BufferType::ReadConstUbo:
Name(id, "srt_flatbuf_ubo");
case Shader::BufferType::Flatbuf:
Name(id, "srt_flatbuf");
break;
case Shader::BufferType::BdaPagetable:
Name(id, "bda_pagetable");
break;
case Shader::BufferType::FaultBuffer:
Name(id, "fault_buffer");
break;
case Shader::BufferType::SharedMemory:
Name(id, "ssbo_shmem");
@ -705,35 +752,53 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
};
void EmitContext::DefineBuffers() {
if (!profile.supports_robust_buffer_access && !info.has_readconst) {
// In case ReadConstUbo has not already been bound by IR and is needed
if (!profile.supports_robust_buffer_access &&
info.readconst_types == Info::ReadConstType::None) {
// In case Flatbuf has not already been bound by IR and is needed
// to query buffer sizes, bind it now.
info.buffers.push_back({
.used_types = IR::Type::U32,
.inline_cbuf = AmdGpu::Buffer::Null(),
.buffer_type = BufferType::ReadConstUbo,
// We can't guarantee that flatbuf will not grow past UBO
// limit if there are a lot of ReadConsts. (We could specialize)
.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
.buffer_type = BufferType::Flatbuf,
});
// In the future we may want to read buffer sizes from GPU memory if available.
// info.readconst_types |= Info::ReadConstType::Immediate;
}
for (const auto& desc : info.buffers) {
const auto buf_sharp = desc.GetSharp(info);
const bool is_storage = desc.IsStorage(buf_sharp, profile);
// Set indexes for special buffers.
if (desc.buffer_type == BufferType::Flatbuf) {
flatbuf_index = buffers.size();
} else if (desc.buffer_type == BufferType::BdaPagetable) {
bda_pagetable_index = buffers.size();
} else if (desc.buffer_type == BufferType::FaultBuffer) {
fault_buffer_index = buffers.size();
}
// Define aliases depending on the shader usage.
auto& spv_buffer = buffers.emplace_back(binding.buffer++, desc.buffer_type);
if (True(desc.used_types & IR::Type::U64)) {
spv_buffer[PointerType::U64] =
DefineBuffer(is_storage, desc.is_written, 3, desc.buffer_type, U64);
}
if (True(desc.used_types & IR::Type::U32)) {
spv_buffer[BufferAlias::U32] =
spv_buffer[PointerType::U32] =
DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, U32[1]);
}
if (True(desc.used_types & IR::Type::F32)) {
spv_buffer[BufferAlias::F32] =
spv_buffer[PointerType::F32] =
DefineBuffer(is_storage, desc.is_written, 2, desc.buffer_type, F32[1]);
}
if (True(desc.used_types & IR::Type::U16)) {
spv_buffer[BufferAlias::U16] =
spv_buffer[PointerType::U16] =
DefineBuffer(is_storage, desc.is_written, 1, desc.buffer_type, U16);
}
if (True(desc.used_types & IR::Type::U8)) {
spv_buffer[BufferAlias::U8] =
spv_buffer[PointerType::U8] =
DefineBuffer(is_storage, desc.is_written, 0, desc.buffer_type, U8);
}
++binding.unified;
@ -1003,6 +1068,101 @@ Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_vie
return func;
}
Id EmitContext::DefineGetBdaPointer() {
const auto caching_pagebits{
Constant(U64, static_cast<u64>(VideoCore::BufferCache::CACHING_PAGEBITS))};
const auto caching_pagemask{Constant(U64, VideoCore::BufferCache::CACHING_PAGESIZE - 1)};
const auto func_type{TypeFunction(U64, U64)};
const auto func{OpFunction(U64, spv::FunctionControlMask::MaskNone, func_type)};
const auto address{OpFunctionParameter(U64)};
Name(func, "get_bda_pointer");
AddLabel();
const auto fault_label{OpLabel()};
const auto available_label{OpLabel()};
const auto merge_label{OpLabel()};
// Get page BDA
const auto page{OpShiftRightLogical(U64, address, caching_pagebits)};
const auto page32{OpUConvert(U32[1], page)};
const auto& bda_buffer{buffers[bda_pagetable_index]};
const auto [bda_buffer_id, bda_pointer_type] = bda_buffer[PointerType::U64];
const auto bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)};
const auto bda{OpLoad(U64, bda_ptr)};
// Check if page is GPU cached
const auto is_fault{OpIEqual(U1[1], bda, u64_zero_value)};
OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(is_fault, fault_label, available_label);
// First time access, mark as fault
AddLabel(fault_label);
const auto& fault_buffer{buffers[fault_buffer_index]};
const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
const auto page_div8{OpShiftRightLogical(U32[1], page32, ConstU32(3U))};
const auto page_mod8{OpBitwiseAnd(U32[1], page32, ConstU32(7U))};
const auto page_mask{OpShiftLeftLogical(U8, u8_one_value, page_mod8)};
const auto fault_ptr{
OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div8)};
const auto fault_value{OpLoad(U8, fault_ptr)};
const auto fault_value_masked{OpBitwiseOr(U8, fault_value, page_mask)};
OpStore(fault_ptr, fault_value_masked);
// Return null pointer
const auto fallback_result{u64_zero_value};
OpBranch(merge_label);
// Value is available, compute address
AddLabel(available_label);
const auto offset_in_bda{OpBitwiseAnd(U64, address, caching_pagemask)};
const auto addr{OpIAdd(U64, bda, offset_in_bda)};
OpBranch(merge_label);
// Merge
AddLabel(merge_label);
const auto result{OpPhi(U64, addr, available_label, fallback_result, fault_label)};
OpReturnValue(result);
OpFunctionEnd();
return func;
}
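For readers skimming the diff, a conceptual host-side model of what get_bda_pointer computes per access may help. This is a sketch only: bda_pagetable and fault_buffer stand in for the two GPU buffers, the table size is an assumption, and u64/u8 are the project's aliases from common/types.h.

#include <array>
#include "common/types.h"

// Conceptual model of the SPIR-V helper above; not part of the emitter.
static std::array<u64, 1 << 20> bda_pagetable{};        // one entry per cached page (size assumed)
static std::array<u8, (1 << 20) / 8> fault_buffer{};    // one bit per cached page

u64 GetBdaPointerModel(u64 guest_addr, u32 page_bits, u64 page_size) {
    const u64 page = guest_addr >> page_bits;            // CACHING_PAGEBITS in the real code
    const u64 bda = bda_pagetable[page];
    if (bda == 0) {                                      // page is not GPU-resident yet
        fault_buffer[page / 8] |= u8(1u << (page % 8));  // record a first-touch fault
        return 0;                                        // caller takes the fallback path
    }
    return bda + (guest_addr & (page_size - 1));         // device address of the requested byte
}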
Id EmitContext::DefineReadConst(bool dynamic) {
const auto func_type{!dynamic ? TypeFunction(U32[1], U32[2], U32[1], U32[1])
: TypeFunction(U32[1], U32[2], U32[1])};
const auto func{OpFunction(U32[1], spv::FunctionControlMask::MaskNone, func_type)};
const auto base{OpFunctionParameter(U32[2])};
const auto offset{OpFunctionParameter(U32[1])};
const auto flatbuf_offset{!dynamic ? OpFunctionParameter(U32[1]) : Id{}};
Name(func, dynamic ? "read_const_dynamic" : "read_const");
AddLabel();
const auto base_lo{OpUConvert(U64, OpCompositeExtract(U32[1], base, 0))};
const auto base_hi{OpUConvert(U64, OpCompositeExtract(U32[1], base, 1))};
const auto base_shift{OpShiftLeftLogical(U64, base_hi, ConstU32(32U))};
const auto base_addr{OpBitwiseOr(U64, base_lo, base_shift)};
const auto offset_bytes{OpShiftLeftLogical(U32[1], offset, ConstU32(2U))};
const auto addr{OpIAdd(U64, base_addr, OpUConvert(U64, offset_bytes))};
const auto result = EmitMemoryRead(U32[1], addr, [&]() {
if (dynamic) {
return u32_zero_value;
} else {
const auto& flatbuf_buffer{buffers[flatbuf_index]};
ASSERT(flatbuf_buffer.binding >= 0 &&
flatbuf_buffer.buffer_type == BufferType::Flatbuf);
const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value,
flatbuf_offset)};
return OpLoad(U32[1], ptr);
}
});
OpReturnValue(result);
OpFunctionEnd();
return func;
}
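Likewise, the two read_const helpers boil down to the scalar logic below. This is a hedged model, not the emitted code: the real DMA load goes through OpConvertUToPtr on a PhysicalStorageBuffer pointer, LoadDwordAt is a hypothetical stand-in for that load, and flatbuf represents the CPU-uploaded user-data snapshot.

// Builds on GetBdaPointerModel from the sketch above.
u32 LoadDwordAt(u64 device_addr);  // hypothetical: dereference a GPU device address

u32 ReadConstModel(u32 base_lo, u32 base_hi, u32 offset_dw, const u32* flatbuf,
                   u32 flatbuf_off_dw, bool dynamic, u32 page_bits, u64 page_size) {
    const u64 addr = ((u64(base_hi) << 32) | base_lo) + u64(offset_dw) * sizeof(u32);
    if (const u64 bda = GetBdaPointerModel(addr, page_bits, page_size)) {
        return LoadDwordAt(bda);   // page is resident: direct DMA read of the guest dword
    }
    // Page not resident: immediate ReadConsts fall back to the flatbuf snapshot,
    // dynamic ones return 0 until the host has handled the recorded fault.
    return dynamic ? 0u : flatbuf[flatbuf_off_dw];
}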
void EmitContext::DefineFunctions() {
if (info.uses_pack_10_11_11) {
f32_to_uf11 = DefineFloat32ToUfloatM5(6, "f32_to_uf11");
@ -1012,6 +1172,18 @@ void EmitContext::DefineFunctions() {
uf11_to_f32 = DefineUfloatM5ToFloat32(6, "uf11_to_f32");
uf10_to_f32 = DefineUfloatM5ToFloat32(5, "uf10_to_f32");
}
if (info.dma_types != IR::Type::Void) {
get_bda_pointer = DefineGetBdaPointer();
}
if (True(info.readconst_types & Info::ReadConstType::Immediate)) {
LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses immediate ReadConst", info.pgm_hash);
read_const = DefineReadConst(false);
}
if (True(info.readconst_types & Info::ReadConstType::Dynamic)) {
LOG_DEBUG(Render_Recompiler, "Shader {:#x} uses dynamic ReadConst", info.pgm_hash);
read_const_dynamic = DefineReadConst(true);
}
}
} // namespace Shader::Backend::SPIRV

View file

@ -4,6 +4,7 @@
#pragma once
#include <array>
#include <unordered_map>
#include <sirit/sirit.h>
#include "shader_recompiler/backend/bindings.h"
@ -41,6 +42,17 @@ public:
Bindings& binding);
~EmitContext();
enum class PointerType : u32 {
U8,
U16,
F16,
U32,
F32,
U64,
F64,
NumAlias,
};
Id Def(const IR::Value& value);
void DefineBufferProperties();
@ -133,12 +145,72 @@ public:
return ConstantComposite(type, constituents);
}
inline Id AddLabel() {
last_label = Module::AddLabel();
return last_label;
}
inline Id AddLabel(Id label) {
last_label = Module::AddLabel(label);
return last_label;
}
PointerType PointerTypeFromType(Id type) {
if (type.value == U8.value)
return PointerType::U8;
if (type.value == U16.value)
return PointerType::U16;
if (type.value == F16[1].value)
return PointerType::F16;
if (type.value == U32[1].value)
return PointerType::U32;
if (type.value == F32[1].value)
return PointerType::F32;
if (type.value == U64.value)
return PointerType::U64;
if (type.value == F64[1].value)
return PointerType::F64;
UNREACHABLE_MSG("Unknown type for pointer");
}
Id EmitMemoryRead(Id type, Id address, auto&& fallback) {
const Id available_label = OpLabel();
const Id fallback_label = OpLabel();
const Id merge_label = OpLabel();
const Id addr = OpFunctionCall(U64, get_bda_pointer, address);
const Id is_available = OpINotEqual(U1[1], addr, u64_zero_value);
OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(is_available, available_label, fallback_label);
// Available
AddLabel(available_label);
const auto pointer_type = PointerTypeFromType(type);
const Id pointer_type_id = physical_pointer_types[pointer_type];
const Id addr_ptr = OpConvertUToPtr(pointer_type_id, addr);
const Id result = OpLoad(type, addr_ptr, spv::MemoryAccessMask::Aligned, 4u);
OpBranch(merge_label);
// Fallback
AddLabel(fallback_label);
const Id fallback_result = fallback();
OpBranch(merge_label);
// Merge
AddLabel(merge_label);
const Id final_result =
OpPhi(type, fallback_result, fallback_label, result, available_label);
return final_result;
}
Info& info;
const RuntimeInfo& runtime_info;
const Profile& profile;
Stage stage;
LogicalStage l_stage{};
Id last_label{};
Id void_id{};
Id U8{};
Id S8{};
@ -161,9 +233,13 @@ public:
Id true_value{};
Id false_value{};
Id u8_one_value{};
Id u8_zero_value{};
Id u32_one_value{};
Id u32_zero_value{};
Id f32_zero_value{};
Id u64_one_value{};
Id u64_zero_value{};
Id shared_u8{};
Id shared_u16{};
@ -231,14 +307,6 @@ public:
bool is_storage = false;
};
enum class BufferAlias : u32 {
U8,
U16,
U32,
F32,
NumAlias,
};
struct BufferSpv {
Id id;
Id pointer_type;
@ -252,22 +320,40 @@ public:
Id size;
Id size_shorts;
Id size_dwords;
std::array<BufferSpv, u32(BufferAlias::NumAlias)> aliases;
std::array<BufferSpv, u32(PointerType::NumAlias)> aliases;
const BufferSpv& operator[](BufferAlias alias) const {
const BufferSpv& operator[](PointerType alias) const {
return aliases[u32(alias)];
}
BufferSpv& operator[](BufferAlias alias) {
BufferSpv& operator[](PointerType alias) {
return aliases[u32(alias)];
}
};
struct PhysicalPointerTypes {
std::array<Id, u32(PointerType::NumAlias)> types;
const Id& operator[](PointerType type) const {
return types[u32(type)];
}
Id& operator[](PointerType type) {
return types[u32(type)];
}
};
Bindings& binding;
boost::container::small_vector<Id, 16> buf_type_ids;
boost::container::small_vector<BufferDefinition, 16> buffers;
boost::container::small_vector<TextureDefinition, 8> images;
boost::container::small_vector<Id, 4> samplers;
PhysicalPointerTypes physical_pointer_types;
std::unordered_map<u32, Id> first_to_last_label_map;
size_t flatbuf_index{};
size_t bda_pagetable_index{};
size_t fault_buffer_index{};
Id sampler_type{};
Id sampler_pointer_type{};
@ -292,6 +378,11 @@ public:
Id uf10_to_f32{};
Id f32_to_uf10{};
Id get_bda_pointer{};
Id read_const{};
Id read_const_dynamic{};
private:
void DefineArithmeticTypes();
void DefineInterfaces();
@ -312,6 +403,10 @@ private:
Id DefineFloat32ToUfloatM5(u32 mantissa_bits, std::string_view name);
Id DefineUfloatM5ToFloat32(u32 mantissa_bits, std::string_view name);
Id DefineGetBdaPointer();
Id DefineReadConst(bool dynamic);
Id GetBufferSize(u32 sharp_idx);
};

View file

@ -1032,7 +1032,6 @@ void GcnDecodeContext::decodeInstructionMIMG(uint64_t hexInstruction) {
m_instruction.control.mimg = *reinterpret_cast<InstControlMIMG*>(&hexInstruction);
m_instruction.control.mimg.mod = getMimgModifier(m_instruction.opcode);
ASSERT(m_instruction.control.mimg.r128 == 0);
}
void GcnDecodeContext::decodeInstructionDS(uint64_t hexInstruction) {

View file

@ -39,21 +39,22 @@ void Translator::EmitScalarMemory(const GcnInst& inst) {
void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) {
const auto& smrd = inst.control.smrd;
const u32 dword_offset = [&] -> u32 {
const IR::ScalarReg sbase{inst.src[0].code * 2};
const IR::U32 dword_offset = [&] -> IR::U32 {
if (smrd.imm) {
return smrd.offset;
return ir.Imm32(smrd.offset);
}
if (smrd.offset == SQ_SRC_LITERAL) {
return inst.src[1].code;
return ir.Imm32(inst.src[1].code);
}
UNREACHABLE();
return ir.ShiftRightLogical(ir.GetScalarReg(IR::ScalarReg(smrd.offset)), ir.Imm32(2));
}();
const IR::ScalarReg sbase{inst.src[0].code * 2};
const IR::Value base =
ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1));
IR::ScalarReg dst_reg{inst.dst[0].code};
for (u32 i = 0; i < num_dwords; i++) {
ir.SetScalarReg(dst_reg + i, ir.ReadConst(base, ir.Imm32(dword_offset + i)));
IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i));
ir.SetScalarReg(dst_reg + i, ir.ReadConst(base, index));
}
}
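In plain terms, this change lets the SMRD dword offset come from three places instead of two. A minimal sketch of the selection follows; Smrd and the literal/SGPR parameters are illustrative stand-ins for the decoded instruction fields, and the value of SQ_SRC_LITERAL is assumed from the GCN source-operand encoding.

#include "common/types.h"

constexpr u32 SQ_SRC_LITERAL = 0xFF;  // literal-constant source operand (assumption)

struct Smrd {
    bool imm;    // offset field holds an immediate dword offset
    u32 offset;  // immediate value, SQ_SRC_LITERAL marker, or an SGPR index
};

u32 SmrdDwordOffset(const Smrd& smrd, u32 literal, u32 sgpr_value) {
    if (smrd.imm) {
        return smrd.offset;              // immediate dword offset
    }
    if (smrd.offset == SQ_SRC_LITERAL) {
        return literal;                  // literal constant that follows the instruction
    }
    return sgpr_value >> 2;              // new path: SGPR holds a byte offset, convert to dwords
}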

View file

@ -380,7 +380,7 @@ T Translator::GetSrc64(const InstOperand& operand) {
break;
case OperandField::VccLo:
if constexpr (is_float) {
UNREACHABLE();
value = ir.PackDouble2x32(ir.CompositeConstruct(ir.GetVccLo(), ir.GetVccHi()));
} else {
value = ir.PackUint2x32(ir.CompositeConstruct(ir.GetVccLo(), ir.GetVccHi()));
}

View file

@ -183,6 +183,7 @@ public:
void V_READFIRSTLANE_B32(const GcnInst& inst);
void V_CVT_I32_F64(const GcnInst& inst);
void V_CVT_F64_I32(const GcnInst& inst);
void V_CVT_F64_U32(const GcnInst& inst);
void V_CVT_F32_I32(const GcnInst& inst);
void V_CVT_F32_U32(const GcnInst& inst);
void V_CVT_U32_F32(const GcnInst& inst);

View file

@ -110,6 +110,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
return V_CVT_I32_F64(inst);
case Opcode::V_CVT_F64_I32:
return V_CVT_F64_I32(inst);
case Opcode::V_CVT_F64_U32:
return V_CVT_F64_U32(inst);
case Opcode::V_CVT_F32_I32:
return V_CVT_F32_I32(inst);
case Opcode::V_CVT_F32_U32:
@ -684,6 +686,11 @@ void Translator::V_CVT_F64_I32(const GcnInst& inst) {
SetDst64(inst.dst[0], ir.ConvertSToF(64, 32, src0));
}
void Translator::V_CVT_F64_U32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
SetDst64(inst.dst[0], ir.ConvertUToF(64, 32, src0));
}
void Translator::V_CVT_F32_I32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
SetDst(inst.dst[0], ir.ConvertSToF(32, 32, src0));

View file

@ -152,6 +152,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
// Image gather operations
case Opcode::IMAGE_GATHER4:
case Opcode::IMAGE_GATHER4_L:
case Opcode::IMAGE_GATHER4_LZ:
case Opcode::IMAGE_GATHER4_C:
case Opcode::IMAGE_GATHER4_O:
@ -377,6 +378,7 @@ void Translator::IMAGE_LOAD(bool has_mip, const GcnInst& inst) {
IR::TextureInstInfo info{};
info.has_lod.Assign(has_mip);
info.is_array.Assign(mimg.da);
info.is_r128.Assign(mimg.r128);
const IR::Value texel = ir.ImageRead(handle, body, {}, {}, info);
for (u32 i = 0; i < 4; i++) {
@ -426,6 +428,7 @@ void Translator::IMAGE_GET_RESINFO(const GcnInst& inst) {
IR::TextureInstInfo info{};
info.is_array.Assign(mimg.da);
info.is_r128.Assign(mimg.r128);
const IR::Value size = ir.ImageQueryDimension(tsharp, lod, ir.Imm1(has_mips), info);
@ -451,6 +454,7 @@ void Translator::IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst) {
IR::TextureInstInfo info{};
info.is_array.Assign(mimg.da);
info.is_r128.Assign(mimg.r128);
const IR::Value value = ir.GetVectorReg(val_reg);
const IR::Value handle = ir.GetScalarReg(tsharp_reg);
@ -509,6 +513,7 @@ IR::Value EmitImageSample(IR::IREmitter& ir, const GcnInst& inst, const IR::Scal
info.has_lod.Assign(flags.any(MimgModifier::Lod));
info.is_array.Assign(mimg.da);
info.is_unnormalized.Assign(mimg.unrm);
info.is_r128.Assign(mimg.r128);
if (gather) {
info.gather_comp.Assign(std::bit_width(mimg.dmask) - 1);
@ -617,6 +622,7 @@ void Translator::IMAGE_GET_LOD(const GcnInst& inst) {
IR::TextureInstInfo info{};
info.is_array.Assign(mimg.da);
info.is_r128.Assign(mimg.r128);
const IR::Value handle = ir.GetScalarReg(tsharp_reg);
const IR::Value body = ir.CompositeConstruct(

View file

@ -41,7 +41,9 @@ constexpr u32 NUM_TEXTURE_TYPES = 7;
enum class BufferType : u32 {
Guest,
ReadConstUbo,
Flatbuf,
BdaPagetable,
FaultBuffer,
GdsBuffer,
SharedMemory,
};
@ -82,6 +84,7 @@ struct ImageResource {
bool is_atomic{};
bool is_array{};
bool is_written{};
bool is_r128{};
[[nodiscard]] constexpr AmdGpu::Image GetSharp(const Info& info) const noexcept;
};
@ -215,11 +218,18 @@ struct Info {
bool stores_tess_level_outer{};
bool stores_tess_level_inner{};
bool translation_failed{};
bool has_readconst{};
u8 mrt_mask{0u};
bool has_fetch_shader{false};
u32 fetch_shader_sgpr_base{0u};
enum class ReadConstType {
None = 0,
Immediate = 1 << 0,
Dynamic = 1 << 1,
};
ReadConstType readconst_types{};
IR::Type dma_types{IR::Type::Void};
explicit Info(Stage stage_, LogicalStage l_stage_, ShaderParams params)
: stage{stage_}, l_stage{l_stage_}, pgm_hash{params.hash}, pgm_base{params.Base()},
user_data{params.user_data} {}
@ -277,13 +287,20 @@ struct Info {
sizeof(tess_constants));
}
};
DECLARE_ENUM_FLAG_OPERATORS(Info::ReadConstType);
constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept {
return inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
}
constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept {
const auto image = info.ReadUdSharp<AmdGpu::Image>(sharp_idx);
AmdGpu::Image image{0};
if (!is_r128) {
image = info.ReadUdSharp<AmdGpu::Image>(sharp_idx);
} else {
AmdGpu::Buffer buf = info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
memcpy(&image, &buf, sizeof(buf));
}
if (!image.Valid()) {
// Fall back to null image if unbound.
return AmdGpu::Image::Null();

View file

@ -0,0 +1,44 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "abstract_syntax_list.h"
namespace Shader::IR {
std::string DumpASLNode(const AbstractSyntaxNode& node,
const std::map<const Block*, size_t>& block_to_index,
const std::map<const Inst*, size_t>& inst_to_index) {
switch (node.type) {
case AbstractSyntaxNode::Type::Block:
return fmt::format("Block: ${}", block_to_index.at(node.data.block));
case AbstractSyntaxNode::Type::If:
return fmt::format("If: cond = %{}, body = ${}, merge = ${}",
inst_to_index.at(node.data.if_node.cond.Inst()),
block_to_index.at(node.data.if_node.body),
block_to_index.at(node.data.if_node.merge));
case AbstractSyntaxNode::Type::EndIf:
return fmt::format("EndIf: merge = ${}", block_to_index.at(node.data.end_if.merge));
case AbstractSyntaxNode::Type::Loop:
return fmt::format("Loop: body = ${}, continue = ${}, merge = ${}",
block_to_index.at(node.data.loop.body),
block_to_index.at(node.data.loop.continue_block),
block_to_index.at(node.data.loop.merge));
case AbstractSyntaxNode::Type::Repeat:
return fmt::format("Repeat: cond = %{}, header = ${}, merge = ${}",
inst_to_index.at(node.data.repeat.cond.Inst()),
block_to_index.at(node.data.repeat.loop_header),
block_to_index.at(node.data.repeat.merge));
case AbstractSyntaxNode::Type::Break:
return fmt::format("Break: cond = %{}, merge = ${}, skip = ${}",
inst_to_index.at(node.data.break_node.cond.Inst()),
block_to_index.at(node.data.break_node.merge),
block_to_index.at(node.data.break_node.skip));
case AbstractSyntaxNode::Type::Return:
return "Return";
case AbstractSyntaxNode::Type::Unreachable:
return "Unreachable";
}
UNREACHABLE();
}
} // namespace Shader::IR
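For reference, the dump produced by this helper is one line per syntax node; a simple if/else comes out roughly like the following (block and instruction indices are illustrative):

Block: $0
If: cond = %7, body = $1, merge = $2
Block: $1
EndIf: merge = $2
Block: $2
Return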

View file

@ -3,6 +3,7 @@
#pragma once
#include <map>
#include <vector>
#include "shader_recompiler/ir/value.h"
@ -53,4 +54,8 @@ struct AbstractSyntaxNode {
};
using AbstractSyntaxList = std::vector<AbstractSyntaxNode>;
std::string DumpASLNode(const AbstractSyntaxNode& node,
const std::map<const Block*, size_t>& block_to_index,
const std::map<const Inst*, size_t>& inst_to_index);
} // namespace Shader::IR

View file

@ -411,6 +411,7 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
.is_atomic = IsImageAtomicInstruction(inst),
.is_array = bool(inst_info.is_array),
.is_written = is_written,
.is_r128 = bool(inst_info.is_r128),
});
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};

View file

@ -2,6 +2,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/ir/program.h"
#include "video_core/buffer_cache/buffer_cache.h"
namespace Shader::Optimization {
@ -79,14 +80,21 @@ void Visit(Info& info, const IR::Inst& inst) {
info.uses_lane_id = true;
break;
case IR::Opcode::ReadConst:
if (!info.has_readconst) {
if (info.readconst_types == Info::ReadConstType::None) {
info.buffers.push_back({
.used_types = IR::Type::U32,
.inline_cbuf = AmdGpu::Buffer::Null(),
.buffer_type = BufferType::ReadConstUbo,
// We can't guarantee that flatbuf will not grow past UBO
// limit if there are a lot of ReadConsts. (We could specialize)
.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
.buffer_type = BufferType::Flatbuf,
});
info.has_readconst = true;
}
if (inst.Flags<u32>() != 0) {
info.readconst_types |= Info::ReadConstType::Immediate;
} else {
info.readconst_types |= Info::ReadConstType::Dynamic;
}
info.dma_types |= IR::Type::U32;
break;
case IR::Opcode::PackUfloat10_11_11:
info.uses_pack_10_11_11 = true;
@ -105,6 +113,21 @@ void CollectShaderInfoPass(IR::Program& program) {
Visit(program.info, inst);
}
}
if (program.info.dma_types != IR::Type::Void) {
program.info.buffers.push_back({
.used_types = IR::Type::U64,
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
.buffer_type = BufferType::BdaPagetable,
.is_written = true,
});
program.info.buffers.push_back({
.used_types = IR::Type::U8,
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE),
.buffer_type = BufferType::FaultBuffer,
.is_written = true,
});
}
}
} // namespace Shader::Optimization

View file

@ -6,13 +6,30 @@
#include <fmt/format.h>
#include "common/config.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/ir/value.h"
namespace Shader::IR {
std::string DumpProgram(const Program& program) {
void DumpProgram(const Program& program, const Info& info, const std::string& type) {
using namespace Common::FS;
if (!Config::dumpShaders()) {
return;
}
const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
if (!std::filesystem::exists(dump_dir)) {
std::filesystem::create_directories(dump_dir);
}
const auto ir_filename =
fmt::format("{}_{:#018x}.{}irprogram.txt", info.stage, info.pgm_hash, type);
const auto ir_file = IOFile{dump_dir / ir_filename, FileAccessMode::Write, FileType::TextFile};
size_t index{0};
std::map<const IR::Inst*, size_t> inst_to_index;
std::map<const IR::Block*, size_t> block_to_index;
@ -21,11 +38,20 @@ std::string DumpProgram(const Program& program) {
block_to_index.emplace(block, index);
++index;
}
std::string ret;
for (const auto& block : program.blocks) {
ret += IR::DumpBlock(*block, block_to_index, inst_to_index, index) + '\n';
std::string s = IR::DumpBlock(*block, block_to_index, inst_to_index, index) + '\n';
ir_file.WriteString(s);
}
const auto asl_filename = fmt::format("{}_{:#018x}.{}asl.txt", info.stage, info.pgm_hash, type);
const auto asl_file =
IOFile{dump_dir / asl_filename, FileAccessMode::Write, FileType::TextFile};
for (const auto& node : program.syntax_list) {
std::string s = IR::DumpASLNode(node, block_to_index, inst_to_index) + '\n';
asl_file.WriteString(s);
}
return ret;
}
} // namespace Shader::IR

View file

@ -21,6 +21,6 @@ struct Program {
Info& info;
};
[[nodiscard]] std::string DumpProgram(const Program& program);
void DumpProgram(const Program& program, const Info& info, const std::string& type = "");
} // namespace Shader::IR

View file

@ -44,6 +44,7 @@ union TextureInstInfo {
BitField<9, 1, u32> is_array;
BitField<10, 1, u32> is_unnormalized;
BitField<11, 1, u32> is_gather;
BitField<12, 1, u32> is_r128;
};
union BufferInstInfo {

View file

@ -46,6 +46,10 @@ inline F32 ApplyReadNumberConversion(IREmitter& ir, const F32& value,
const IR::F32 max = ir.Imm32(float(std::numeric_limits<u16>::max()));
return ir.FPDiv(left, max);
}
case AmdGpu::NumberConversion::Uint32ToUnorm: {
const auto float_val = ir.ConvertUToF(32, 32, ir.BitCast<U32>(value));
return ir.FPDiv(float_val, ir.Imm32(static_cast<float>(std::numeric_limits<u32>::max())));
}
default:
UNREACHABLE();
}
@ -92,6 +96,12 @@ inline F32 ApplyWriteNumberConversion(IREmitter& ir, const F32& value,
const IR::U32 raw = ir.ConvertFToS(32, ir.FPDiv(left, ir.Imm32(2.f)));
return ir.BitCast<F32>(raw);
}
case AmdGpu::NumberConversion::Uint32ToUnorm: {
const auto clamped = ir.FPClamp(value, ir.Imm32(0.f), ir.Imm32(1.f));
const auto unnormalized =
ir.FPMul(clamped, ir.Imm32(static_cast<float>(std::numeric_limits<u32>::max())));
return ir.BitCast<F32>(U32{ir.ConvertFToU(32, unnormalized)});
}
default:
UNREACHABLE();
}
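As a sanity check on the new Uint32ToUnorm handling, the scalar math being emitted is just division or multiplication by the unsigned 32-bit maximum. A minimal standalone sketch follows; the float precision loss near the top of the range is expected and mirrors the IR above.

#include <algorithm>
#include <cstdint>
#include <limits>

float DecodeUnorm32(std::uint32_t raw) {
    return static_cast<float>(raw) /
           static_cast<float>(std::numeric_limits<std::uint32_t>::max());
}

std::uint32_t EncodeUnorm32(float value) {
    const double clamped = std::clamp(double(value), 0.0, 1.0);
    // The shader does this in 32-bit float; double is used here only to keep the
    // host-side round trip well defined at the very top of the range.
    return static_cast<std::uint32_t>(
        clamped * double(std::numeric_limits<std::uint32_t>::max()));
}
// e.g. DecodeUnorm32(0xFFFFFFFFu) == 1.0f and DecodeUnorm32(0u) == 0.0f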

View file

@ -85,6 +85,8 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
Shader::Optimization::CollectShaderInfoPass(program);
Shader::IR::DumpProgram(program, info);
return program;
}

View file

@ -133,6 +133,7 @@ void Liverpool::Process(std::stop_token stoken) {
VideoCore::EndCapture();
if (rasterizer) {
rasterizer->ProcessFaults();
rasterizer->Flush();
}
submit_done = false;
@ -751,6 +752,10 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
strmout->buffer_select.Value());
break;
}
case PM4ItOpcode::GetLodStats: {
LOG_WARNING(Render_Vulkan, "Unimplemented IT_GET_LOD_STATS");
break;
}
default:
UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}",
static_cast<u32>(opcode), count);

View file

@ -71,6 +71,7 @@ enum class PM4ItOpcode : u32 {
IncrementDeCounter = 0x85,
WaitOnCeCounter = 0x86,
WaitOnDeCounterDiff = 0x88,
GetLodStats = 0x8E,
DrawIndexIndirectCountMulti = 0x9d,
};

View file

@ -37,6 +37,13 @@ struct Buffer {
return buffer;
}
static constexpr Buffer Placeholder(u32 size) {
Buffer buffer{};
buffer.base_address = 1;
buffer.num_records = size;
return buffer;
}
bool Valid() const {
return type == 0u;
}

View file

@ -197,8 +197,9 @@ enum class NumberConversion : u32 {
UintToUscaled = 1,
SintToSscaled = 2,
UnormToUbnorm = 3,
Sint8ToSnormNz = 5,
Sint16ToSnormNz = 6,
Sint8ToSnormNz = 4,
Sint16ToSnormNz = 5,
Uint32ToUnorm = 6,
};
struct CompMapping {
@ -286,6 +287,17 @@ inline DataFormat RemapDataFormat(const DataFormat format) {
inline NumberFormat RemapNumberFormat(const NumberFormat format, const DataFormat data_format) {
switch (format) {
case NumberFormat::Unorm: {
switch (data_format) {
case DataFormat::Format32:
case DataFormat::Format32_32:
case DataFormat::Format32_32_32:
case DataFormat::Format32_32_32_32:
return NumberFormat::Uint;
default:
return format;
}
}
case NumberFormat::Uscaled:
return NumberFormat::Uint;
case NumberFormat::Sscaled:
@ -341,6 +353,17 @@ inline CompMapping RemapSwizzle(const DataFormat format, const CompMapping swizz
inline NumberConversion MapNumberConversion(const NumberFormat num_fmt, const DataFormat data_fmt) {
switch (num_fmt) {
case NumberFormat::Unorm: {
switch (data_fmt) {
case DataFormat::Format32:
case DataFormat::Format32_32:
case DataFormat::Format32_32_32:
case DataFormat::Format32_32_32_32:
return NumberConversion::Uint32ToUnorm;
default:
return NumberConversion::None;
}
}
case NumberFormat::Uscaled:
return NumberConversion::UintToUscaled;
case NumberFormat::Sscaled:

View file

@ -70,8 +70,11 @@ UniqueBuffer::~UniqueBuffer() {
void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
VmaAllocationInfo* out_alloc_info) {
const bool with_bda = bool(buffer_ci.usage & vk::BufferUsageFlagBits::eShaderDeviceAddress);
const VmaAllocationCreateFlags bda_flag =
with_bda ? VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT : 0;
const VmaAllocationCreateInfo alloc_ci = {
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | bda_flag | MemoryUsageVmaFlags(usage),
.usage = MemoryUsageVma(usage),
.requiredFlags = 0,
.preferredFlags = MemoryUsagePreferredVmaFlags(usage),
@ -86,6 +89,15 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa
ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
vk::to_string(vk::Result{result}));
buffer = vk::Buffer{unsafe_buffer};
if (with_bda) {
vk::BufferDeviceAddressInfo bda_info{
.buffer = buffer,
};
auto bda_result = device.getBufferAddress(bda_info);
ASSERT_MSG(bda_result != 0, "Failed to get buffer device address");
bda_addr = bda_result;
}
}
Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,

View file

@ -68,6 +68,7 @@ struct UniqueBuffer {
VmaAllocator allocator;
VmaAllocation allocation;
vk::Buffer buffer{};
vk::DeviceAddress bda_addr = 0;
};
class Buffer {
@ -115,6 +116,11 @@ public:
return buffer;
}
vk::DeviceAddress BufferDeviceAddress() const noexcept {
ASSERT_MSG(buffer.bda_addr != 0, "Can't get BDA from a non-BDA buffer");
return buffer.bda_addr;
}
std::optional<vk::BufferMemoryBarrier2> GetBarrier(
vk::Flags<vk::AccessFlagBits2> dst_acess_mask, vk::PipelineStageFlagBits2 dst_stage,
u32 offset = 0) {

View file

@ -3,13 +3,17 @@
#include <algorithm>
#include "common/alignment.h"
#include "common/debug.h"
#include "common/scope_exit.h"
#include "common/types.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/host_shaders/fault_buffer_process_comp.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/texture_cache/texture_cache.h"
namespace VideoCore {
@ -17,17 +21,26 @@ namespace VideoCore {
static constexpr size_t DataShareBufferSize = 64_KB;
static constexpr size_t StagingBufferSize = 512_MB;
static constexpr size_t UboStreamBufferSize = 128_MB;
static constexpr size_t DownloadBufferSize = 128_MB;
static constexpr size_t MaxPageFaults = 1024;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
TextureCache& texture_cache_, PageManager& tracker_)
: instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_},
texture_cache{texture_cache_}, tracker{tracker_},
staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize),
gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
memory_tracker{&tracker} {
bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal,
0, AllFlags, BDA_PAGETABLE_SIZE},
fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE),
memory_tracker{tracker} {
Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(),
"BDA Page Table Buffer");
Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer");
// Ensure the first slot is used for the null buffer
const auto null_id =
@ -35,15 +48,93 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
ASSERT(null_id.index == 0);
const vk::Buffer& null_buffer = slot_buffers[null_id].buffer;
Vulkan::SetObjectName(instance.GetDevice(), null_buffer, "Null Buffer");
// Prepare the fault buffer parsing pipeline
boost::container::static_vector<vk::DescriptorSetLayoutBinding, 2> bindings{
{
.binding = 0,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
{
.binding = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
};
const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
.flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
.bindingCount = static_cast<u32>(bindings.size()),
.pBindings = bindings.data(),
};
auto [desc_layout_result, desc_layout] =
instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci);
ASSERT_MSG(desc_layout_result == vk::Result::eSuccess,
"Failed to create descriptor set layout: {}", vk::to_string(desc_layout_result));
fault_process_desc_layout = std::move(desc_layout);
const auto& module = Vulkan::Compile(HostShaders::FAULT_BUFFER_PROCESS_COMP,
vk::ShaderStageFlagBits::eCompute, instance.GetDevice());
Vulkan::SetObjectName(instance.GetDevice(), module, "Fault Buffer Parser");
const vk::SpecializationMapEntry specialization_map_entry = {
.constantID = 0,
.offset = 0,
.size = sizeof(u32),
};
const vk::SpecializationInfo specialization_info = {
.mapEntryCount = 1,
.pMapEntries = &specialization_map_entry,
.dataSize = sizeof(u32),
.pData = &CACHING_PAGEBITS,
};
const vk::PipelineShaderStageCreateInfo shader_ci = {
.stage = vk::ShaderStageFlagBits::eCompute,
.module = module,
.pName = "main",
.pSpecializationInfo = &specialization_info,
};
const vk::PipelineLayoutCreateInfo layout_info = {
.setLayoutCount = 1U,
.pSetLayouts = &(*fault_process_desc_layout),
};
auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info);
ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}",
vk::to_string(layout_result));
fault_process_pipeline_layout = std::move(layout);
const vk::ComputePipelineCreateInfo pipeline_info = {
.stage = shader_ci,
.layout = *fault_process_pipeline_layout,
};
auto [pipeline_result, pipeline] =
instance.GetDevice().createComputePipelineUnique({}, pipeline_info);
ASSERT_MSG(pipeline_result == vk::Result::eSuccess, "Failed to create compute pipeline: {}",
vk::to_string(pipeline_result));
fault_process_pipeline = std::move(pipeline);
Vulkan::SetObjectName(instance.GetDevice(), *fault_process_pipeline,
"Fault Buffer Parser Pipeline");
instance.GetDevice().destroyShaderModule(module);
}
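The compute pipeline built above consumes the fault bitmap written by get_bda_pointer; whatever the exact shader source does, turning a set bit back into the guest address of the faulted page is the inverse of the marking step. A hedged host-side sketch, where the function name and output vector are illustrative and VAddr/u64/u8 are the project's aliases:

#include <cstddef>
#include <vector>
#include "common/types.h"

// Inverse of the fault marking in get_bda_pointer: bit N of the bitmap corresponds to
// cached page N, whose guest address is N << CACHING_PAGEBITS.
void CollectFaultedPages(const u8* fault_bits, std::size_t num_bytes, u32 caching_pagebits,
                         std::vector<VAddr>& out) {
    for (std::size_t byte_index = 0; byte_index < num_bytes; ++byte_index) {
        const u8 bits = fault_bits[byte_index];
        for (u32 bit = 0; bit < 8; ++bit) {
            if (bits & (1u << bit)) {
                const u64 page = u64(byte_index) * 8 + bit;
                out.push_back(static_cast<VAddr>(page << caching_pagebits));
            }
        }
    }
}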
BufferCache::~BufferCache() = default;
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
const bool is_tracked = IsRegionRegistered(device_addr, size);
if (is_tracked) {
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
if (unmap) {
return;
}
}
}
@ -69,20 +160,20 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
if (total_size_bytes == 0) {
return;
}
const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
const auto [download, offset] = download_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
// Modify copies to have the staging offset in mind
copy.dstOffset += offset;
}
staging_buffer.Commit();
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies);
cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
scheduler.Finish();
for (const auto& copy : copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
std::memcpy(std::bit_cast<u8*>(copy_device_addr), staging + dst_offset, copy.size);
std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
}
}
@ -206,58 +297,37 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
memcpy(std::bit_cast<void*>(address), value, num_bytes);
return;
}
scheduler.EndRendering();
const Buffer* buffer = [&] {
Buffer* buffer = [&] {
if (is_gds) {
return &gds_buffer;
}
const BufferId buffer_id = FindBuffer(address, num_bytes);
return &slot_buffers[buffer_id];
}();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer->Handle(),
.offset = buffer->Offset(address),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
.buffer = buffer->Handle(),
.offset = buffer->Offset(address),
.size = num_bytes,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
// vkCmdUpdateBuffer can only copy up to 65536 bytes at a time.
static constexpr u32 UpdateBufferMaxSize = 65536;
const auto dst_offset = buffer->Offset(address);
for (u32 offset = 0; offset < num_bytes; offset += UpdateBufferMaxSize) {
const auto* update_src = static_cast<const u8*>(value) + offset;
const auto update_dst = dst_offset + offset;
const auto update_size = std::min(num_bytes - offset, UpdateBufferMaxSize);
cmdbuf.updateBuffer(buffer->Handle(), update_dst, update_size, update_src);
InlineDataBuffer(*buffer, address, value, num_bytes);
}
void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
if (!is_gds && !IsRegionRegistered(address, num_bytes)) {
memcpy(std::bit_cast<void*>(address), value, num_bytes);
return;
}
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
Buffer* buffer = [&] {
if (is_gds) {
return &gds_buffer;
}
const BufferId buffer_id = FindBuffer(address, num_bytes);
return &slot_buffers[buffer_id];
}();
WriteDataBuffer(*buffer, address, value, num_bytes);
}
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer, BufferId buffer_id) {
// For small uniform buffers that have not been modified by the GPU,
// use the device-local stream buffer to reduce render pass breaks.
// Maybe we want to modify the threshold now that the page size is 16KB?
static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
@ -280,7 +350,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
// Check if any buffer contains the full requested range.
const u64 page = gpu_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (buffer_id) {
Buffer& buffer = slot_buffers[buffer_id];
if (buffer.IsInBounds(gpu_addr, size)) {
@ -300,24 +370,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size,
}
bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
++page;
continue;
}
std::shared_lock lk{mutex};
Buffer& buffer = slot_buffers[buffer_id];
const VAddr buf_start_addr = buffer.CpuAddr();
const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes();
if (buf_start_addr < end_addr && addr < buf_end_addr) {
return true;
}
page = Common::DivCeil(buf_end_addr, CACHING_PAGESIZE);
}
return false;
// Check if we are missing some edge case here
return buffer_ranges.Intersects(addr, size);
}
bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
@ -333,7 +387,7 @@ BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
return NULL_BUFFER_ID;
}
const u64 page = device_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (!buffer_id) {
return CreateBuffer(device_addr, size);
}
@ -379,7 +433,7 @@ BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 w
}
for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE);
device_addr += CACHING_PAGESIZE) {
const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS];
const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS].buffer_id;
if (!overlap_id) {
continue;
}
@ -480,11 +534,21 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size);
const u32 size = static_cast<u32>(overlap.end - overlap.begin);
const BufferId new_buffer_id = [&] {
std::scoped_lock lk{mutex};
std::scoped_lock lk{slot_buffers_mutex};
return slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin,
AllFlags, size);
AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size);
}();
auto& new_buffer = slot_buffers[new_buffer_id];
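// Fill the BDA page table entries for this buffer: each cached page maps to the device
// address of its backing slice, letting shaders resolve guest pointers through the table.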
boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
const u64 start_page = overlap.begin >> CACHING_PAGEBITS;
const u64 size_pages = size >> CACHING_PAGEBITS;
bda_addrs.reserve(size_pages);
for (u64 i = 0; i < size_pages; ++i) {
vk::DeviceAddress addr = new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS);
bda_addrs.push_back(addr);
}
WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
bda_addrs.size() * sizeof(vk::DeviceAddress));
const size_t size_bytes = new_buffer.SizeBytes();
const auto cmdbuf = scheduler.CommandBuffer();
scheduler.EndRendering();
@ -496,6 +560,129 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
return new_buffer_id;
}
void BufferCache::ProcessFaultBuffer() {
// Run fault processing shader
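// The compute pass converts the fault bitmap into a list of faulting page addresses in the
// download buffer, which is consumed on the CPU through a deferred operation further below.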
const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64));
vk::BufferMemoryBarrier2 fault_buffer_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
vk::BufferMemoryBarrier2 download_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eShaderWrite,
.buffer = download_buffer.Handle(),
.offset = offset,
.size = MaxPageFaults * sizeof(u64),
};
std::array<vk::BufferMemoryBarrier2, 2> barriers{fault_buffer_barrier, download_barrier};
vk::DescriptorBufferInfo fault_buffer_info{
.buffer = fault_buffer.Handle(),
.offset = 0,
.range = FAULT_BUFFER_SIZE,
};
vk::DescriptorBufferInfo download_info{
.buffer = download_buffer.Handle(),
.offset = offset,
.range = MaxPageFaults * sizeof(u64),
};
boost::container::small_vector<vk::WriteDescriptorSet, 2> writes{
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &fault_buffer_info,
},
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 1,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &download_info,
},
};
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 2,
.pBufferMemoryBarriers = barriers.data(),
});
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline);
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0,
writes);
constexpr u32 num_threads = CACHING_NUMPAGES / 32; // 1 bit per page, 32 pages per thread
constexpr u32 num_workgroups = Common::DivCeil(num_threads, 64u);
cmdbuf.dispatch(num_workgroups, 1, 1);
// Reset fault buffer
const vk::BufferMemoryBarrier2 reset_pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.srcAccessMask = vk::AccessFlagBits2::eShaderRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
const vk::BufferMemoryBarrier2 reset_post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &reset_pre_barrier,
});
cmdbuf.fillBuffer(fault_buffer.buffer, 0, FAULT_BUFFER_SIZE, 0);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &reset_post_barrier,
});
// Defer creating buffers
scheduler.DeferOperation([this, mapped]() {
// Create the fault buffers batched
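// The shader writes the fault count into the first element of the mapped region, followed
// by one page-aligned address per fault.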
boost::icl::interval_set<VAddr> fault_ranges;
const u64* fault_ptr = std::bit_cast<const u64*>(mapped);
const u32 fault_count = static_cast<u32>(*(fault_ptr++));
for (u32 i = 0; i < fault_count; ++i) {
const VAddr fault = *(fault_ptr++);
const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
fault_ranges +=
boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
LOG_INFO(Render_Vulkan, "Accessed non-GPU cached memory at {:#x}", fault);
}
for (const auto& range : fault_ranges) {
const VAddr start = range.lower();
const VAddr end = range.upper();
const u64 page_start = start >> CACHING_PAGEBITS;
const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE);
// Buffer sizes must fit in 32 bits
ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits<u32>::max(),
"Buffer size is too large");
CreateBuffer(start, static_cast<u32>(end - start));
}
});
}
void BufferCache::Register(BufferId buffer_id) {
ChangeRegister<true>(buffer_id);
}
@ -514,11 +701,16 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
for (u64 page = page_begin; page != page_end; ++page) {
if constexpr (insert) {
page_table[page] = buffer_id;
page_table[page].buffer_id = buffer_id;
} else {
page_table[page] = BufferId{};
page_table[page].buffer_id = BufferId{};
}
}
if constexpr (insert) {
buffer_ranges.Add(buffer.CpuAddr(), buffer.SizeBytes(), buffer_id);
} else {
buffer_ranges.Subtract(buffer.CpuAddr(), buffer.SizeBytes());
}
}
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
@ -697,6 +889,138 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
return true;
}
void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
if (device_addr == 0) {
return;
}
VAddr device_addr_end = device_addr + size;
ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
RENDERER_TRACE;
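// Clamp the requested range to the portion backed by this buffer before synchronizing.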
VAddr start = std::max(buffer.CpuAddr(), device_addr);
VAddr end = std::min(buffer.CpuAddr() + buffer.SizeBytes(), device_addr_end);
u32 size = static_cast<u32>(end - start);
SynchronizeBuffer(buffer, start, size, false);
});
}
void BufferCache::MemoryBarrier() {
// Vulkan doesn't know which buffer we access in a shader if we use
// BufferDeviceAddress. We need a full memory barrier.
// For now, we only read memory using BDA. If we want to write to it,
// we might need to change this.
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
vk::MemoryBarrier2 barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.memoryBarrierCount = 1,
.pMemoryBarriers = &barrier,
});
}
void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value,
u32 num_bytes) {
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
// vkCmdUpdateBuffer can only copy up to 65536 bytes at a time.
static constexpr u32 UpdateBufferMaxSize = 65536;
const auto dst_offset = buffer.Offset(address);
for (u32 offset = 0; offset < num_bytes; offset += UpdateBufferMaxSize) {
const auto* update_src = static_cast<const u8*>(value) + offset;
const auto update_dst = dst_offset + offset;
const auto update_size = std::min(num_bytes - offset, UpdateBufferMaxSize);
cmdbuf.updateBuffer(buffer.Handle(), update_dst, update_size, update_src);
}
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
}
void BufferCache::WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes) {
vk::BufferCopy copy = {
.srcOffset = 0,
.dstOffset = buffer.Offset(address),
.size = num_bytes,
};
vk::Buffer src_buffer = staging_buffer.Handle();
if (num_bytes < StagingBufferSize) {
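// Small writes are staged through the persistent staging stream buffer.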
const auto [staging, offset] = staging_buffer.Map(num_bytes);
std::memcpy(staging, value, num_bytes);
copy.srcOffset = offset;
staging_buffer.Commit();
} else {
// For large one-time transfers use a temporary host buffer.
// RenderDoc can lag quite a bit if the stream buffer is too large.
Buffer temp_buffer{
instance, scheduler, MemoryUsage::Upload, 0, vk::BufferUsageFlagBits::eTransferSrc,
num_bytes};
src_buffer = temp_buffer.Handle();
u8* const staging = temp_buffer.mapped_data.data();
std::memcpy(staging, value, num_bytes);
scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {});
}
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = buffer.Handle(),
.offset = buffer.Offset(address),
.size = num_bytes,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
cmdbuf.copyBuffer(src_buffer, buffer.Handle(), copy);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
}
void BufferCache::DeleteBuffer(BufferId buffer_id) {
Buffer& buffer = slot_buffers[buffer_id];
Unregister(buffer_id);

View file

@ -38,14 +38,22 @@ class TextureCache;
class BufferCache {
public:
static constexpr u32 CACHING_PAGEBITS = 12;
static constexpr u32 CACHING_PAGEBITS = 14;
static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
static constexpr u64 DEVICE_PAGESIZE = 4_KB;
static constexpr u64 DEVICE_PAGESIZE = 16_KB;
static constexpr u64 CACHING_NUMPAGES = u64{1} << (40 - CACHING_PAGEBITS);
static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
static constexpr u64 FAULT_BUFFER_SIZE = CACHING_NUMPAGES / 8; // Bit per page
struct PageData {
BufferId buffer_id{};
};
struct Traits {
using Entry = BufferId;
using Entry = PageData;
static constexpr size_t AddressSpaceBits = 40;
static constexpr size_t FirstLevelBits = 14;
static constexpr size_t FirstLevelBits = 16;
static constexpr size_t PageBits = CACHING_PAGEBITS;
};
using PageTable = MultiLevelPageTable<Traits>;
@ -59,8 +67,8 @@ public:
public:
explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
PageManager& tracker);
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool,
TextureCache& texture_cache, PageManager& tracker);
~BufferCache();
/// Returns a pointer to GDS device local buffer.
@ -73,13 +81,23 @@ public:
return stream_buffer;
}
/// Retrieves the device local BDA page table buffer.
[[nodiscard]] Buffer* GetBdaPageTableBuffer() noexcept {
return &bda_pagetable_buffer;
}
/// Retrieves the fault buffer.
[[nodiscard]] Buffer* GetFaultBuffer() noexcept {
return &fault_buffer;
}
/// Retrieves the buffer with the specified id.
[[nodiscard]] Buffer& GetBuffer(BufferId id) {
return slot_buffers[id];
}
/// Invalidates any buffer in the logical page range.
void InvalidateMemory(VAddr device_addr, u64 size);
void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
/// Binds host vertex buffers for the current draw.
void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@ -87,9 +105,12 @@ public:
/// Bind host index buffer for the current draw.
void BindIndexBuffer(u32 index_offset);
/// Writes a value to GPU buffer.
/// Writes a value to GPU buffer. (uses command buffer to temporarily store the data)
void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
/// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data)
void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
bool is_texel_buffer = false,
@ -108,24 +129,29 @@ public:
/// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
[[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
/// Returns the buffer id for the specified region
BufferId FindBuffer(VAddr device_addr, u32 size);
/// Processes the fault buffer.
void ProcessFaultBuffer();
/// Synchronizes all buffers in the specified range.
void SynchronizeBuffersInRange(VAddr device_addr, u64 size);
/// Synchronizes all buffers needed for DMA.
void SynchronizeDmaBuffers();
/// Record memory barrier. Used for buffers when accessed via BDA.
void MemoryBarrier();
private:
template <typename Func>
void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
if (!buffer_id) {
++page;
continue;
}
Buffer& buffer = slot_buffers[buffer_id];
func(buffer_id, buffer);
const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
}
buffer_ranges.ForEachInRange(device_addr, size,
[&](u64 page_start, u64 page_end, BufferId id) {
Buffer& buffer = slot_buffers[id];
func(id, buffer);
});
}
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
@ -134,7 +160,7 @@ private:
void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score);
[[nodiscard]] BufferId CreateBuffer(VAddr device_addr, u32 wanted_size);
BufferId CreateBuffer(VAddr device_addr, u32 wanted_size);
void Register(BufferId buffer_id);
@ -147,21 +173,33 @@ private:
bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
void WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
void DeleteBuffer(BufferId buffer_id);
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
Vulkan::Rasterizer& rasterizer;
AmdGpu::Liverpool* liverpool;
TextureCache& texture_cache;
PageManager& tracker;
StreamBuffer staging_buffer;
StreamBuffer stream_buffer;
StreamBuffer download_buffer;
Buffer gds_buffer;
std::shared_mutex mutex;
Buffer bda_pagetable_buffer;
Buffer fault_buffer;
std::shared_mutex slot_buffers_mutex;
Common::SlotVector<Buffer> slot_buffers;
RangeSet gpu_modified_ranges;
SplitRangeMap<BufferId> buffer_ranges;
MemoryTracker memory_tracker;
PageTable page_table;
vk::UniqueDescriptorSetLayout fault_process_desc_layout;
vk::UniquePipeline fault_process_pipeline;
vk::UniquePipelineLayout fault_process_pipeline_layout;
};
} // namespace VideoCore

View file

@ -7,6 +7,7 @@
#include <deque>
#include <type_traits>
#include <vector>
#include "common/debug.h"
#include "common/types.h"
#include "video_core/buffer_cache/word_manager.h"
@ -19,11 +20,11 @@ public:
static constexpr size_t MANAGER_POOL_SIZE = 32;
public:
explicit MemoryTracker(PageManager* tracker_) : tracker{tracker_} {}
explicit MemoryTracker(PageManager& tracker_) : tracker{&tracker_} {}
~MemoryTracker() = default;
/// Returns true if a region has been modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<true>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
return manager->template IsRegionModified<Type::CPU>(offset, size);
@ -31,7 +32,7 @@ public:
}
/// Returns true if a region has been modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept {
return IteratePages<false>(
query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) {
return manager->template IsRegionModified<Type::GPU>(offset, size);
@ -57,8 +58,7 @@ public:
}
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified
template <typename Func>
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
IteratePages<true>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
manager->template ForEachModifiedRange<Type::CPU, true>(
@ -67,17 +67,12 @@ public:
}
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <bool clear, typename Func>
void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, Func&& func) {
template <bool clear>
void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) {
IteratePages<false>(query_cpu_range, query_size,
[&func](RegionManager* manager, u64 offset, size_t size) {
if constexpr (clear) {
manager->template ForEachModifiedRange<Type::GPU, true>(
manager->GetCpuAddr() + offset, size, func);
} else {
manager->template ForEachModifiedRange<Type::GPU, false>(
manager->GetCpuAddr() + offset, size, func);
}
manager->template ForEachModifiedRange<Type::GPU, clear>(
manager->GetCpuAddr() + offset, size, func);
});
}
@ -91,6 +86,7 @@ private:
*/
template <bool create_region_on_fail, typename Func>
bool IteratePages(VAddr cpu_address, size_t size, Func&& func) {
RENDERER_TRACE;
using FuncReturn = typename std::invoke_result<Func, RegionManager*, u64, size_t>::type;
static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
std::size_t remaining_size{size};

View file

@ -3,7 +3,10 @@
#pragma once
#include <boost/icl/discrete_interval.hpp>
#include <boost/icl/interval_map.hpp>
#include <boost/icl/split_interval_map.hpp>
#include <boost/icl/split_interval_set.hpp>
#include <boost/pool/pool.hpp>
#include <boost/pool/pool_alloc.hpp>
#include <boost/pool/poolfwd.hpp>
@ -38,6 +41,22 @@ struct RangeSet {
m_ranges_set.subtract(interval);
}
void Clear() {
m_ranges_set.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_set, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_set, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_set.empty()) {
@ -77,14 +96,29 @@ struct RangeSet {
}
}
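/// Calls func(gap_addr, gap_size) for every gap inside [base_addr, base_addr + size) that is
/// not covered by a stored range.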
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
base_addr = range_end;
});
if (base_addr != end_addr) {
func(base_addr, end_addr - base_addr);
}
}
IntervalSet m_ranges_set;
};
template <typename T>
class RangeMap {
public:
using IntervalMap =
boost::icl::interval_map<VAddr, u64, boost::icl::partial_absorber, std::less,
boost::icl::inplace_plus, boost::icl::inter_section,
boost::icl::interval_map<VAddr, T, boost::icl::total_absorber, std::less,
boost::icl::inplace_identity, boost::icl::inter_section,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalMap::interval_type;
@ -99,7 +133,7 @@ public:
RangeMap(RangeMap&& other);
RangeMap& operator=(RangeMap&& other);
void Add(VAddr base_address, size_t size, u64 value) {
void Add(VAddr base_address, size_t size, const T& value) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map.add({interval, value});
@ -111,6 +145,35 @@ public:
m_ranges_map -= interval;
}
void Clear() {
m_ranges_map.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_map, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_map, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
for (const auto& [interval, value] : m_ranges_map) {
const VAddr inter_addr_end = interval.upper();
const VAddr inter_addr = interval.lower();
func(inter_addr, inter_addr_end, value);
}
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
@ -140,7 +203,111 @@ public:
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, u64) {
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
base_addr = range_end;
});
if (base_addr != end_addr) {
func(base_addr, end_addr - base_addr);
}
}
private:
IntervalMap m_ranges_map;
};
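/// Interval map backed by boost::icl::split_interval_map: adjacent intervals with equal values
/// keep their original boundaries instead of being joined, so each registered range can be
/// reported individually by ForEachInRange.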
template <typename T>
class SplitRangeMap {
public:
using IntervalMap = boost::icl::split_interval_map<
VAddr, T, boost::icl::total_absorber, std::less, boost::icl::inplace_identity,
boost::icl::inter_section, ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalMap::interval_type;
public:
SplitRangeMap() = default;
~SplitRangeMap() = default;
SplitRangeMap(SplitRangeMap const&) = delete;
SplitRangeMap& operator=(SplitRangeMap const&) = delete;
SplitRangeMap(SplitRangeMap&& other);
SplitRangeMap& operator=(SplitRangeMap&& other);
void Add(VAddr base_address, size_t size, const T& value) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map.add({interval, value});
}
void Subtract(VAddr base_address, size_t size) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map -= interval;
}
void Clear() {
m_ranges_map.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_map, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_map, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
for (const auto& [interval, value] : m_ranges_map) {
const VAddr inter_addr_end = interval.upper();
const VAddr inter_addr = interval.lower();
func(inter_addr, inter_addr_end, value);
}
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
const VAddr start_address = base_addr;
const VAddr end_address = start_address + size;
const IntervalType search_interval{start_address, end_address};
auto it = m_ranges_map.lower_bound(search_interval);
if (it == m_ranges_map.end()) {
return;
}
auto end_it = m_ranges_map.upper_bound(search_interval);
for (; it != end_it; it++) {
VAddr inter_addr_end = it->first.upper();
VAddr inter_addr = it->first.lower();
if (inter_addr_end > end_address) {
inter_addr_end = end_address;
}
if (inter_addr < start_address) {
inter_addr = start_address;
}
func(inter_addr, inter_addr_end, it->second);
}
}
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}

View file

@ -10,8 +10,10 @@
#ifdef __linux__
#include "common/adaptive_mutex.h"
#endif
#else
#include "common/spin_lock.h"
#endif
#include "common/debug.h"
#include "common/types.h"
#include "video_core/page_manager.h"
@ -56,7 +58,7 @@ public:
return cpu_addr;
}
static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
static constexpr u64 ExtractBits(u64 word, size_t page_start, size_t page_end) {
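// Keeps only the bits in the range [page_start, page_end), clearing everything outside it.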
constexpr size_t number_bits = sizeof(u64) * 8;
const size_t limit_page_end = number_bits - std::min(page_end, number_bits);
u64 bits = (word >> page_start) << page_start;
@ -64,7 +66,7 @@ public:
return bits;
}
static std::pair<size_t, size_t> GetWordPage(VAddr address) {
static constexpr std::pair<size_t, size_t> GetWordPage(VAddr address) {
const size_t converted_address = static_cast<size_t>(address);
const size_t word_number = converted_address / BYTES_PER_WORD;
const size_t amount_pages = converted_address % BYTES_PER_WORD;
@ -73,6 +75,7 @@ public:
template <typename Func>
void IterateWords(size_t offset, size_t size, Func&& func) const {
RENDERER_TRACE;
using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>;
static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
const size_t start = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset), 0LL));
@ -104,13 +107,13 @@ public:
}
}
template <typename Func>
void IteratePages(u64 mask, Func&& func) const {
void IteratePages(u64 mask, auto&& func) const {
RENDERER_TRACE;
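// Walk runs of set bits in the mask, reporting each contiguous run of pages to func.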
size_t offset = 0;
while (mask != 0) {
const size_t empty_bits = std::countr_zero(mask);
offset += empty_bits;
mask = mask >> empty_bits;
mask >>= empty_bits;
const size_t continuous_bits = std::countr_one(mask);
func(offset, continuous_bits);
@ -155,8 +158,9 @@ public:
* @param size Size in bytes of the CPU range to loop over
* @param func Function to call for each turned off region
*/
template <Type type, bool clear, typename Func>
void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
template <Type type, bool clear>
void ForEachModifiedRange(VAddr query_cpu_range, s64 size, auto&& func) {
RENDERER_TRACE;
std::scoped_lock lk{lock};
static_assert(type != Type::Untracked);
@ -170,6 +174,7 @@ public:
(pending_pointer - pending_offset) * BYTES_PER_PAGE);
};
IterateWords(offset, size, [&](size_t index, u64 mask) {
RENDERER_TRACE;
if constexpr (type == Type::GPU) {
mask &= ~untracked[index];
}
@ -177,14 +182,13 @@ public:
if constexpr (clear) {
if constexpr (type == Type::CPU) {
UpdateProtection<true>(index, untracked[index], mask);
}
state_words[index] &= ~mask;
if constexpr (type == Type::CPU) {
untracked[index] &= ~mask;
}
state_words[index] &= ~mask;
}
const size_t base_offset = index * PAGES_PER_WORD;
IteratePages(word, [&](size_t pages_offset, size_t pages_size) {
RENDERER_TRACE;
const auto reset = [&]() {
pending_offset = base_offset + pages_offset;
pending_pointer = base_offset + pages_offset + pages_size;
@ -245,11 +249,13 @@ private:
*/
template <bool add_to_tracker>
void UpdateProtection(u64 word_index, u64 current_bits, u64 new_bits) const {
RENDERER_TRACE;
constexpr s32 delta = add_to_tracker ? 1 : -1;
u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
IteratePages(changed_bits, [&](size_t offset, size_t size) {
tracker->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE,
add_to_tracker ? 1 : -1);
tracker->UpdatePageWatchers<delta>(addr + offset * BYTES_PER_PAGE,
size * BYTES_PER_PAGE);
});
}

View file

@ -11,6 +11,7 @@ set(SHADER_FILES
detilers/micro_32bpp.comp
detilers/micro_64bpp.comp
detilers/micro_8bpp.comp
fault_buffer_process.comp
fs_tri.vert
fsr.comp
post_process.frag

View file

@ -0,0 +1,42 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450
#extension GL_ARB_gpu_shader_int64 : enable
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint fault_buffer[];
};
layout(std430, binding = 1) buffer output_buf {
uint64_t download_buffer[];
};
// Aliases the output buffer so the counter can be updated with 32-bit atomics
layout(std430, binding = 1) buffer output_buf32 {
uint download_buffer32[];
};
layout(constant_id = 0) const uint CACHING_PAGEBITS = 0;
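// Each invocation consumes one 32-bit word of the fault bitmap: every set bit marks a faulting
// page whose base address is appended to the download buffer, with a 32-bit atomic counter in
// the first element tracking how many entries were written.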
void main() {
uint id = gl_GlobalInvocationID.x;
uint word = fault_buffer[id];
if (word == 0u) {
return;
}
// 1 page per bit
uint base_bit = id * 32u;
while (word != 0u) {
uint bit = findLSB(word);
word &= word - 1;
uint page = base_bit + bit;
uint store_index = atomicAdd(download_buffer32[0], 1u) + 1u;
// It is very unlikely, but should we check for overflow?
if (store_index < 1024u) { // only support 1024 page faults
download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS;
}
}
}

View file

@ -1,11 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <thread>
#include <boost/icl/interval_set.hpp>
#include "common/alignment.h"
#include <boost/container/small_vector.hpp>
#include "common/assert.h"
#include "common/error.h"
#include "common/debug.h"
#include "common/signal_context.h"
#include "core/memory.h"
#include "core/signals.h"
@ -15,23 +13,60 @@
#ifndef _WIN64
#include <sys/mman.h>
#ifdef ENABLE_USERFAULTFD
#include <thread>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <sys/ioctl.h>
#include "common/error.h"
#endif
#else
#include <windows.h>
#endif
#ifdef __linux__
#include "common/adaptive_mutex.h"
#else
#include "common/spin_lock.h"
#endif
namespace VideoCore {
constexpr size_t PAGESIZE = 4_KB;
constexpr size_t PAGEBITS = 12;
constexpr size_t PAGE_SIZE = 4_KB;
constexpr size_t PAGE_BITS = 12;
#ifdef ENABLE_USERFAULTFD
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} {
struct PageState {
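// Number of active watchers on the page; any watcher downgrades the protection to read-only
// so CPU writes hit the fault handler.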
u8 num_watchers{};
Core::MemoryPermission Perm() const noexcept {
return num_watchers == 0 ? Core::MemoryPermission::ReadWrite
: Core::MemoryPermission::Read;
}
template <s32 delta>
u8 AddDelta() {
if constexpr (delta == 1) {
return ++num_watchers;
} else {
ASSERT_MSG(num_watchers > 0, "Not enough watchers");
return --num_watchers;
}
}
};
struct UpdateProtectRange {
VAddr addr;
u64 size;
Core::MemoryPermission perms;
};
static constexpr size_t ADDRESS_BITS = 40;
static constexpr size_t NUM_ADDRESS_PAGES = 1ULL << (ADDRESS_BITS - PAGE_BITS);
inline static Vulkan::Rasterizer* rasterizer;
#ifdef ENABLE_USERFAULTFD
Impl(Vulkan::Rasterizer* rasterizer_) {
rasterizer = rasterizer_;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
ASSERT_MSG(uffd != -1, "{}", Common::GetLastErrorMsg());
@ -63,7 +98,8 @@ struct PageManager::Impl {
ASSERT_MSG(ret != -1, "Uffdio unregister failed");
}
void Protect(VAddr address, size_t size, bool allow_write) {
void Protect(VAddr address, size_t size, Core::MemoryPermission perms) {
bool allow_write = True(perms & Core::MemoryPermission::Write);
uffdio_writeprotect wp;
wp.range.start = address;
wp.range.len = size;
@ -118,12 +154,9 @@ struct PageManager::Impl {
}
}
Vulkan::Rasterizer* rasterizer;
std::jthread ufd_thread;
int uffd;
};
#else
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) {
rasterizer = rasterizer_;
@ -141,12 +174,11 @@ struct PageManager::Impl {
// No-op
}
void Protect(VAddr address, size_t size, bool allow_write) {
void Protect(VAddr address, size_t size, Core::MemoryPermission perms) {
RENDERER_TRACE;
auto* memory = Core::Memory::Instance();
auto& impl = memory->GetAddressSpace();
impl.Protect(address, size,
allow_write ? Core::MemoryPermission::ReadWrite
: Core::MemoryPermission::Read);
impl.Protect(address, size, perms);
}
static bool GuestFaultSignalHandler(void* context, void* fault_address) {
@ -157,23 +189,82 @@ struct PageManager::Impl {
return false;
}
inline static Vulkan::Rasterizer* rasterizer;
};
#endif
template <s32 delta>
void UpdatePageWatchers(VAddr addr, u64 size) {
RENDERER_TRACE;
boost::container::small_vector<UpdateProtectRange, 16> update_ranges;
{
std::scoped_lock lk(lock);
size_t page = addr >> PAGE_BITS;
auto perms = cached_pages[page].Perm();
u64 range_begin = 0;
u64 range_bytes = 0;
const auto release_pending = [&] {
if (range_bytes > 0) {
RENDERER_TRACE;
// Add pending (un)protect action
update_ranges.push_back({range_begin << PAGE_BITS, range_bytes, perms});
range_bytes = 0;
}
};
// Iterate requested pages
const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
const u64 aligned_addr = page << PAGE_BITS;
const u64 aligned_end = page_end << PAGE_BITS;
ASSERT_MSG(rasterizer->IsMapped(aligned_addr, aligned_end - aligned_addr),
"Attempted to track non-GPU memory at address {:#x}, size {:#x}.",
aligned_addr, aligned_end - aligned_addr);
for (; page != page_end; ++page) {
PageState& state = cached_pages[page];
// Apply the change to the page state
const u8 new_count = state.AddDelta<delta>();
// If the protection changed add pending (un)protect action
if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
release_pending();
perms = new_perms;
}
// If the page must be (un)protected, add it to the pending range
if ((new_count == 0 && delta < 0) || (new_count == 1 && delta > 0)) {
if (range_bytes == 0) {
range_begin = page;
}
range_bytes += PAGE_SIZE;
} else {
release_pending();
}
}
// Add pending (un)protect action
release_pending();
}
// Flush deferred protects
for (const auto& range : update_ranges) {
Protect(range.addr, range.size, range.perms);
}
}
std::array<PageState, NUM_ADDRESS_PAGES> cached_pages{};
#ifdef __linux__
Common::AdaptiveMutex lock;
#else
Common::SpinLock lock;
#endif
};
PageManager::PageManager(Vulkan::Rasterizer* rasterizer_)
: impl{std::make_unique<Impl>(rasterizer_)}, rasterizer{rasterizer_} {}
: impl{std::make_unique<Impl>(rasterizer_)} {}
PageManager::~PageManager() = default;
VAddr PageManager::GetPageAddr(VAddr addr) {
return Common::AlignDown(addr, PAGESIZE);
}
VAddr PageManager::GetNextPageAddr(VAddr addr) {
return Common::AlignUp(addr + 1, PAGESIZE);
}
void PageManager::OnGpuMap(VAddr address, size_t size) {
impl->OnMap(address, size);
}
@ -182,41 +273,12 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) {
impl->OnUnmap(address, size);
}
void PageManager::UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta) {
static constexpr u64 PageShift = 12;
std::scoped_lock lk{lock};
const u64 num_pages = ((addr + size - 1) >> PageShift) - (addr >> PageShift) + 1;
const u64 page_start = addr >> PageShift;
const u64 page_end = page_start + num_pages;
const auto pages_interval =
decltype(cached_pages)::interval_type::right_open(page_start, page_end);
if (delta > 0) {
cached_pages.add({pages_interval, delta});
}
const auto& range = cached_pages.equal_range(pages_interval);
for (const auto& [range, count] : boost::make_iterator_range(range)) {
const auto interval = range & pages_interval;
const VAddr interval_start_addr = boost::icl::first(interval) << PageShift;
const VAddr interval_end_addr = boost::icl::last_next(interval) << PageShift;
const u32 interval_size = interval_end_addr - interval_start_addr;
ASSERT_MSG(rasterizer->IsMapped(interval_start_addr, interval_size),
"Attempted to track non-GPU memory at address {:#x}, size {:#x}.",
interval_start_addr, interval_size);
if (delta > 0 && count == delta) {
impl->Protect(interval_start_addr, interval_size, false);
} else if (delta < 0 && count == -delta) {
impl->Protect(interval_start_addr, interval_size, true);
} else {
ASSERT(count >= 0);
}
}
if (delta < 0) {
cached_pages.add({pages_interval, delta});
}
template <s32 delta>
void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const {
impl->UpdatePageWatchers<delta>(addr, size);
}
template void PageManager::UpdatePageWatchers<1>(VAddr addr, u64 size) const;
template void PageManager::UpdatePageWatchers<-1>(VAddr addr, u64 size) const;
} // namespace VideoCore

View file

@ -4,11 +4,7 @@
#pragma once
#include <memory>
#include <boost/icl/interval_map.hpp>
#ifdef __linux__
#include "common/adaptive_mutex.h"
#endif
#include "common/spin_lock.h"
#include "common/alignment.h"
#include "common/types.h"
namespace Vulkan {
@ -18,6 +14,9 @@ class Rasterizer;
namespace VideoCore {
class PageManager {
static constexpr size_t PAGE_BITS = 12;
static constexpr size_t PAGE_SIZE = 1ULL << PAGE_BITS;
public:
explicit PageManager(Vulkan::Rasterizer* rasterizer);
~PageManager();
@ -28,22 +27,23 @@ public:
/// Unregister a range of gpu memory that was unmapped.
void OnGpuUnmap(VAddr address, size_t size);
/// Increase/decrease the number of surface in pages touching the specified region
void UpdatePagesCachedCount(VAddr addr, u64 size, s32 delta);
/// Updates watches in the pages touching the specified region.
template <s32 delta>
void UpdatePageWatchers(VAddr addr, u64 size) const;
static VAddr GetPageAddr(VAddr addr);
static VAddr GetNextPageAddr(VAddr addr);
/// Returns page aligned address.
static constexpr VAddr GetPageAddr(VAddr addr) {
return Common::AlignDown(addr, PAGE_SIZE);
}
/// Returns address of the next page.
static constexpr VAddr GetNextPageAddr(VAddr addr) {
return Common::AlignUp(addr + 1, PAGE_SIZE);
}
private:
struct Impl;
std::unique_ptr<Impl> impl;
Vulkan::Rasterizer* rasterizer;
boost::icl::interval_map<VAddr, s32> cached_pages;
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
Common::AdaptiveMutex lock;
#else
Common::SpinLock lock;
#endif
};
} // namespace VideoCore

View file

@ -121,6 +121,7 @@ void SetOutputDir(const std::filesystem::path& path, const std::string& prefix)
if (!rdoc_api) {
return;
}
LOG_WARNING(Common, "RenderDoc capture path: {}", (path / prefix).string());
rdoc_api->SetCaptureFilePathTemplate(fmt::UTF((path / prefix).u8string()).data.data());
}

View file

@ -147,6 +147,7 @@ Instance::Instance(Frontend::WindowSDL& window, s32 physical_device_index,
available_extensions = GetSupportedExtensions(physical_device);
format_properties = GetFormatProperties(physical_device);
properties = physical_device.getProperties();
memory_properties = physical_device.getMemoryProperties();
CollectDeviceParameters();
ASSERT_MSG(properties.apiVersion >= TargetVulkanApiVersion,
"Vulkan {}.{} is required, but only {}.{} is supported by device!",
@ -375,6 +376,7 @@ bool Instance::CreateDevice() {
.separateDepthStencilLayouts = vk12_features.separateDepthStencilLayouts,
.hostQueryReset = vk12_features.hostQueryReset,
.timelineSemaphore = vk12_features.timelineSemaphore,
.bufferDeviceAddress = vk12_features.bufferDeviceAddress,
},
vk::PhysicalDeviceVulkan13Features{
.robustImageAccess = vk13_features.robustImageAccess,
@ -505,6 +507,7 @@ void Instance::CreateAllocator() {
};
const VmaAllocatorCreateInfo allocator_info = {
.flags = VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT,
.physicalDevice = physical_device,
.device = *device,
.pVulkanFunctions = &functions,

View file

@ -286,6 +286,11 @@ public:
return vk12_props;
}
/// Returns the memory properties of the physical device.
const vk::PhysicalDeviceMemoryProperties& GetMemoryProperties() const noexcept {
return memory_properties;
}
/// Returns true if shaders can declare the ClipDistance attribute
bool IsShaderClipDistanceSupported() const {
return features.shaderClipDistance;
@ -335,6 +340,7 @@ private:
vk::PhysicalDevice physical_device;
vk::UniqueDevice device;
vk::PhysicalDeviceProperties properties;
vk::PhysicalDeviceMemoryProperties memory_properties;
vk::PhysicalDeviceVulkan11Properties vk11_props;
vk::PhysicalDeviceVulkan12Properties vk12_props;
vk::PhysicalDevicePushDescriptorPropertiesKHR push_descriptor_props;

View file

@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_)
: instance{instance_}, scheduler{scheduler_}, page_manager{this},
buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager},
texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
if (!Config::nullGpu()) {
@ -439,6 +439,13 @@ void Rasterizer::Finish() {
scheduler.Finish();
}
void Rasterizer::ProcessFaults() {
if (fault_process_pending) {
fault_process_pending = false;
buffer_cache.ProcessFaultBuffer();
}
}
bool Rasterizer::BindResources(const Pipeline* pipeline) {
if (IsComputeMetaClear(pipeline)) {
return false;
@ -449,6 +456,8 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
buffer_infos.clear();
image_infos.clear();
bool uses_dma = false;
// Bind resource buffers and textures.
Shader::Backend::Bindings binding{};
Shader::PushData push_data = MakeUserData(liverpool->regs);
@ -459,9 +468,28 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
stage->PushUd(binding, push_data);
BindBuffers(*stage, binding, push_data);
BindTextures(*stage, binding);
uses_dma |= stage->dma_types != Shader::IR::Type::Void;
}
pipeline->BindResources(set_writes, buffer_barriers, push_data);
if (uses_dma && !fault_process_pending) {
// We only use fault buffer for DMA right now.
{
// TODO: GPU might have written to memory (for example with EVENT_WRITE_EOP)
// we need to account for that and synchronize.
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
for (auto& range : mapped_ranges) {
buffer_cache.SynchronizeBuffersInRange(range.lower(),
range.upper() - range.lower());
}
}
buffer_cache.MemoryBarrier();
}
fault_process_pending |= uses_dma;
return true;
}
@ -520,12 +548,18 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
if (desc.buffer_type == Shader::BufferType::GdsBuffer) {
const auto* gds_buf = buffer_cache.GetGdsBuffer();
buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes());
} else if (desc.buffer_type == Shader::BufferType::ReadConstUbo) {
} else if (desc.buffer_type == Shader::BufferType::Flatbuf) {
auto& vk_buffer = buffer_cache.GetStreamBuffer();
const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size,
instance.UniformMinAlignment());
buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size);
} else if (desc.buffer_type == Shader::BufferType::BdaPagetable) {
const auto* bda_buffer = buffer_cache.GetBdaPageTableBuffer();
buffer_infos.emplace_back(bda_buffer->Handle(), 0, bda_buffer->SizeBytes());
} else if (desc.buffer_type == Shader::BufferType::FaultBuffer) {
const auto* fault_buffer = buffer_cache.GetFaultBuffer();
buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes());
} else if (desc.buffer_type == Shader::BufferType::SharedMemory) {
auto& lds_buffer = buffer_cache.GetStreamBuffer();
const auto& cs_program = liverpool->GetCsRegs();
@ -925,7 +959,7 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
// Not GPU mapped memory, can skip invalidation logic entirely.
return false;
}
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, false);
texture_cache.InvalidateMemory(addr, size);
return true;
}
@ -937,24 +971,24 @@ bool Rasterizer::IsMapped(VAddr addr, u64 size) {
}
const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
std::shared_lock lock{mapped_ranges_mutex};
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
return boost::icl::contains(mapped_ranges, range);
}
void Rasterizer::MapMemory(VAddr addr, u64 size) {
{
std::unique_lock lock{mapped_ranges_mutex};
std::scoped_lock lock{mapped_ranges_mutex};
mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
}
page_manager.OnGpuMap(addr, size);
}
void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, true);
texture_cache.UnmapMemory(addr, size);
page_manager.OnGpuUnmap(addr, size);
{
std::unique_lock lock{mapped_ranges_mutex};
std::scoped_lock lock{mapped_ranges_mutex};
mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
}
}

View file

@ -4,7 +4,7 @@
#pragma once
#include <shared_mutex>
#include "common/recursive_lock.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
@ -65,11 +65,21 @@ public:
void CpSync();
u64 Flush();
void Finish();
void ProcessFaults();
PipelineCache& GetPipelineCache() {
return pipeline_cache;
}
template <typename Func>
void ForEachMappedRangeInRange(VAddr addr, u64 size, Func&& func) {
const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
for (const auto& mapped_range : (mapped_ranges & range)) {
func(mapped_range);
}
}
private:
RenderState PrepareRenderState(u32 mrt_mask);
void BeginRendering(const GraphicsPipeline& pipeline, RenderState& state);
@ -100,6 +110,8 @@ private:
bool IsComputeMetaClear(const Pipeline* pipeline);
private:
friend class VideoCore::BufferCache;
const Instance& instance;
Scheduler& scheduler;
VideoCore::PageManager page_manager;
@ -126,6 +138,7 @@ private:
boost::container::static_vector<BufferBindingInfo, Shader::NumBuffers> buffer_bindings;
using ImageBindingInfo = std::pair<VideoCore::ImageId, VideoCore::TextureCache::TextureDesc>;
boost::container::static_vector<ImageBindingInfo, Shader::NumImages> image_bindings;
bool fault_process_pending{false};
};
} // namespace Vulkan

View file

@ -70,6 +70,11 @@ void Scheduler::Flush(SubmitInfo& info) {
SubmitExecution(info);
}
void Scheduler::Flush() {
SubmitInfo info{};
Flush(info);
}
void Scheduler::Finish() {
// When finishing, we need to wait for the submission to have executed on the device.
const u64 presubmit_tick = CurrentTick();
@ -85,6 +90,15 @@ void Scheduler::Wait(u64 tick) {
Flush(info);
}
master_semaphore.Wait(tick);
// CAUTION: This can introduce unexpected variation in the wait time.
// We don't currently sync the GPU, and some games are very sensitive to this.
// If this becomes a problem, it can be commented out.
// Ideally we would implement proper GPU sync.
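// Execute deferred operations whose GPU tick has been reached.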
while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) {
pending_ops.front().callback();
pending_ops.pop();
}
}
void Scheduler::AllocateWorkerCommandBuffers() {

View file

@ -307,6 +307,10 @@ public:
/// and increments the scheduler timeline semaphore.
void Flush(SubmitInfo& info);
/// Sends the current execution context to the GPU
/// and increments the scheduler timeline semaphore.
void Flush();
/// Sends the current execution context to the GPU and waits for it to complete.
void Finish();

View file

@ -222,14 +222,23 @@ std::tuple<ImageId, int, int> TextureCache::ResolveOverlap(const ImageInfo& imag
-1, -1};
}
ImageId new_image_id{};
if (image_info.type == tex_cache_image.info.type) {
ASSERT(image_info.resources > tex_cache_image.info.resources);
new_image_id = ExpandImage(image_info, cache_image_id);
} else {
UNREACHABLE();
if (image_info.type == tex_cache_image.info.type &&
image_info.resources > tex_cache_image.info.resources) {
// Size and resources are greater, expand the image.
return {ExpandImage(image_info, cache_image_id), -1, -1};
}
return {new_image_id, -1, -1};
if (image_info.tiling_mode != tex_cache_image.info.tiling_mode) {
// Size is greater but resources are not, because the tiling mode is different.
// Likely this memory address is being reused for a different image with a different
// tiling mode.
if (safe_to_delete) {
FreeImage(cache_image_id);
}
return {merged_image_id, -1, -1};
}
UNREACHABLE_MSG("Encountered unresolvable image overlap with equal memory address.");
}
// Right overlap, the image requested is a possible subresource of the image from cache.
@ -538,10 +547,16 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
image.mip_hashes[m] = hash;
}
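// Clamp the copy extent to the reported mip pitch/height when they are smaller than the
// image dimensions.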
auto mip_pitch = static_cast<u32>(mip.pitch);
auto mip_height = static_cast<u32>(mip.height);
auto image_extent_width = mip_pitch ? std::min(mip_pitch, width) : width;
auto image_extent_height = mip_height ? std::min(mip_height, height) : height;
image_copy.push_back({
.bufferOffset = mip.offset,
.bufferRowLength = static_cast<u32>(mip.pitch),
.bufferImageHeight = static_cast<u32>(mip.height),
.bufferRowLength = mip_pitch,
.bufferImageHeight = mip_height,
.imageSubresource{
.aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
.mipLevel = m,
@ -549,7 +564,7 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
.layerCount = num_layers,
},
.imageOffset = {0, 0, 0},
.imageExtent = {width, height, depth},
.imageExtent = {image_extent_width, image_extent_height, depth},
});
}
@ -672,7 +687,7 @@ void TextureCache::TrackImage(ImageId image_id) {
// Re-track the whole image
image.track_addr = image_begin;
image.track_addr_end = image_end;
tracker.UpdatePagesCachedCount(image_begin, image.info.guest_size, 1);
tracker.UpdatePageWatchers<1>(image_begin, image.info.guest_size);
} else {
if (image_begin < image.track_addr) {
TrackImageHead(image_id);
@ -695,7 +710,7 @@ void TextureCache::TrackImageHead(ImageId image_id) {
ASSERT(image.track_addr != 0 && image_begin < image.track_addr);
const auto size = image.track_addr - image_begin;
image.track_addr = image_begin;
tracker.UpdatePagesCachedCount(image_begin, size, 1);
tracker.UpdatePageWatchers<1>(image_begin, size);
}
void TextureCache::TrackImageTail(ImageId image_id) {
@ -711,7 +726,7 @@ void TextureCache::TrackImageTail(ImageId image_id) {
const auto addr = image.track_addr_end;
const auto size = image_end - image.track_addr_end;
image.track_addr_end = image_end;
tracker.UpdatePagesCachedCount(addr, size, 1);
tracker.UpdatePageWatchers<1>(addr, size);
}
void TextureCache::UntrackImage(ImageId image_id) {
@ -724,7 +739,7 @@ void TextureCache::UntrackImage(ImageId image_id) {
image.track_addr = 0;
image.track_addr_end = 0;
if (size != 0) {
tracker.UpdatePagesCachedCount(addr, size, -1);
tracker.UpdatePageWatchers<-1>(addr, size);
}
}
@ -743,7 +758,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) {
// Check its hash later.
MarkAsMaybeDirty(image_id, image);
}
tracker.UpdatePagesCachedCount(image_begin, size, -1);
tracker.UpdatePageWatchers<-1>(image_begin, size);
}
void TextureCache::UntrackImageTail(ImageId image_id) {
@ -762,7 +777,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) {
// Check its hash later.
MarkAsMaybeDirty(image_id, image);
}
tracker.UpdatePagesCachedCount(addr, size, -1);
tracker.UpdatePageWatchers<-1>(addr, size);
}
void TextureCache::DeleteImage(ImageId image_id) {