Merge pull request #4729 from ameerj/nvdec-prod

video_core: NVDEC Implementation
2020-10-26 23:02:42 -07:00 · 2020-10-26 23:02:42 -07:00 · d33399e1f4
commit d33399e1f4
parent c7f32931ee eb67a45ca8
53 changed files with 4033 additions and 310 deletions
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -439,6 +439,8 @@ add_library(core STATIC
    hle/service/nvdrv/devices/nvhost_gpu.h
    hle/service/nvdrv/devices/nvhost_nvdec.cpp
    hle/service/nvdrv/devices/nvhost_nvdec.h
+    hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+    hle/service/nvdrv/devices/nvhost_nvdec_common.h
    hle/service/nvdrv/devices/nvhost_nvjpg.cpp
    hle/service/nvdrv/devices/nvhost_nvjpg.h
    hle/service/nvdrv/devices/nvhost_vic.cpp
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@ -2,15 +2,17 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <cstring>
-
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"

 namespace Service::Nvidia::Devices {

-nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {}
+nvhost_nvdec::nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
 nvhost_nvdec::~nvhost_nvdec() = default;

 u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -21,7 +23,7 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::

    switch (static_cast<IoctlCommand>(command.raw)) {
    case IoctlCommand::IocSetNVMAPfdCommand:
-        return SetNVMAPfd(input, output);
+        return SetNVMAPfd(input);
    case IoctlCommand::IocSubmit:
        return Submit(input, output);
    case IoctlCommand::IocGetSyncpoint:
@ -29,79 +31,29 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
    case IoctlCommand::IocGetWaitbase:
        return GetWaitbase(input, output);
    case IoctlCommand::IocMapBuffer:
-        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBuffer2:
+    case IoctlCommand::IocMapBuffer3:
    case IoctlCommand::IocMapBufferEx:
-        return MapBufferEx(input, output);
-    case IoctlCommand::IocUnmapBufferEx:
-        return UnmapBufferEx(input, output);
+        return MapBuffer(input, output);
+    case IoctlCommand::IocUnmapBufferEx: {
+        // This command is sent when the video stream has ended, flush all video contexts
+        // This is usually sent in the following order: vic, nvdec, vic.
+        // Inform the GPU to clear any remaining nvdec buffers when this is detected.
+        LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
+        Tegra::ChCommandHeaderList cmdlist(1);
+        cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F};
+        system.GPU().PushCommandBuffer(cmdlist);
+        [[fallthrough]]; // fallthrough to unmap buffers
+    };
+    case IoctlCommand::IocUnmapBuffer:
+    case IoctlCommand::IocUnmapBuffer2:
+    case IoctlCommand::IocUnmapBuffer3:
+        return UnmapBuffer(input, output);
+    case IoctlCommand::IocSetSubmitTimeout:
+        return SetSubmitTimeout(input, output);
    }

-    UNIMPLEMENTED_MSG("Unimplemented ioctl");
-    return 0;
-}
-
-u32 nvhost_nvdec::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSetNvmapFD params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
-    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
-
-    nvmap_fd = params.nvmap_fd;
-    return 0;
-}
-
-u32 nvhost_nvdec::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSubmit params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
-    return 0;
-}
-
-u32 nvhost_nvdec::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetSyncpoint params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
-    return 0;
-}
-
-u32 nvhost_nvdec::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetWaitbase params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
-    return 0;
-}
-
-u32 nvhost_nvdec::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBuffer params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
-    return 0;
-}
-
-u32 nvhost_nvdec::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
-    return 0;
-}
-
-u32 nvhost_nvdec::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlUnmapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
+    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
    return 0;
 }

--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
@ -4,16 +4,14 @@

 #pragma once

-#include <vector>
-#include "common/common_types.h"
-#include "common/swap.h"
-#include "core/hle/service/nvdrv/devices/nvdevice.h"
+#include <memory>
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"

 namespace Service::Nvidia::Devices {

-class nvhost_nvdec final : public nvdevice {
+class nvhost_nvdec final : public nvhost_nvdec_common {
 public:
-    explicit nvhost_nvdec(Core::System& system);
+    explicit nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_nvdec() override;

    u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -27,62 +25,15 @@ private:
        IocGetSyncpoint = 0xC0080002,
        IocGetWaitbase = 0xC0080003,
        IocMapBuffer = 0xC01C0009,
+        IocMapBuffer2 = 0xC16C0009,
+        IocMapBuffer3 = 0xC15C0009,
        IocMapBufferEx = 0xC0A40009,
-        IocUnmapBufferEx = 0xC0A4000A,
+        IocUnmapBuffer = 0xC0A4000A,
+        IocUnmapBuffer2 = 0xC16C000A,
+        IocUnmapBufferEx = 0xC01C000A,
+        IocUnmapBuffer3 = 0xC15C000A,
+        IocSetSubmitTimeout = 0x40040007,
    };
-
-    struct IoctlSetNvmapFD {
-        u32_le nvmap_fd;
-    };
-    static_assert(sizeof(IoctlSetNvmapFD) == 0x4, "IoctlSetNvmapFD is incorrect size");
-
-    struct IoctlSubmit {
-        INSERT_PADDING_BYTES(0x40); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit has incorrect size");
-
-    struct IoctlGetSyncpoint {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetSyncpoint) == 0x08, "IoctlGetSyncpoint has incorrect size");
-
-    struct IoctlGetWaitbase {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetWaitbase) == 0x08, "IoctlGetWaitbase has incorrect size");
-
-    struct IoctlMapBuffer {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
-
-    struct IoctlMapBufferEx {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x98); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBufferEx) == 0xA4, "IoctlMapBufferEx has incorrect size");
-
-    struct IoctlUnmapBufferEx {
-        INSERT_PADDING_BYTES(0xA4); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlUnmapBufferEx) == 0xA4, "IoctlUnmapBufferEx has incorrect size");
-
-    u32_le nvmap_fd{};
-
-    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@ -0,0 +1,234 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"
+#include "core/hle/service/nvdrv/devices/nvmap.h"
+#include "core/memory.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"
+
+namespace Service::Nvidia::Devices {
+
+namespace {
+// Copies up to `count` elements of type T out of `input`, starting at byte `offset`, into `dst`.
+// The copy is clamped to the bytes actually present in `input` and to the storage of `dst`, so a
+// short or malformed input buffer leaves the remaining bytes of `dst` untouched instead of
+// reading or writing out of bounds.
+// Returns the offset just past the requested span (offset + count * sizeof(T)), regardless of how
+// many bytes were actually copied, so consecutive calls keep following the expected layout.
+template <typename T>
+std::size_t SpliceVectors(const std::vector<u8>& input, std::vector<T>& dst, std::size_t count,
+                          std::size_t offset) {
+    const std::size_t requested = count * sizeof(T);
+    const std::size_t available = offset < input.size() ? input.size() - offset : 0;
+    const std::size_t writable = dst.size() * sizeof(T);
+    const std::size_t bytes = std::min({requested, available, writable});
+    // Skip the call entirely for empty spans: data() may be null for empty vectors.
+    if (bytes != 0) {
+        std::memcpy(dst.data(), input.data() + offset, bytes);
+    }
+    return offset + requested;
+}
+
+// Serializes every element of `src` into `dst`, starting at byte `offset`.
+// The write is clamped to the space remaining in `dst`, so an undersized output buffer is
+// truncated rather than overrun.
+// Returns the offset just past the full span (offset + src.size() * sizeof(T)).
+template <typename T>
+std::size_t WriteVectors(std::vector<u8>& dst, const std::vector<T>& src, std::size_t offset) {
+    const std::size_t requested = src.size() * sizeof(T);
+    const std::size_t writable = offset < dst.size() ? dst.size() - offset : 0;
+    const std::size_t bytes = std::min(requested, writable);
+    // Skip the call entirely for empty spans: data() may be null for empty vectors.
+    if (bytes != 0) {
+        std::memcpy(dst.data() + offset, src.data(), bytes);
+    }
+    return offset + requested;
+}
+} // Anonymous namespace
+
+// Result codes returned by the ioctl handlers in this file.
+namespace NvErrCodes {
+constexpr u32 Success = 0;
+constexpr auto OutOfMemory = static_cast<u32>(-12);
+constexpr auto InvalidInput = static_cast<u32>(-22);
+} // namespace NvErrCodes
+
+// Keeps a shared reference to the nvmap device; the handlers below use it to look up objects by
+// handle.
+nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvdevice{system}, nvmap_dev{std::move(nvmap_dev)} {}
+
+nvhost_nvdec_common::~nvhost_nvdec_common() = default;
+
+// Records the nvmap file descriptor carried in the request. Produces no output payload.
+u32 nvhost_nvdec_common::SetNVMAPfd(const std::vector<u8>& input) {
+    IoctlSetNvmapFD request{};
+    std::memcpy(&request, input.data(), sizeof(request));
+    LOG_DEBUG(Service_NVDRV, "called, fd={}", request.nvmap_fd);
+
+    nvmap_fd = request.nvmap_fd;
+    return NvErrCodes::Success;
+}
+
+// Handles the Submit ioctl: parses the variable-length submission that follows the IoctlSubmit
+// header, pushes every referenced command buffer to the GPU, and echoes the request back.
+// Input layout after the header, in order: command buffers, relocations, relocation shifts,
+// syncpoint increments, wait checks, fences.
+// Returns NvErrCodes::Success on completion. A command buffer whose nvmap handle cannot be
+// resolved goes through ASSERT_OR_EXECUTE with a `return NvErrCodes::InvalidInput;` statement
+// (NOTE(review): presumably executed when `object` is null - confirm against common/assert.h).
+u32 nvhost_nvdec_common::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlSubmit params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
+    LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count);
+
+    // Instantiate param buffers
+    // Each vector is sized from a count taken out of the request header.
+    std::size_t offset = sizeof(IoctlSubmit);
+    std::vector<CommandBuffer> command_buffers(params.cmd_buffer_count);
+    std::vector<Reloc> relocs(params.relocation_count);
+    std::vector<u32> reloc_shifts(params.relocation_count);
+    std::vector<SyncptIncr> syncpt_increments(params.syncpoint_count);
+    std::vector<SyncptIncr> wait_checks(params.syncpoint_count);
+    std::vector<Fence> fences(params.fence_count);
+
+    // Splice input into their respective buffers
+    // Order matters: each call starts where the previous section ended.
+    offset = SpliceVectors(input, command_buffers, params.cmd_buffer_count, offset);
+    offset = SpliceVectors(input, relocs, params.relocation_count, offset);
+    offset = SpliceVectors(input, reloc_shifts, params.relocation_count, offset);
+    offset = SpliceVectors(input, syncpt_increments, params.syncpoint_count, offset);
+    offset = SpliceVectors(input, wait_checks, params.syncpoint_count, offset);
+    offset = SpliceVectors(input, fences, params.fence_count, offset);
+
+    // TODO(ameerj): For async gpu, utilize fences for syncpoint 'max' increment
+
+    auto& gpu = system.GPU();
+
+    // For every command buffer: resolve its nvmap object, find the GPU mapping registered for the
+    // object's dma_map_addr, read word_count words from GPU memory and push them to the GPU.
+    for (const auto& cmd_buffer : command_buffers) {
+        auto object = nvmap_dev->GetObject(cmd_buffer.memory_id);
+        ASSERT_OR_EXECUTE(object, return NvErrCodes::InvalidInput;);
+        const auto map = FindBufferMap(object->dma_map_addr);
+        if (!map) {
+            LOG_ERROR(Service_NVDRV, "Tried to submit an invalid offset 0x{:X} dma 0x{:X}",
+                      object->addr, object->dma_map_addr);
+            // Note: 0 is the same value as NvErrCodes::Success, and nothing is written to `output`
+            // on this path.
+            return 0;
+        }
+        Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
+        gpu.MemoryManager().ReadBlock(map->StartAddr() + cmd_buffer.offset, cmdlist.data(),
+                                      cmdlist.size() * sizeof(u32));
+        gpu.PushCommandBuffer(cmdlist);
+    }
+
+    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
+    // Some games expect command_buffers to be written back
+    // The sections are written in the same order they were read; `fences` is not written back.
+    offset = sizeof(IoctlSubmit);
+    offset = WriteVectors(output, command_buffers, offset);
+    offset = WriteVectors(output, relocs, offset);
+    offset = WriteVectors(output, reloc_shifts, offset);
+    offset = WriteVectors(output, syncpt_increments, offset);
+    offset = WriteVectors(output, wait_checks, offset);
+
+    return NvErrCodes::Success;
+}
+
+// Handles the GetSyncpoint ioctl. The request is echoed back with `value` forced to 0.
+u32 nvhost_nvdec_common::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetSyncpoint request{};
+    std::memcpy(&request, input.data(), sizeof(request));
+    LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", request.param);
+
+    // Implementing this was found to cause deadlocks with async gpu, along with degraded
+    // performance, so the value stays zeroed. TODO: RE the nvdec async implementation
+    request.value = 0;
+    std::memcpy(output.data(), &request, sizeof(request));
+
+    return NvErrCodes::Success;
+}
+
+// Handles the GetWaitbase ioctl. The request is echoed back with `value` forced to 0.
+u32 nvhost_nvdec_common::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetWaitbase request{};
+    std::memcpy(&request, input.data(), sizeof(request));
+    request.value = 0; // Seems to be hard coded at 0
+    std::memcpy(output.data(), &request, sizeof(request));
+    return NvErrCodes::Success;
+}
+
+// Handles the MapBuffer ioctl family: for each nvmap handle listed after the IoctlMapBuffer
+// header, makes sure the object has a GPU address (allocating one via MapAllocate32 when its
+// dma_map_addr is 0), registers the buffer mapping and reports the address back in the entry.
+// On success the header followed by the updated entries is written to `output`.
+// Returns InvalidInput if any handle is unknown (only the header is echoed back in that case),
+// otherwise Success.
+u32 nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
+
+    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
+
+    auto& gpu = system.GPU();
+
+    for (auto& cmf_buff : cmd_buffer_handles) {
+        auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
+        if (!object) {
+            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
+            // `output` can be larger than IoctlMapBuffer, so bound the copy by the struct size;
+            // copying output.size() bytes would read past the end of `params`.
+            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(IoctlMapBuffer)));
+            return NvErrCodes::InvalidInput;
+        }
+        if (object->dma_map_addr == 0) {
+            // NVDEC and VIC memory is in the 32-bit address space
+            // MapAllocate32 will attempt to map a lower 32-bit value in the shared gpu memory space
+            const GPUVAddr low_addr = gpu.MemoryManager().MapAllocate32(object->addr, object->size);
+            object->dma_map_addr = static_cast<u32>(low_addr);
+            // Ensure that the dma_map_addr is indeed in the lower 32-bit address space.
+            ASSERT(object->dma_map_addr == low_addr);
+        }
+        if (!object->dma_map_addr) {
+            LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
+        } else {
+            cmf_buff.map_address = object->dma_map_addr;
+            AddBufferMap(object->dma_map_addr, object->size, object->addr,
+                         object->status == nvmap::Object::Status::Allocated);
+        }
+    }
+    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
+    std::memcpy(output.data() + sizeof(IoctlMapBuffer), cmd_buffer_handles.data(),
+                cmd_buffer_handles.size() * sizeof(MapBufferEntry));
+
+    return NvErrCodes::Success;
+}
+
+// Handles the UnmapBuffer ioctl family: for each nvmap handle listed after the IoctlMapBuffer
+// header, drops the registered buffer mapping, unmaps it from GPU memory when one was registered,
+// and clears the object's dma_map_addr. On success the whole output buffer is zero-filled.
+// Returns InvalidInput if any handle is unknown (only the header is echoed back in that case),
+// otherwise Success.
+u32 nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
+    SpliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
+
+    auto& gpu = system.GPU();
+
+    for (auto& cmf_buff : cmd_buffer_handles) {
+        const auto object{nvmap_dev->GetObject(cmf_buff.map_handle)};
+        if (!object) {
+            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmf_buff.map_handle);
+            // `output` can be larger than IoctlMapBuffer, so bound the copy by the struct size;
+            // copying output.size() bytes would read past the end of `params`.
+            std::memcpy(output.data(), &params, std::min(output.size(), sizeof(IoctlMapBuffer)));
+            return NvErrCodes::InvalidInput;
+        }
+        if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
+            gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
+        } else {
+            // This occurs quite frequently, however does not seem to impact functionality
+            LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
+                      object->dma_map_addr);
+        }
+        object->dma_map_addr = 0;
+    }
+    std::memset(output.data(), 0, output.size());
+    return NvErrCodes::Success;
+}
+
+// Handles the SetSubmitTimeout ioctl (stubbed): stores the submitted value in `submit_timeout`
+// and does nothing else with it. `output` is left untouched. Always returns Success.
+u32 nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output) {
+    // Never copy more than the destination can hold, whatever size the caller passes in.
+    std::memcpy(&submit_timeout, input.data(), std::min(input.size(), sizeof(submit_timeout)));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
+    return NvErrCodes::Success;
+}
+
+// Looks up the registered buffer mapping whose [StartAddr, EndAddr) range contains `gpu_addr`.
+// Returns a copy of that mapping, or std::nullopt when no mapping covers the address.
+std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
+    GPUVAddr gpu_addr) const {
+    // Only entries whose key is <= gpu_addr are candidates, so stop at upper_bound.
+    const auto search_end = buffer_mappings.upper_bound(gpu_addr);
+    const auto it = std::find_if(buffer_mappings.begin(), search_end, [&](const auto& entry) {
+        return gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr();
+    });
+
+    // find_if yields the end of the searched range (search_end, not end()) on a miss, so compare
+    // against that; otherwise a miss would hand back an unrelated mapping or dereference end().
+    if (it == search_end) {
+        return std::nullopt;
+    }
+    return it->second;
+}
+
+// Registers the mapping that starts at `gpu_addr`, replacing any entry already stored under
+// that key.
+void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
+                                       bool is_allocated) {
+    const BufferMap mapping{gpu_addr, size, cpu_addr, is_allocated};
+    buffer_mappings.insert_or_assign(gpu_addr, mapping);
+}
+
+// Drops the mapping keyed by `gpu_addr`.
+// Returns std::nullopt when no such mapping exists; otherwise the mapping's size if it was flagged
+// as allocated, or 0 if it was not.
+std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
+    const auto entry = buffer_mappings.find(gpu_addr);
+    if (entry == buffer_mappings.end()) {
+        return std::nullopt;
+    }
+    // Read the size before erasing; the reference dies with the entry.
+    const BufferMap& mapping = entry->second;
+    const std::size_t size = mapping.IsAllocated() ? mapping.Size() : 0;
+    buffer_mappings.erase(entry);
+    return size;
+}
+
+} // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@ -0,0 +1,168 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "common/swap.h"
+#include "core/hle/service/nvdrv/devices/nvdevice.h"
+
+namespace Service::Nvidia::Devices {
+class nvmap;
+
+// Shared base for the nvdec and vic devices: declares the ioctl payload layouts, the common ioctl
+// handlers, and the bookkeeping of buffers mapped into the GPU address space.
+class nvhost_nvdec_common : public nvdevice {
+public:
+    explicit nvhost_nvdec_common(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
+    ~nvhost_nvdec_common() override;
+
+    // Derived devices dispatch their own command numbers to the handlers declared below.
+    virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
+                      std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
+                      IoctlVersion version) = 0;
+
+protected:
+    // Describes one mapped buffer: its [start_addr, end_addr) GPU range, the CPU address it was
+    // registered with, and whether it was flagged as allocated when registered.
+    class BufferMap final {
+    public:
+        constexpr BufferMap() = default;
+
+        constexpr BufferMap(GPUVAddr start_addr, std::size_t size)
+            : start_addr{start_addr}, end_addr{start_addr + size} {}
+
+        constexpr BufferMap(GPUVAddr start_addr, std::size_t size, VAddr cpu_addr,
+                            bool is_allocated)
+            : start_addr{start_addr}, end_addr{start_addr + size}, cpu_addr{cpu_addr},
+              is_allocated{is_allocated} {}
+
+        // NOTE(review): StartAddr/EndAddr return VAddr although the members are GPUVAddr -
+        // confirm whether the two aliases are interchangeable before changing the signatures.
+        constexpr VAddr StartAddr() const {
+            return start_addr;
+        }
+
+        constexpr VAddr EndAddr() const {
+            return end_addr;
+        }
+
+        constexpr std::size_t Size() const {
+            return end_addr - start_addr;
+        }
+
+        constexpr VAddr CpuAddr() const {
+            return cpu_addr;
+        }
+
+        constexpr bool IsAllocated() const {
+            return is_allocated;
+        }
+
+    private:
+        GPUVAddr start_addr{};
+        GPUVAddr end_addr{};
+        VAddr cpu_addr{};
+        bool is_allocated{};
+    };
+
+    struct IoctlSetNvmapFD {
+        u32_le nvmap_fd;
+    };
+    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
+
+    struct IoctlSubmitCommandBuffer {
+        u32_le id;
+        u32_le offset;
+        u32_le count;
+    };
+    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
+                  "IoctlSubmitCommandBuffer is incorrect size");
+
+    // Fixed-size header of a Submit request; the counts size the arrays that follow it.
+    struct IoctlSubmit {
+        u32_le cmd_buffer_count;
+        u32_le relocation_count;
+        u32_le syncpoint_count;
+        u32_le fence_count;
+    };
+    static_assert(sizeof(IoctlSubmit) == 0x10, "IoctlSubmit has incorrect size");
+
+    struct CommandBuffer {
+        s32 memory_id;
+        u32 offset;
+        s32 word_count;
+    };
+    static_assert(sizeof(CommandBuffer) == 0xC, "CommandBuffer has incorrect size");
+
+    struct Reloc {
+        s32 cmdbuffer_memory;
+        s32 cmdbuffer_offset;
+        s32 target;
+        s32 target_offset;
+    };
+    static_assert(sizeof(Reloc) == 0x10, "Reloc has incorrect size");
+
+    struct SyncptIncr {
+        u32 id;
+        u32 increments;
+    };
+    static_assert(sizeof(SyncptIncr) == 0x8, "SyncptIncr has incorrect size");
+
+    struct Fence {
+        u32 id;
+        u32 value;
+    };
+    static_assert(sizeof(Fence) == 0x8, "Fence has incorrect size");
+
+    struct IoctlGetSyncpoint {
+        // Input
+        u32_le param;
+        // Output
+        u32_le value;
+    };
+    static_assert(sizeof(IoctlGetSyncpoint) == 8, "IoctlGetSyncpoint has wrong size");
+
+    struct IoctlGetWaitbase {
+        u32_le unknown; // seems to be ignored? Nintendo added this
+        u32_le value;
+    };
+    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
+
+    // Fixed-size header of a MapBuffer/UnmapBuffer request; num_entries MapBufferEntry records
+    // follow it.
+    struct IoctlMapBuffer {
+        u32_le num_entries;
+        u32_le data_address; // Ignored by the driver.
+        u32_le attach_host_ch_das;
+    };
+    static_assert(sizeof(IoctlMapBuffer) == 0x0C, "IoctlMapBuffer is incorrect size");
+
+    struct IocGetIdParams {
+        // Input
+        u32_le param;
+        // Output
+        u32_le value;
+    };
+    static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size");
+
+    // Used for mapping and unmapping command buffers
+    struct MapBufferEntry {
+        u32_le map_handle;
+        u32_le map_address;
+    };
+    static_assert(sizeof(MapBufferEntry) == 0x8, "MapBufferEntry is incorrect size");
+
+    /// Ioctl command implementations
+    u32 SetNVMAPfd(const std::vector<u8>& input);
+    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);
+
+    /// Bookkeeping for buffers registered in buffer_mappings
+    std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
+    void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
+    std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
+
+    u32_le nvmap_fd{};
+    u32_le submit_timeout{};
+    std::shared_ptr<nvmap> nvmap_dev;
+
+    // This is expected to be ordered, therefore we must use a map, not unordered_map
+    std::map<GPUVAddr, BufferMap> buffer_mappings;
+};
+}; // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@ -2,15 +2,17 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <cstring>
-
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_vic.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_base.h"

 namespace Service::Nvidia::Devices {
+nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
+    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}

-nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {}
 nvhost_vic::~nvhost_vic() = default;

 u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@ -21,7 +23,7 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve

    switch (static_cast<IoctlCommand>(command.raw)) {
    case IoctlCommand::IocSetNVMAPfdCommand:
-        return SetNVMAPfd(input, output);
+        return SetNVMAPfd(input);
    case IoctlCommand::IocSubmit:
        return Submit(input, output);
    case IoctlCommand::IocGetSyncpoint:
@ -29,83 +31,19 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
    case IoctlCommand::IocGetWaitbase:
        return GetWaitbase(input, output);
    case IoctlCommand::IocMapBuffer:
-        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBuffer2:
+    case IoctlCommand::IocMapBuffer3:
+    case IoctlCommand::IocMapBuffer4:
    case IoctlCommand::IocMapBufferEx:
        return MapBuffer(input, output);
+    case IoctlCommand::IocUnmapBuffer:
+    case IoctlCommand::IocUnmapBuffer2:
+    case IoctlCommand::IocUnmapBuffer3:
    case IoctlCommand::IocUnmapBufferEx:
-        return UnmapBufferEx(input, output);
+        return UnmapBuffer(input, output);
    }

-    UNIMPLEMENTED_MSG("Unimplemented ioctl");
-    return 0;
-}
-
-u32 nvhost_vic::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSetNvmapFD params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
-    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
-
-    nvmap_fd = params.nvmap_fd;
-    return 0;
-}
-
-u32 nvhost_vic::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlSubmit params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-
-    // Workaround for Luigi's Mansion 3, as nvhost_vic is not implemented for asynch GPU
-    params.command_buffer = {};
-
-    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
-    return 0;
-}
-
-u32 nvhost_vic::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetSyncpoint params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
-    return 0;
-}
-
-u32 nvhost_vic::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlGetWaitbase params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
-    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
-    params.value = 0; // Seems to be hard coded at 0
-    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
-    return 0;
-}
-
-u32 nvhost_vic::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBuffer params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
-    return 0;
-}
-
-u32 nvhost_vic::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
-                params.address_1);
-    params.address_1 = 0;
-    params.address_2 = 0;
-    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
-    return 0;
-}
-
-u32 nvhost_vic::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlUnmapBufferEx params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
-    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
-    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
+    UNIMPLEMENTED_MSG("Unimplemented ioctl 0x{:X}", command.raw);
    return 0;
 }

--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@ -4,19 +4,15 @@

 #pragma once

-#include <array>
-#include <vector>
-#include "common/common_types.h"
-#include "common/swap.h"
-#include "core/hle/service/nvdrv/devices/nvdevice.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvdec_common.h"

 namespace Service::Nvidia::Devices {
+class nvmap;

-class nvhost_vic final : public nvdevice {
+class nvhost_vic final : public nvhost_nvdec_common {
 public:
-    explicit nvhost_vic(Core::System& system);
-    ~nvhost_vic() override;
-
+    explicit nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
+    ~nvhost_vic();
    u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
              std::vector<u8>& output, std::vector<u8>& output2, IoctlCtrl& ctrl,
              IoctlVersion version) override;
@ -28,74 +24,14 @@ private:
        IocGetSyncpoint = 0xC0080002,
        IocGetWaitbase = 0xC0080003,
        IocMapBuffer = 0xC01C0009,
+        IocMapBuffer2 = 0xC0340009,
+        IocMapBuffer3 = 0xC0140009,
+        IocMapBuffer4 = 0xC00C0009,
        IocMapBufferEx = 0xC03C0009,
-        IocUnmapBufferEx = 0xC03C000A,
+        IocUnmapBuffer = 0xC03C000A,
+        IocUnmapBuffer2 = 0xC034000A,
+        IocUnmapBuffer3 = 0xC00C000A,
+        IocUnmapBufferEx = 0xC01C000A,
    };
-
-    struct IoctlSetNvmapFD {
-        u32_le nvmap_fd;
-    };
-    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
-
-    struct IoctlSubmitCommandBuffer {
-        u32 id;
-        u32 offset;
-        u32 count;
-    };
-    static_assert(sizeof(IoctlSubmitCommandBuffer) == 0xC,
-                  "IoctlSubmitCommandBuffer is incorrect size");
-
-    struct IoctlSubmit {
-        u32 command_buffer_count;
-        u32 relocations_count;
-        u32 syncpt_count;
-        u32 wait_count;
-        std::array<IoctlSubmitCommandBuffer, 4> command_buffer;
-    };
-    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit is incorrect size");
-
-    struct IoctlGetSyncpoint {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetSyncpoint) == 0x8, "IoctlGetSyncpoint is incorrect size");
-
-    struct IoctlGetWaitbase {
-        u32 unknown; // seems to be ignored? Nintendo added this
-        u32 value;
-    };
-    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
-
-    struct IoctlMapBuffer {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
-
-    struct IoctlMapBufferEx {
-        u32 unknown;
-        u32 address_1;
-        u32 address_2;
-        INSERT_PADDING_BYTES(0x30); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlMapBufferEx) == 0x3C, "IoctlMapBufferEx is incorrect size");
-
-    struct IoctlUnmapBufferEx {
-        INSERT_PADDING_BYTES(0x3C); // TODO(DarkLordZach): RE this structure
-    };
-    static_assert(sizeof(IoctlUnmapBufferEx) == 0x3C, "IoctlUnmapBufferEx is incorrect size");
-
-    u32_le nvmap_fd{};
-
-    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
-    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };
-
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvmap.h
+++ b/src/core/hle/service/nvdrv/devices/nvmap.h
@ -37,6 +37,7 @@ public:
        VAddr addr;
        Status status;
        u32 refcount;
+        u32 dma_map_addr;
    };

    std::shared_ptr<Object> GetObject(u32 handle) const {
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@ -51,9 +51,9 @@ Module::Module(Core::System& system) {
    devices["/dev/nvmap"] = nvmap_dev;
    devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);
    devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface);
-    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system);
+    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system, nvmap_dev);
    devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system);
-    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system);
+    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system, nvmap_dev);
 }

 Module::~Module() = default;
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@ -63,6 +63,7 @@ void LogSettings() {
    log_setting("Renderer_GPUAccuracyLevel", values.gpu_accuracy.GetValue());
    log_setting("Renderer_UseAsynchronousGpuEmulation",
                values.use_asynchronous_gpu_emulation.GetValue());
+    log_setting("Renderer_UseNvdecEmulation", values.use_nvdec_emulation.GetValue());
    log_setting("Renderer_UseVsync", values.use_vsync.GetValue());
    log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue());
    log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue());
@ -119,6 +120,7 @@ void RestoreGlobalState() {
    values.use_disk_shader_cache.SetGlobal(true);
    values.gpu_accuracy.SetGlobal(true);
    values.use_asynchronous_gpu_emulation.SetGlobal(true);
+    values.use_nvdec_emulation.SetGlobal(true);
    values.use_vsync.SetGlobal(true);
    values.use_assembly_shaders.SetGlobal(true);
    values.use_asynchronous_shaders.SetGlobal(true);
--- a/src/core/settings.h
+++ b/src/core/settings.h
@ -111,6 +111,7 @@ struct Values {
    Setting<bool> use_disk_shader_cache;
    Setting<GPUAccuracy> gpu_accuracy;
    Setting<bool> use_asynchronous_gpu_emulation;
+    Setting<bool> use_nvdec_emulation;
    Setting<bool> use_vsync;
    Setting<bool> use_assembly_shaders;
    Setting<bool> use_asynchronous_shaders;
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@ -206,6 +206,8 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
             TranslateGPUAccuracyLevel(Settings::values.gpu_accuracy.GetValue()));
    AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
             Settings::values.use_asynchronous_gpu_emulation.GetValue());
+    AddField(field_type, "Renderer_UseNvdecEmulation",
+             Settings::values.use_nvdec_emulation.GetValue());
    AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue());
    AddField(field_type, "Renderer_UseAssemblyShaders",
             Settings::values.use_assembly_shaders.GetValue());