Merge pull request #2592 from FernandoS27/sync1
Implement GPU Synchronization Mechanisms & Correct NVFlinger
This commit is contained in:
commit
52f54c728d
44 changed files with 730 additions and 227 deletions
|
@ -525,8 +525,9 @@ void Maxwell3D::ProcessSyncPoint() {
|
|||
const u32 sync_point = regs.sync_info.sync_point.Value();
|
||||
const u32 increment = regs.sync_info.increment.Value();
|
||||
const u32 cache_flush = regs.sync_info.unknown.Value();
|
||||
LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
|
||||
cache_flush);
|
||||
if (increment) {
|
||||
system.GPU().IncrementSyncPoint(sync_point);
|
||||
}
|
||||
}
|
||||
|
||||
void Maxwell3D::DrawArrays() {
|
||||
|
|
|
@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
|
|||
UNREACHABLE();
|
||||
}
|
||||
|
||||
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
|
||||
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
|
||||
: system{system}, renderer{renderer}, is_async{is_async} {
|
||||
auto& rasterizer{renderer.Rasterizer()};
|
||||
memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
|
||||
dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
|
||||
|
@ -74,6 +75,51 @@ const DmaPusher& GPU::DmaPusher() const {
|
|||
return *dma_pusher;
|
||||
}
|
||||
|
||||
void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
|
||||
syncpoints[syncpoint_id]++;
|
||||
std::lock_guard lock{sync_mutex};
|
||||
if (!syncpt_interrupts[syncpoint_id].empty()) {
|
||||
u32 value = syncpoints[syncpoint_id].load();
|
||||
auto it = syncpt_interrupts[syncpoint_id].begin();
|
||||
while (it != syncpt_interrupts[syncpoint_id].end()) {
|
||||
if (value >= *it) {
|
||||
TriggerCpuInterrupt(syncpoint_id, *it);
|
||||
it = syncpt_interrupts[syncpoint_id].erase(it);
|
||||
continue;
|
||||
}
|
||||
it++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
|
||||
return syncpoints[syncpoint_id].load();
|
||||
}
|
||||
|
||||
void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
|
||||
auto& interrupt = syncpt_interrupts[syncpoint_id];
|
||||
bool contains = std::any_of(interrupt.begin(), interrupt.end(),
|
||||
[value](u32 in_value) { return in_value == value; });
|
||||
if (contains) {
|
||||
return;
|
||||
}
|
||||
syncpt_interrupts[syncpoint_id].emplace_back(value);
|
||||
}
|
||||
|
||||
bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
|
||||
std::lock_guard lock{sync_mutex};
|
||||
auto& interrupt = syncpt_interrupts[syncpoint_id];
|
||||
const auto iter =
|
||||
std::find_if(interrupt.begin(), interrupt.end(),
|
||||
[value](u32 interrupt_value) { return value == interrupt_value; });
|
||||
|
||||
if (iter == interrupt.end()) {
|
||||
return false;
|
||||
}
|
||||
interrupt.erase(iter);
|
||||
return true;
|
||||
}
|
||||
|
||||
u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
|
||||
ASSERT(format != RenderTargetFormat::NONE);
|
||||
|
||||
|
|
|
@ -5,8 +5,12 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include "common/common_types.h"
|
||||
#include "core/hle/service/nvdrv/nvdata.h"
|
||||
#include "core/hle/service/nvflinger/buffer_queue.h"
|
||||
#include "video_core/dma_pusher.h"
|
||||
|
||||
|
@ -127,7 +131,7 @@ class MemoryManager;
|
|||
|
||||
class GPU {
|
||||
public:
|
||||
explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
|
||||
explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async);
|
||||
|
||||
virtual ~GPU();
|
||||
|
||||
|
@ -170,6 +174,22 @@ public:
|
|||
/// Returns a reference to the GPU DMA pusher.
|
||||
Tegra::DmaPusher& DmaPusher();
|
||||
|
||||
void IncrementSyncPoint(u32 syncpoint_id);
|
||||
|
||||
u32 GetSyncpointValue(u32 syncpoint_id) const;
|
||||
|
||||
void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
|
||||
|
||||
bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
|
||||
|
||||
std::unique_lock<std::mutex> LockSync() {
|
||||
return std::unique_lock{sync_mutex};
|
||||
}
|
||||
|
||||
bool IsAsync() const {
|
||||
return is_async;
|
||||
}
|
||||
|
||||
/// Returns a const reference to the GPU DMA pusher.
|
||||
const Tegra::DmaPusher& DmaPusher() const;
|
||||
|
||||
|
@ -239,6 +259,9 @@ public:
|
|||
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
|
||||
virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
|
||||
|
||||
protected:
|
||||
virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
|
||||
|
||||
private:
|
||||
void ProcessBindMethod(const MethodCall& method_call);
|
||||
void ProcessSemaphoreTriggerMethod();
|
||||
|
@ -257,6 +280,7 @@ private:
|
|||
protected:
|
||||
std::unique_ptr<Tegra::DmaPusher> dma_pusher;
|
||||
VideoCore::RendererBase& renderer;
|
||||
Core::System& system;
|
||||
|
||||
private:
|
||||
std::unique_ptr<Tegra::MemoryManager> memory_manager;
|
||||
|
@ -273,6 +297,14 @@ private:
|
|||
std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
|
||||
/// Inline memory engine
|
||||
std::unique_ptr<Engines::KeplerMemory> kepler_memory;
|
||||
|
||||
std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
|
||||
|
||||
std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
|
||||
|
||||
std::mutex sync_mutex;
|
||||
|
||||
const bool is_async;
|
||||
};
|
||||
|
||||
#define ASSERT_REG_POSITION(field_name, position) \
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include "core/core.h"
|
||||
#include "core/hardware_interrupt_manager.h"
|
||||
#include "video_core/gpu_asynch.h"
|
||||
#include "video_core/gpu_thread.h"
|
||||
#include "video_core/renderer_base.h"
|
||||
|
@ -9,7 +11,7 @@
|
|||
namespace VideoCommon {
|
||||
|
||||
GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
|
||||
: GPU(system, renderer), gpu_thread{system} {}
|
||||
: GPU(system, renderer, true), gpu_thread{system} {}
|
||||
|
||||
GPUAsynch::~GPUAsynch() = default;
|
||||
|
||||
|
@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
|
|||
gpu_thread.FlushAndInvalidateRegion(addr, size);
|
||||
}
|
||||
|
||||
void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
|
||||
auto& interrupt_manager = system.InterruptManager();
|
||||
interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
|
||||
}
|
||||
|
||||
} // namespace VideoCommon
|
||||
|
|
|
@ -27,6 +27,9 @@ public:
|
|||
void InvalidateRegion(CacheAddr addr, u64 size) override;
|
||||
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
|
||||
|
||||
protected:
|
||||
void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
|
||||
|
||||
private:
|
||||
GPUThread::ThreadManager gpu_thread;
|
||||
};
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
namespace VideoCommon {
|
||||
|
||||
GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
|
||||
: GPU(system, renderer) {}
|
||||
: GPU(system, renderer, false) {}
|
||||
|
||||
GPUSynch::~GPUSynch() = default;
|
||||
|
||||
|
|
|
@ -25,6 +25,10 @@ public:
|
|||
void FlushRegion(CacheAddr addr, u64 size) override;
|
||||
void InvalidateRegion(CacheAddr addr, u64 size) override;
|
||||
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
|
||||
|
||||
protected:
|
||||
void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
|
||||
[[maybe_unused]] u32 value) const override {}
|
||||
};
|
||||
|
||||
} // namespace VideoCommon
|
||||
|
|
|
@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
|
|||
MicroProfileOnThreadCreate("GpuThread");
|
||||
|
||||
// Wait for first GPU command before acquiring the window context
|
||||
state.WaitForCommands();
|
||||
while (state.queue.Empty())
|
||||
;
|
||||
|
||||
// If emulation was stopped during disk shader loading, abort before trying to acquire context
|
||||
if (!state.is_running) {
|
||||
|
@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
|
|||
|
||||
CommandDataContainer next;
|
||||
while (state.is_running) {
|
||||
state.WaitForCommands();
|
||||
while (!state.queue.Empty()) {
|
||||
state.queue.Pop(next);
|
||||
if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
|
||||
|
@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
|
|||
} else {
|
||||
UNREACHABLE();
|
||||
}
|
||||
state.signaled_fence = next.fence;
|
||||
state.TrySynchronize();
|
||||
state.signaled_fence.store(next.fence);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
|
|||
}
|
||||
|
||||
void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
|
||||
if (state.queue.Empty()) {
|
||||
// It's quicker to invalidate a single region on the CPU if the queue is already empty
|
||||
system.Renderer().Rasterizer().InvalidateRegion(addr, size);
|
||||
} else {
|
||||
PushCommand(InvalidateRegionCommand(addr, size));
|
||||
}
|
||||
system.Renderer().Rasterizer().InvalidateRegion(addr, size);
|
||||
}
|
||||
|
||||
void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
|
||||
|
@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
|
|||
u64 ThreadManager::PushCommand(CommandData&& command_data) {
|
||||
const u64 fence{++state.last_fence};
|
||||
state.queue.Push(CommandDataContainer(std::move(command_data), fence));
|
||||
state.SignalCommands();
|
||||
return fence;
|
||||
}
|
||||
|
||||
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
|
||||
void SynchState::WaitForSynchronization(u64 fence) {
|
||||
if (signaled_fence >= fence) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Wait for the GPU to be idle (all commands to be executed)
|
||||
{
|
||||
MICROPROFILE_SCOPE(GPU_wait);
|
||||
std::unique_lock lock{synchronization_mutex};
|
||||
synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
|
||||
}
|
||||
while (signaled_fence.load() < fence)
|
||||
;
|
||||
}
|
||||
|
||||
} // namespace VideoCommon::GPUThread
|
||||
|
|
|
@ -88,41 +88,9 @@ struct CommandDataContainer {
|
|||
/// Struct used to synchronize the GPU thread
|
||||
struct SynchState final {
|
||||
std::atomic_bool is_running{true};
|
||||
std::atomic_int queued_frame_count{};
|
||||
std::mutex synchronization_mutex;
|
||||
std::mutex commands_mutex;
|
||||
std::condition_variable commands_condition;
|
||||
std::condition_variable synchronization_condition;
|
||||
|
||||
/// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
|
||||
/// synchronized. This is entirely empirical.
|
||||
bool IsSynchronized() const {
|
||||
constexpr std::size_t max_queue_gap{5};
|
||||
return queue.Size() <= max_queue_gap;
|
||||
}
|
||||
|
||||
void TrySynchronize() {
|
||||
if (IsSynchronized()) {
|
||||
std::lock_guard lock{synchronization_mutex};
|
||||
synchronization_condition.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
void WaitForSynchronization(u64 fence);
|
||||
|
||||
void SignalCommands() {
|
||||
if (queue.Empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
commands_condition.notify_one();
|
||||
}
|
||||
|
||||
void WaitForCommands() {
|
||||
std::unique_lock lock{commands_mutex};
|
||||
commands_condition.wait(lock, [this] { return !queue.Empty(); });
|
||||
}
|
||||
|
||||
using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
|
||||
CommandQueue queue;
|
||||
u64 last_fence{};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue