Merge pull request #2592 from FernandoS27/sync1

Implement GPU Synchronization Mechanisms & Correct NVFlinger
bunnei committed 2019-07-26 14:26:44 -04:00 (via GitHub)
commit 52f54c728d
44 changed files with 730 additions and 227 deletions
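What the change amounts to: guest drivers (nvdrv) expect Tegra-style syncpoints, monotonically increasing counters that the GPU increments as work completes and that the CPU can wait on at a threshold value. The sketch below is a minimal model of the scheme this diff implements, with every name invented (SyncpointModel, trigger_cpu_interrupt, and so on); the real implementation is GPU::IncrementSyncPoint and friends in the hunks that follow.

// Minimal model of the syncpoint scheme (illustrative only, not the PR's code).
// A syncpoint is an atomic counter plus a list of registered threshold values;
// whoever increments the counter fires a callback for each threshold reached.
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <list>
#include <mutex>

struct SyncpointModel {
    // Stand-in for the Service::Nvidia::MaxSyncPoints constant the real code uses.
    static constexpr std::size_t NumSyncpoints = 192;

    std::array<std::atomic<std::uint32_t>, NumSyncpoints> counters{};
    std::array<std::list<std::uint32_t>, NumSyncpoints> waiting_thresholds;
    std::mutex mutex;
    // Assumed to be wired up by the owner (the CPU-interrupt path).
    std::function<void(std::uint32_t, std::uint32_t)> trigger_cpu_interrupt;

    void Increment(std::uint32_t id) {
        const std::uint32_t value = ++counters[id];
        std::lock_guard lock{mutex};
        auto& thresholds = waiting_thresholds[id];
        for (auto it = thresholds.begin(); it != thresholds.end();) {
            if (value >= *it) {
                trigger_cpu_interrupt(id, *it); // wake the CPU-side waiter
                it = thresholds.erase(it);
            } else {
                ++it;
            }
        }
    }
};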

src/video_core/engines/maxwell_3d.cpp

@@ -525,8 +525,9 @@ void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
     const u32 increment = regs.sync_info.increment.Value();
     const u32 cache_flush = regs.sync_info.unknown.Value();
-    LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment,
-              cache_flush);
+    if (increment) {
+        system.GPU().IncrementSyncPoint(sync_point);
+    }
 }
 
 void Maxwell3D::DrawArrays() {

src/video_core/gpu.cpp

@@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
     UNREACHABLE();
 }
 
-GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
+    : system{system}, renderer{renderer}, is_async{is_async} {
     auto& rasterizer{renderer.Rasterizer()};
     memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
@@ -74,6 +75,51 @@ const DmaPusher& GPU::DmaPusher() const {
     return *dma_pusher;
 }
 
+void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
+    syncpoints[syncpoint_id]++;
+    std::lock_guard lock{sync_mutex};
+    if (!syncpt_interrupts[syncpoint_id].empty()) {
+        u32 value = syncpoints[syncpoint_id].load();
+        auto it = syncpt_interrupts[syncpoint_id].begin();
+        while (it != syncpt_interrupts[syncpoint_id].end()) {
+            if (value >= *it) {
+                TriggerCpuInterrupt(syncpoint_id, *it);
+                it = syncpt_interrupts[syncpoint_id].erase(it);
+                continue;
+            }
+            it++;
+        }
+    }
+}
+
+u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
+    return syncpoints[syncpoint_id].load();
+}
+
+void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    bool contains = std::any_of(interrupt.begin(), interrupt.end(),
+                                [value](u32 in_value) { return in_value == value; });
+    if (contains) {
+        return;
+    }
+    syncpt_interrupts[syncpoint_id].emplace_back(value);
+}
+
+bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
+    std::lock_guard lock{sync_mutex};
+    auto& interrupt = syncpt_interrupts[syncpoint_id];
+    const auto iter =
+        std::find_if(interrupt.begin(), interrupt.end(),
+                     [value](u32 interrupt_value) { return value == interrupt_value; });
+    if (iter == interrupt.end()) {
+        return false;
+    }
+    interrupt.erase(iter);
+    return true;
+}
+
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     ASSERT(format != RenderTargetFormat::NONE);
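One detail worth noticing in this hunk: IncrementSyncPoint and CancelSyncptInterrupt take sync_mutex themselves, but RegisterSyncptInterrupt does not. The apparent contract is that callers hold the lock via GPU::LockSync() (declared in gpu.h below), so that checking the current value and registering a threshold happen atomically with respect to the GPU thread. A hedged sketch of such a caller follows; SignalOrRegister is an invented name, and the real callers live in the nvdrv services, which this excerpt does not show.

// Hypothetical caller of the new syncpoint API (illustrative only).
#include "common/common_types.h"
#include "video_core/gpu.h"

bool SignalOrRegister(Tegra::GPU& gpu, u32 syncpoint_id, u32 target_value) {
    const auto lock = gpu.LockSync(); // hold sync_mutex across check + register
    if (gpu.GetSyncpointValue(syncpoint_id) >= target_value) {
        return true; // threshold already reached, nothing to arm
    }
    // Without the lock, the GPU thread could sweep past target_value between
    // the check above and this call, and the interrupt would never fire.
    gpu.RegisterSyncptInterrupt(syncpoint_id, target_value);
    return false;
}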

src/video_core/gpu.h

@@ -5,8 +5,12 @@
 #pragma once
 
 #include <array>
+#include <atomic>
+#include <list>
 #include <memory>
+#include <mutex>
 
 #include "common/common_types.h"
+#include "core/hle/service/nvdrv/nvdata.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/dma_pusher.h"
@@ -127,7 +131,7 @@ class MemoryManager;
 class GPU {
 public:
-    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
+    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async);
 
     virtual ~GPU();
@@ -170,6 +174,22 @@ public:
     /// Returns a reference to the GPU DMA pusher.
     Tegra::DmaPusher& DmaPusher();
 
+    void IncrementSyncPoint(u32 syncpoint_id);
+
+    u32 GetSyncpointValue(u32 syncpoint_id) const;
+
+    void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value);
+
+    std::unique_lock<std::mutex> LockSync() {
+        return std::unique_lock{sync_mutex};
+    }
+
+    bool IsAsync() const {
+        return is_async;
+    }
+
     /// Returns a const reference to the GPU DMA pusher.
     const Tegra::DmaPusher& DmaPusher() const;
@@ -239,6 +259,9 @@ public:
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
     virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
+protected:
+    virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;
+
 private:
     void ProcessBindMethod(const MethodCall& method_call);
     void ProcessSemaphoreTriggerMethod();
@@ -257,6 +280,7 @@ private:
 protected:
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
     VideoCore::RendererBase& renderer;
+    Core::System& system;
 
 private:
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
@@ -273,6 +297,14 @@ private:
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
     /// Inline memory engine
     std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+
+    std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+    std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+    std::mutex sync_mutex;
+
+    const bool is_async;
 };
 
 #define ASSERT_REG_POSITION(field_name, position) \

src/video_core/gpu_asynch.cpp

@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "core/core.h"
+#include "core/hardware_interrupt_manager.h"
 #include "video_core/gpu_asynch.h"
 #include "video_core/gpu_thread.h"
 #include "video_core/renderer_base.h"
@@ -9,7 +11,7 @@
 namespace VideoCommon {
 
 GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer), gpu_thread{system} {}
+    : GPU(system, renderer, true), gpu_thread{system} {}
 
 GPUAsynch::~GPUAsynch() = default;
@@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     gpu_thread.FlushAndInvalidateRegion(addr, size);
 }
 
+void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+    auto& interrupt_manager = system.InterruptManager();
+    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
+}
+
 } // namespace VideoCommon
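TriggerCpuInterrupt executes on the GPU thread, which must not touch guest-visible state directly, so the asynchronous backend forwards the (syncpoint, value) pair to the hardware interrupt manager that this PR also adds (core/hardware_interrupt_manager.h above, not shown in this excerpt). The code below is only a plausible shape for that hand-off, with every name invented; consult the real hardware_interrupt_manager for the actual mechanism.

// Invented sketch of a GPU-thread -> CPU-thread interrupt hand-off. The pair
// is packed into one 64-bit message and drained later from the CPU side.
#include <cstdint>
#include <mutex>
#include <queue>

class InterruptManagerModel {
public:
    // Called from the GPU thread (compare GPUAsynch::TriggerCpuInterrupt).
    void GPUInterruptSyncpt(std::uint32_t syncpoint_id, std::uint32_t value) {
        std::lock_guard lock{mutex};
        pending.push((static_cast<std::uint64_t>(syncpoint_id) << 32) | value);
    }

    // Called from the CPU side; `signal` would poke the nvdrv event for each
    // pending (syncpoint, value) pair.
    template <typename F>
    void Drain(F&& signal) {
        std::lock_guard lock{mutex};
        while (!pending.empty()) {
            const std::uint64_t message = pending.front();
            pending.pop();
            signal(static_cast<std::uint32_t>(message >> 32),
                   static_cast<std::uint32_t>(message));
        }
    }

private:
    std::mutex mutex;
    std::queue<std::uint64_t> pending;
};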

src/video_core/gpu_asynch.h

@@ -27,6 +27,9 @@ public:
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
 
+protected:
+    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
+
 private:
     GPUThread::ThreadManager gpu_thread;
 };

src/video_core/gpu_synch.cpp

@@ -8,7 +8,7 @@
 namespace VideoCommon {
 
 GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
-    : GPU(system, renderer) {}
+    : GPU(system, renderer, false) {}
 
 GPUSynch::~GPUSynch() = default;

src/video_core/gpu_synch.h

@@ -25,6 +25,10 @@ public:
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+
+protected:
+    void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
+                             [[maybe_unused]] u32 value) const override {}
 };
 
 } // namespace VideoCommon

src/video_core/gpu_thread.cpp

@@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
     MicroProfileOnThreadCreate("GpuThread");
 
     // Wait for first GPU command before acquiring the window context
-    state.WaitForCommands();
+    while (state.queue.Empty())
+        ;
 
     // If emulation was stopped during disk shader loading, abort before trying to acquire context
     if (!state.is_running) {
@@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
 
     CommandDataContainer next;
     while (state.is_running) {
-        state.WaitForCommands();
         while (!state.queue.Empty()) {
             state.queue.Pop(next);
             if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) {
@@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
             } else {
                 UNREACHABLE();
             }
-            state.signaled_fence = next.fence;
-            state.TrySynchronize();
+            state.signaled_fence.store(next.fence);
         }
     }
 }
@@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
 }
 
 void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) {
-    if (state.queue.Empty()) {
-        // It's quicker to invalidate a single region on the CPU if the queue is already empty
-        system.Renderer().Rasterizer().InvalidateRegion(addr, size);
-    } else {
-        PushCommand(InvalidateRegionCommand(addr, size));
-    }
+    system.Renderer().Rasterizer().InvalidateRegion(addr, size);
 }
 
 void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
     const u64 fence{++state.last_fence};
     state.queue.Push(CommandDataContainer(std::move(command_data), fence));
-    state.SignalCommands();
     return fence;
 }
 
 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 void SynchState::WaitForSynchronization(u64 fence) {
-    if (signaled_fence >= fence) {
-        return;
-    }
-
-    // Wait for the GPU to be idle (all commands to be executed)
-    {
-        MICROPROFILE_SCOPE(GPU_wait);
-        std::unique_lock lock{synchronization_mutex};
-        synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
-    }
+    while (signaled_fence.load() < fence)
+        ;
 }
 
 } // namespace VideoCommon::GPUThread
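Both changes in this file trade a condition-variable wait for a spin on an atomic: RunThread now busy-polls the command queue, and WaitForSynchronization busy-polls signaled_fence. That buys the lowest possible wake-up latency at the cost of a fully occupied core while waiting. For contrast, a minimal version of both styles (illustrative only, not yuzu code):

// Fence waiting two ways. The PR picks the spin; the removed code was the
// condition-variable style, which idles the CPU but adds wake-up latency.
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>

std::atomic<std::uint64_t> signaled{0};
std::mutex m;
std::condition_variable cv;

void WaitSpin(std::uint64_t fence) {
    while (signaled.load() < fence) {
        // burn the core; reacts almost immediately after the store
    }
}

void WaitBlocking(std::uint64_t fence) {
    std::unique_lock lock{m};
    cv.wait(lock, [fence] { return signaled.load() >= fence; });
}

void Signal(std::uint64_t fence) {
    {
        std::lock_guard lock{m}; // lock needed only for the blocking variant
        signaled.store(fence);
    }
    cv.notify_all();
}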

src/video_core/gpu_thread.h

@@ -88,41 +88,9 @@ struct CommandDataContainer {
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
     std::atomic_bool is_running{true};
-    std::atomic_int queued_frame_count{};
-    std::mutex synchronization_mutex;
-    std::mutex commands_mutex;
-    std::condition_variable commands_condition;
-    std::condition_variable synchronization_condition;
-
-    /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU
-    /// synchronized. This is entirely empirical.
-    bool IsSynchronized() const {
-        constexpr std::size_t max_queue_gap{5};
-        return queue.Size() <= max_queue_gap;
-    }
-
-    void TrySynchronize() {
-        if (IsSynchronized()) {
-            std::lock_guard lock{synchronization_mutex};
-            synchronization_condition.notify_one();
-        }
-    }
-
     void WaitForSynchronization(u64 fence);
 
-    void SignalCommands() {
-        if (queue.Empty()) {
-            return;
-        }
-
-        commands_condition.notify_one();
-    }
-
-    void WaitForCommands() {
-        std::unique_lock lock{commands_mutex};
-        commands_condition.wait(lock, [this] { return !queue.Empty(); });
-    }
-
     using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
     CommandQueue queue;
     u64 last_fence{};
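Taken together, the slimmed-down SynchState reduces the CPU/GPU hand-off to one atomic fence over a single-producer/single-consumer queue. A compressed model of the whole handshake (illustrative; std::queue plus a mutex stands in for the lock-free Common::SPSCQueue):

// End-to-end shape of the new ThreadManager/SynchState interaction (sketch).
#include <atomic>
#include <cstdint>
#include <mutex>
#include <queue>

struct Command {
    std::uint64_t fence;
};

struct SynchStateModel {
    std::atomic_bool is_running{true};
    std::atomic<std::uint64_t> signaled_fence{0};
    std::uint64_t last_fence{0}; // only touched by the CPU (producer) thread
    std::mutex queue_mutex;      // stand-in; the real queue is lock-free SPSC
    std::queue<Command> queue;

    // CPU thread: enqueue work and remember its fence (cf. PushCommand).
    std::uint64_t PushCommand() {
        const std::uint64_t fence = ++last_fence;
        std::lock_guard lock{queue_mutex};
        queue.push(Command{fence});
        return fence;
    }

    // GPU thread: drain the queue, publishing each completed fence.
    void RunOnce() {
        std::lock_guard lock{queue_mutex};
        while (!queue.empty()) {
            const Command next = queue.front();
            queue.pop();
            // ... execute the command here ...
            signaled_fence.store(next.fence);
        }
    }

    // CPU thread: spin until the GPU thread has caught up (cf. the new
    // WaitForSynchronization above).
    void WaitForSynchronization(std::uint64_t fence) {
        while (signaled_fence.load() < fence) {
        }
    }
};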