Merge pull request #2092 from ReinUsesLisp/stg

shader/memory: Implement STG and global memory flushing
This commit is contained in:
bunnei 2019-04-16 22:15:17 -04:00 committed by GitHub
commit 1b83f255c2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 186 additions and 89 deletions

View file

@ -14,28 +14,28 @@
namespace OpenGL {
CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr)
: RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} {
CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size)
: RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size},
max_size{max_size} {
buffer.Create();
// Bind and unbind the buffer so it gets allocated by the driver
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
}
void CachedGlobalRegion::Reload(u32 size_) {
constexpr auto max_size = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize);
CachedGlobalRegion::~CachedGlobalRegion() = default;
void CachedGlobalRegion::Reload(u32 size_) {
size = size_;
if (size > max_size) {
size = max_size;
LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the expected size {}!", size_,
LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_,
max_size);
}
glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW);
}
// TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW);
void CachedGlobalRegion::Flush() {
LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr);
glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr);
}
GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
@ -46,14 +46,16 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr,
return search->second;
}
GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size,
u8* host_ptr) {
GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr,
u32 size) {
GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
if (!region) {
// No reserved surface available, create a new one and reserve it
auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
const auto cpu_addr = *memory_manager.GpuToCpuAddress(addr);
region = std::make_shared<CachedGlobalRegion>(cpu_addr, size, host_ptr);
const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)};
ASSERT(cpu_addr);
region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size);
ReserveGlobalRegion(region);
}
region->Reload(size);
@ -65,7 +67,11 @@ void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
}
GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
: RasterizerCache{rasterizer} {}
: RasterizerCache{rasterizer} {
GLint max_ssbo_size_;
glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_);
max_ssbo_size = static_cast<u32>(max_ssbo_size_);
}
GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
const GLShader::GlobalMemoryEntry& global_region,
@ -73,7 +79,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
auto& gpu{Core::System::GetInstance().GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
global_region.GetCbufOffset()};
const auto actual_addr{memory_manager.Read<u64>(addr)};
@ -85,7 +91,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
if (!region) {
// No global region found - create a new one
region = GetUncachedGlobalRegion(actual_addr, size, host_ptr);
region = GetUncachedGlobalRegion(actual_addr, host_ptr, size);
Register(region);
}

View file

@ -19,7 +19,7 @@ namespace OpenGL {
namespace GLShader {
class GlobalMemoryEntry;
} // namespace GLShader
}
class RasterizerOpenGL;
class CachedGlobalRegion;
@ -27,7 +27,8 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
class CachedGlobalRegion final : public RasterizerCacheObject {
public:
explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr);
explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size);
~CachedGlobalRegion();
VAddr GetCpuAddr() const override {
return cpu_addr;
@ -45,14 +46,14 @@ public:
/// Reloads the global region from guest memory
void Reload(u32 size_);
// TODO(Rodrigo): When global memory is written (STG), implement flushing
void Flush() override {
UNIMPLEMENTED();
}
void Flush() override;
private:
VAddr cpu_addr{};
u8* host_ptr{};
u32 size{};
u32 max_size{};
OGLBuffer buffer;
};
@ -66,10 +67,11 @@ public:
private:
GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr);
GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size);
void ReserveGlobalRegion(GlobalRegion region);
std::unordered_map<CacheAddr, GlobalRegion> reserve;
u32 max_ssbo_size{};
};
} // namespace OpenGL

View file

@ -756,6 +756,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
return;
}
res_cache.FlushRegion(addr, size);
global_cache.FlushRegion(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@ -953,6 +954,9 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade
for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
const auto& entry{entries[bindpoint]};
const auto& region{global_cache.GetGlobalRegion(entry, stage)};
if (entry.IsWritten()) {
region->MarkAsModified(true, global_cache);
}
bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0,
static_cast<GLsizeiptr>(region->GetSizeInBytes()));
}

View file

@ -71,10 +71,6 @@ public:
static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
"The maximum size of a constbuffer must be a multiple of the size of GLvec4");
static constexpr std::size_t MaxGlobalMemorySize = 0x10000;
static_assert(MaxGlobalMemorySize % sizeof(float) == 0,
"The maximum size of a global memory must be a multiple of the size of float");
private:
class SamplerInfo {
public:

View file

@ -45,8 +45,6 @@ using TextureIR = std::variant<TextureAoffi, TextureArgument>;
enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
constexpr u32 MAX_GLOBALMEMORY_ELEMENTS =
static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float);
class ShaderWriter {
public:
@ -208,8 +206,10 @@ public:
for (const auto& sampler : ir.GetSamplers()) {
entries.samplers.emplace_back(sampler);
}
for (const auto& gmem : ir.GetGlobalMemoryBases()) {
entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset);
for (const auto& gmem_pair : ir.GetGlobalMemory()) {
const auto& [base, usage] = gmem_pair;
entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset,
usage.is_read, usage.is_written);
}
entries.clip_distances = ir.GetClipDistances();
entries.shader_length = ir.GetLength();
@ -380,12 +380,22 @@ private:
}
void DeclareGlobalMemory() {
for (const auto& entry : ir.GetGlobalMemoryBases()) {
for (const auto& gmem : ir.GetGlobalMemory()) {
const auto& [base, usage] = gmem;
// Since we don't know how the shader will use the shader, hint the driver to disable as
// much optimizations as possible
std::string qualifier = "coherent volatile";
if (usage.is_read && !usage.is_written)
qualifier += " readonly";
else if (usage.is_written && !usage.is_read)
qualifier += " writeonly";
const std::string binding =
fmt::format("GMEM_BINDING_{}_{}", entry.cbuf_index, entry.cbuf_offset);
code.AddLine("layout (std430, binding = " + binding + ") buffer " +
GetGlobalMemoryBlock(entry) + " {");
code.AddLine(" float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];");
fmt::format("GMEM_BINDING_{}_{}", base.cbuf_index, base.cbuf_offset);
code.AddLine("layout (std430, binding = " + binding + ") " + qualifier + " buffer " +
GetGlobalMemoryBlock(base) + " {");
code.AddLine(" float " + GetGlobalMemory(base) + "[];");
code.AddLine("};");
code.AddNewLine();
}
@ -868,6 +878,12 @@ private:
} else if (const auto lmem = std::get_if<LmemNode>(dest)) {
target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]";
} else if (const auto gmem = std::get_if<GmemNode>(dest)) {
const std::string real = Visit(gmem->GetRealAddress());
const std::string base = Visit(gmem->GetBaseAddress());
const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4";
target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
} else {
UNREACHABLE_MSG("Assign called without a proper target");
}
@ -1621,9 +1637,7 @@ private:
std::string GetCommonDeclarations() {
const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS);
return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" +
"#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" +
"#define ftoi floatBitsToInt\n"
"#define ftou floatBitsToUint\n"
"#define itof intBitsToFloat\n"

View file

@ -39,8 +39,9 @@ private:
class GlobalMemoryEntry {
public:
explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset)
: cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {}
explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written)
: cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{
is_written} {}
u32 GetCbufIndex() const {
return cbuf_index;
@ -50,9 +51,19 @@ public:
return cbuf_offset;
}
bool IsRead() const {
return is_read;
}
bool IsWritten() const {
return is_written;
}
private:
u32 cbuf_index{};
u32 cbuf_offset{};
bool is_read{};
bool is_written{};
};
struct ShaderEntries {

View file

@ -337,11 +337,16 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
for (u32 i = 0; i < global_memory_count; ++i) {
u32 cbuf_index{};
u32 cbuf_offset{};
u8 is_read{};
u8 is_written{};
if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) ||
file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32)) {
file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32) ||
file.ReadBytes(&is_read, sizeof(u8)) != sizeof(u8) ||
file.ReadBytes(&is_written, sizeof(u8)) != sizeof(u8)) {
return {};
}
entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset);
entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0,
is_written != 0);
}
for (auto& clip_distance : entry.entries.clip_distances) {
@ -397,7 +402,9 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 uniqu
return false;
for (const auto& gmem : entries.global_memory_entries) {
if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 ||
file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1) {
file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1 ||
file.WriteObject(static_cast<u8>(gmem.IsRead() ? 1 : 0)) != 1 ||
file.WriteObject(static_cast<u8>(gmem.IsWritten() ? 1 : 0)) != 1) {
return false;
}
}