mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2025-05-25 04:45:00 +00:00
* shader_recompiler: Move shared mem lowering into emitter * IR can be quite verbose during first stages of translation, before ssa and constant prop passes have run that drastically simplify it. This lowering can also be done during emission so why not do it then to save some compilation time * runtime_info: Pack PsColorBuffer into 8 bytes * Drops the size of the total structure by half from 396 to 204 bytes. Also should make comparison of the array a bit faster, since it's a hot path done every draw * emit_spirv_context: Add infrastructure for buffer aliases * Splits out the buffer creation function so it can be reused when defining multiple type aliases * shader_recompiler: Merge srt_flatbuf into buffers list * It's no longer a special case, yay * shader_recompiler: Complete buffer aliasing support * Add a bunch more types into buffers, such as F32 for float reads/writes and 8/16 bit integer types for formatted buffers * shader_recompiler: Remove existing shared memory emulation * The current impl relies on backend side implementation and hooking into every shared memory access. It also doesn't handle atomics. Will be replaced by an IR pass that solves these issues * shader_recompiler: Reintroduce shared memory on ssbo emulation * Now it is performed with an IR pass, and combined with the previous commit cleanup, is fully transparent from the backend, other than requiring workgroup_index be provided as an attribute (computing this on every shared memory access is gonna be too verbose) * clang format * buffer_cache: Reduce buffer sizes * vk_rasterizer: Cleanup resource binding code * Reduce noise in the functions, also remove some arguments which are class members * Fix gcc
279 lines
8.2 KiB
C++
279 lines
8.2 KiB
C++
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
#include <array>
#include <compare>
#include <cstring>
#include <span>
#include <boost/container/static_vector.hpp>
#include "common/types.h"
#include "shader_recompiler/frontend/tessellation.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/amdgpu/types.h"
|
|
|
|
namespace Shader {
|
|
|
|
// Shader stage identifiers.
// The numeric value of each enumerator is used as an index into the short-name table of
// fmt::formatter<Shader::Stage> at the bottom of this file, so the order must stay in sync
// with that table.
enum class Stage : u32 {
    Fragment = 0, // formatted as "fs"
    Vertex = 1,   // formatted as "vs"
    Geometry = 2, // formatted as "gs"
    Export = 3,   // formatted as "es"
    Hull = 4,     // formatted as "hs"
    Local = 5,    // formatted as "ls"
    Compute = 6,  // formatted as "cs"
};
|
|
|
|
// Vertex intentionally comes after TCS/TES due to order of compilation
|
|
enum class LogicalStage : u32 {
|
|
Fragment,
|
|
TessellationControl,
|
|
TessellationEval,
|
|
Vertex,
|
|
Geometry,
|
|
Compute,
|
|
NumLogicalStages
|
|
};
|
|
|
|
constexpr u32 MaxStageTypes = static_cast<u32>(LogicalStage::NumLogicalStages);
|
|
|
|
// Reinterprets a plain index as a Stage enumerator.
// No range check is performed: the caller must pass an index that names a valid Stage.
[[nodiscard]] constexpr Stage StageFromIndex(size_t index) noexcept {
    const auto raw_value = static_cast<u32>(index);
    return static_cast<Stage>(raw_value);
}
|
|
|
|
struct LocalRuntimeInfo {
|
|
u32 ls_stride;
|
|
bool links_with_tcs;
|
|
|
|
auto operator<=>(const LocalRuntimeInfo&) const noexcept = default;
|
|
};
|
|
|
|
struct ExportRuntimeInfo {
|
|
u32 vertex_data_size;
|
|
|
|
auto operator<=>(const ExportRuntimeInfo&) const noexcept = default;
|
|
};
|
|
|
|
enum class VsOutput : u8 {
|
|
None,
|
|
PointSprite,
|
|
EdgeFlag,
|
|
KillFlag,
|
|
GsCutFlag,
|
|
GsMrtIndex,
|
|
GsVpIndex,
|
|
CullDist0,
|
|
CullDist1,
|
|
CullDist2,
|
|
CullDist3,
|
|
CullDist4,
|
|
CullDist5,
|
|
CullDist6,
|
|
CullDist7,
|
|
ClipDist0,
|
|
ClipDist1,
|
|
ClipDist2,
|
|
ClipDist3,
|
|
ClipDist4,
|
|
ClipDist5,
|
|
ClipDist6,
|
|
ClipDist7,
|
|
};
|
|
using VsOutputMap = std::array<VsOutput, 4>;
|
|
|
|
struct VertexRuntimeInfo {
|
|
u32 num_outputs;
|
|
std::array<VsOutputMap, 3> outputs;
|
|
bool emulate_depth_negative_one_to_one{};
|
|
bool clip_disable{};
|
|
// Domain
|
|
AmdGpu::TessellationType tess_type;
|
|
AmdGpu::TessellationTopology tess_topology;
|
|
AmdGpu::TessellationPartitioning tess_partitioning;
|
|
u32 hs_output_cp_stride{};
|
|
|
|
bool operator==(const VertexRuntimeInfo& other) const noexcept {
|
|
return emulate_depth_negative_one_to_one == other.emulate_depth_negative_one_to_one &&
|
|
clip_disable == other.clip_disable && tess_type == other.tess_type &&
|
|
tess_topology == other.tess_topology &&
|
|
tess_partitioning == other.tess_partitioning &&
|
|
hs_output_cp_stride == other.hs_output_cp_stride;
|
|
}
|
|
|
|
void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) {
|
|
hs_output_cp_stride = tess_constants.hs_cp_stride;
|
|
}
|
|
};
|
|
|
|
struct HullRuntimeInfo {
|
|
// from registers
|
|
u32 num_input_control_points;
|
|
u32 num_threads;
|
|
AmdGpu::TessellationType tess_type;
|
|
|
|
// from tess constants buffer
|
|
u32 ls_stride;
|
|
u32 hs_output_cp_stride;
|
|
u32 hs_output_base;
|
|
|
|
auto operator<=>(const HullRuntimeInfo&) const noexcept = default;
|
|
|
|
// It might be possible for a non-passthrough TCS to have these conditions, in some
|
|
// dumb situation.
|
|
// In that case, it should be fine to assume passthrough and declare some extra
|
|
// output control points and attributes that shouldnt be read by the TES anyways
|
|
bool IsPassthrough() const {
|
|
return hs_output_base == 0 && ls_stride == hs_output_cp_stride && num_threads == 1;
|
|
};
|
|
|
|
// regs.ls_hs_config.hs_output_control_points contains the number of threads, which
|
|
// isn't exactly the number of output control points.
|
|
// For passthrough shaders, the register field is set to 1, so use the number of
|
|
// input control points
|
|
u32 NumOutputControlPoints() const {
|
|
return IsPassthrough() ? num_input_control_points : num_threads;
|
|
}
|
|
|
|
void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) {
|
|
ls_stride = tess_constants.ls_stride;
|
|
hs_output_cp_stride = tess_constants.hs_cp_stride;
|
|
hs_output_base = tess_constants.hs_output_base;
|
|
}
|
|
};
|
|
|
|
static constexpr auto GsMaxOutputStreams = 4u;
|
|
using GsOutputPrimTypes = std::array<AmdGpu::GsOutputPrimitiveType, GsMaxOutputStreams>;
|
|
struct GeometryRuntimeInfo {
|
|
u32 num_invocations{};
|
|
u32 output_vertices{};
|
|
u32 in_vertex_data_size{};
|
|
u32 out_vertex_data_size{};
|
|
AmdGpu::PrimitiveType in_primitive;
|
|
GsOutputPrimTypes out_primitive;
|
|
std::span<const u32> vs_copy;
|
|
u64 vs_copy_hash;
|
|
|
|
bool operator==(const GeometryRuntimeInfo& other) const noexcept {
|
|
return num_invocations && other.num_invocations &&
|
|
output_vertices == other.output_vertices && in_primitive == other.in_primitive &&
|
|
std::ranges::equal(out_primitive, other.out_primitive);
|
|
}
|
|
};
|
|
|
|
// Swizzle modes for a render target, stored in one byte.
// The numeric values are spelled out explicitly and must not be renumbered.
enum class MrtSwizzle : u8 {
    Identity = 0,
    Alt = 1,
    Reverse = 2,
    ReverseAlt = 3,
};
|
|
static constexpr u32 MaxColorBuffers = 8;
|
|
|
|
struct PsColorBuffer {
|
|
AmdGpu::NumberFormat num_format : 4;
|
|
AmdGpu::NumberConversion num_conversion : 2;
|
|
AmdGpu::Liverpool::ShaderExportFormat export_format : 4;
|
|
u32 needs_unorm_fixup : 1;
|
|
u32 pad : 21;
|
|
AmdGpu::CompMapping swizzle;
|
|
|
|
auto operator<=>(const PsColorBuffer&) const noexcept = default;
|
|
};
|
|
|
|
struct FragmentRuntimeInfo {
|
|
struct PsInput {
|
|
u8 param_index;
|
|
bool is_default;
|
|
bool is_flat;
|
|
u8 default_value;
|
|
|
|
[[nodiscard]] bool IsDefault() const {
|
|
return is_default && !is_flat;
|
|
}
|
|
|
|
auto operator<=>(const PsInput&) const noexcept = default;
|
|
};
|
|
AmdGpu::Liverpool::PsInput en_flags;
|
|
AmdGpu::Liverpool::PsInput addr_flags;
|
|
u32 num_inputs;
|
|
std::array<PsInput, 32> inputs;
|
|
std::array<PsColorBuffer, MaxColorBuffers> color_buffers;
|
|
|
|
bool operator==(const FragmentRuntimeInfo& other) const noexcept {
|
|
return std::ranges::equal(color_buffers, other.color_buffers) &&
|
|
en_flags.raw == other.en_flags.raw && addr_flags.raw == other.addr_flags.raw &&
|
|
num_inputs == other.num_inputs &&
|
|
std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs, other.inputs.begin(),
|
|
other.inputs.begin() + num_inputs);
|
|
}
|
|
};
|
|
|
|
struct ComputeRuntimeInfo {
|
|
u32 shared_memory_size;
|
|
std::array<u32, 3> workgroup_size;
|
|
std::array<bool, 3> tgid_enable;
|
|
|
|
bool operator==(const ComputeRuntimeInfo& other) const noexcept {
|
|
return workgroup_size == other.workgroup_size && tgid_enable == other.tgid_enable;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Stores information relevant to shader compilation sourced from liverpool registers.
|
|
* It may potentially differ with the same shader module so must be checked.
|
|
* It's also possible to store any other custom information that needs to be part of shader key.
|
|
*/
|
|
struct RuntimeInfo {
|
|
Stage stage;
|
|
u32 num_user_data;
|
|
u32 num_input_vgprs;
|
|
u32 num_allocated_vgprs;
|
|
AmdGpu::FpDenormMode fp_denorm_mode32;
|
|
AmdGpu::FpRoundMode fp_round_mode32;
|
|
union {
|
|
LocalRuntimeInfo ls_info;
|
|
ExportRuntimeInfo es_info;
|
|
VertexRuntimeInfo vs_info;
|
|
HullRuntimeInfo hs_info;
|
|
GeometryRuntimeInfo gs_info;
|
|
FragmentRuntimeInfo fs_info;
|
|
ComputeRuntimeInfo cs_info;
|
|
};
|
|
|
|
void Initialize(Stage stage_) {
|
|
memset(this, 0, sizeof(*this));
|
|
stage = stage_;
|
|
}
|
|
|
|
bool operator==(const RuntimeInfo& other) const noexcept {
|
|
switch (stage) {
|
|
case Stage::Fragment:
|
|
return fs_info == other.fs_info;
|
|
case Stage::Vertex:
|
|
return vs_info == other.vs_info;
|
|
case Stage::Compute:
|
|
return cs_info == other.cs_info;
|
|
case Stage::Export:
|
|
return es_info == other.es_info;
|
|
case Stage::Geometry:
|
|
return gs_info == other.gs_info;
|
|
case Stage::Hull:
|
|
return hs_info == other.hs_info;
|
|
case Stage::Local:
|
|
return ls_info == other.ls_info;
|
|
default:
|
|
return true;
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace Shader
|
|
|
|
template <>
|
|
struct fmt::formatter<Shader::Stage> {
|
|
constexpr auto parse(format_parse_context& ctx) {
|
|
return ctx.begin();
|
|
}
|
|
auto format(const Shader::Stage stage, format_context& ctx) const {
|
|
constexpr static std::array names = {"fs", "vs", "gs", "es", "hs", "ls", "cs"};
|
|
return fmt::format_to(ctx.out(), "{}", names[static_cast<size_t>(stage)]);
|
|
}
|
|
};
|