Presentation: Only use FP16 in scaling shaders on supported devices in Vulkan
This commit is contained in:
parent
a39e867c73
commit
dcc5b4f6b0
15 changed files with 199 additions and 115 deletions
|
@ -18,16 +18,20 @@ set(SHADER_FILES
|
|||
opengl_copy_bc4.comp
|
||||
opengl_present.frag
|
||||
opengl_present.vert
|
||||
opengl_present_scaleforce.frag
|
||||
pitch_unswizzle.comp
|
||||
present_scaleforce.frag
|
||||
present_bicubic.frag
|
||||
present_gaussian.frag
|
||||
vulkan_blit_color_float.frag
|
||||
vulkan_blit_depth_stencil.frag
|
||||
vulkan_fidelityfx_fsr_easu.comp
|
||||
vulkan_fidelityfx_fsr_rcas.comp
|
||||
vulkan_fidelityfx_fsr_easu_fp16.comp
|
||||
vulkan_fidelityfx_fsr_easu_fp32.comp
|
||||
vulkan_fidelityfx_fsr_rcas_fp16.comp
|
||||
vulkan_fidelityfx_fsr_rcas_fp32.comp
|
||||
vulkan_present.frag
|
||||
vulkan_present.vert
|
||||
vulkan_present_scaleforce_fp16.frag
|
||||
vulkan_present_scaleforce_fp32.frag
|
||||
vulkan_quad_indexed.comp
|
||||
vulkan_uint8.comp
|
||||
)
|
||||
|
|
|
@ -28,80 +28,82 @@
|
|||
// THE SOFTWARE.
|
||||
|
||||
layout( push_constant ) uniform constants {
|
||||
u32vec2 input_size;
|
||||
uvec4 Const0;
|
||||
uvec4 Const1;
|
||||
uvec4 Const2;
|
||||
uvec4 Const3;
|
||||
};
|
||||
|
||||
uvec4 Const0;
|
||||
uvec4 Const1;
|
||||
uvec4 Const2;
|
||||
uvec4 Const3;
|
||||
layout(set=0,binding=0) uniform sampler2D InputTexture;
|
||||
layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture;
|
||||
|
||||
#define A_GPU 1
|
||||
#define A_GLSL 1
|
||||
#define A_HALF
|
||||
|
||||
#include "ffx_a.h"
|
||||
#ifndef YUZU_USE_FP16
|
||||
#include "ffx_a.h"
|
||||
|
||||
f16vec4 LinearToSRGB(f16vec4 linear) {
|
||||
bvec4 selector = greaterThan(linear, f16vec4(0.00313066844250063));
|
||||
f16vec4 low = linear * float16_t(12.92);
|
||||
f16vec4 high = float16_t(1.055) * pow(linear, f16vec4(1 / 2.4)) - float16_t(0.055);
|
||||
return mix(low, high, selector);
|
||||
}
|
||||
#if USE_EASU
|
||||
#define FSR_EASU_F 1
|
||||
AF4 FsrEasuRF(AF2 p) { AF4 res = textureGather(InputTexture, p, 0); return res; }
|
||||
AF4 FsrEasuGF(AF2 p) { AF4 res = textureGather(InputTexture, p, 1); return res; }
|
||||
AF4 FsrEasuBF(AF2 p) { AF4 res = textureGather(InputTexture, p, 2); return res; }
|
||||
#endif
|
||||
#if USE_RCAS
|
||||
#define FSR_RCAS_F 1
|
||||
AF4 FsrRcasLoadF(ASU2 p) { return texelFetch(InputTexture, ASU2(p), 0); }
|
||||
void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {}
|
||||
#endif
|
||||
#else
|
||||
#define A_HALF
|
||||
#include "ffx_a.h"
|
||||
|
||||
f16vec4 SRGBToLinear(f16vec4 srgb) {
|
||||
bvec4 selector = greaterThan(srgb, f16vec4(0.0404482362771082));
|
||||
f16vec4 low = srgb * float16_t(1.0 / 12.92);
|
||||
f16vec4 high = pow((srgb + float16_t(0.055)) * float16_t(1.0 / 1.055), f16vec4(2.4));
|
||||
return mix(low, high, selector);
|
||||
}
|
||||
|
||||
#if USE_EASU
|
||||
#define FSR_EASU_H 1
|
||||
f16vec4 FsrEasuRH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 0)); return res; }
|
||||
f16vec4 FsrEasuGH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 1)); return res; }
|
||||
f16vec4 FsrEasuBH(vec2 p) { f16vec4 res = f16vec4(textureGather(InputTexture, p, 2)); return res; }
|
||||
#endif
|
||||
#if USE_RCAS
|
||||
#define FSR_RCAS_H 1
|
||||
f16vec4 FsrRcasLoadH(ASW2 p) { return f16vec4(texelFetch(InputTexture, ASU2(p), 0)); }
|
||||
void FsrRcasInputH(inout float16_t r, inout float16_t g, inout float16_t b) {}
|
||||
#if USE_EASU
|
||||
#define FSR_EASU_H 1
|
||||
AH4 FsrEasuRH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 0)); return res; }
|
||||
AH4 FsrEasuGH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 1)); return res; }
|
||||
AH4 FsrEasuBH(AF2 p) { AH4 res = AH4(textureGather(InputTexture, p, 2)); return res; }
|
||||
#endif
|
||||
#if USE_RCAS
|
||||
#define FSR_RCAS_H 1
|
||||
AH4 FsrRcasLoadH(ASW2 p) { return AH4(texelFetch(InputTexture, ASU2(p), 0)); }
|
||||
void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b){}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "ffx_fsr1.h"
|
||||
|
||||
void CurrFilter(u32vec2 pos) {
|
||||
// For debugging
|
||||
void CurrFilter(AU2 pos) {
|
||||
#if USE_BILINEAR
|
||||
vec2 pp = (vec2(pos) * vec2_AU2(Const0.xy) + vec2_AU2(Const0.zw)) * vec2_AU2(Const1.xy) + vec2(0.5, -0.5) * vec2_AU2(Const1.zw);
|
||||
imageStore(OutputTexture, ivec2(pos), textureLod(InputTexture, pp, 0.0));
|
||||
AF2 pp = (AF2(pos) * AF2_AU2(Const0.xy) + AF2_AU2(Const0.zw)) * AF2_AU2(Const1.xy) + AF2(0.5, -0.5) * AF2_AU2(Const1.zw);
|
||||
imageStore(OutputTexture, ASU2(pos), textureLod(InputTexture, pp, 0.0));
|
||||
#endif
|
||||
#if USE_EASU
|
||||
f16vec3 c;
|
||||
FsrEasuH(c, pos, Const0, Const1, Const2, Const3);
|
||||
imageStore(OutputTexture, ivec2(pos), f16vec4(c, 1));
|
||||
#ifndef YUZU_USE_FP16
|
||||
AF3 c;
|
||||
FsrEasuF(c, pos, Const0, Const1, Const2, Const3);
|
||||
imageStore(OutputTexture, ASU2(pos), AF4(c, 1));
|
||||
#else
|
||||
AH3 c;
|
||||
FsrEasuH(c, pos, Const0, Const1, Const2, Const3);
|
||||
imageStore(OutputTexture, ASU2(pos), AH4(c, 1));
|
||||
#endif
|
||||
#endif
|
||||
#if USE_RCAS
|
||||
f16vec3 c;
|
||||
FsrRcasH(c.r, c.g, c.b, pos, Const0);
|
||||
imageStore(OutputTexture, ivec2(pos), f16vec4(c, 1));
|
||||
#ifndef YUZU_USE_FP16
|
||||
AF3 c;
|
||||
FsrRcasF(c.r, c.g, c.b, pos, Const0);
|
||||
imageStore(OutputTexture, ASU2(pos), AF4(c, 1));
|
||||
#else
|
||||
AH3 c;
|
||||
FsrRcasH(c.r, c.g, c.b, pos, Const0);
|
||||
imageStore(OutputTexture, ASU2(pos), AH4(c, 1));
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
layout(local_size_x=64) in;
|
||||
void main() {
|
||||
|
||||
#if USE_EASU || USE_BILINEAR
|
||||
vec2 ires = vec2(input_size);
|
||||
vec2 tres = textureSize(InputTexture, 0);
|
||||
vec2 ores = imageSize(OutputTexture);
|
||||
FsrEasuCon(Const0, Const1, Const2, Const3, ires.x, ires.y, tres.x, tres.y, ores.x, ores.y);
|
||||
#endif
|
||||
#if USE_RCAS
|
||||
FsrRcasCon(Const0, 0.25f);
|
||||
#endif
|
||||
|
||||
// Do remapping of local xy in workgroup for a more PS-like swizzle pattern.
|
||||
AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
|
||||
CurrFilter(gxy);
|
||||
|
|
|
@ -22,11 +22,29 @@
|
|||
|
||||
// Adapted from https://github.com/BreadFish64/ScaleFish/tree/master/scaleforce
|
||||
|
||||
#version 460
|
||||
//! #version 460
|
||||
|
||||
#extension GL_ARB_separate_shader_objects : enable
|
||||
|
||||
#ifdef YUZU_USE_FP16
|
||||
|
||||
#extension GL_AMD_gpu_shader_half_float : enable
|
||||
#extension GL_NV_gpu_shader5 : enable
|
||||
|
||||
#define lfloat float16_t
|
||||
#define lvec2 f16vec2
|
||||
#define lvec3 f16vec3
|
||||
#define lvec4 f16vec4
|
||||
|
||||
#else
|
||||
|
||||
#define lfloat float
|
||||
#define lvec2 vec2
|
||||
#define lvec3 vec3
|
||||
#define lvec4 vec4
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef VULKAN
|
||||
|
||||
#define BINDING_COLOR_TEXTURE 1
|
||||
|
@ -45,25 +63,25 @@ layout (binding = BINDING_COLOR_TEXTURE) uniform sampler2D input_texture;
|
|||
|
||||
const bool ignore_alpha = true;
|
||||
|
||||
float16_t ColorDist1(f16vec4 a, f16vec4 b) {
|
||||
lfloat ColorDist1(lvec4 a, lvec4 b) {
|
||||
// https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.2020_conversion
|
||||
const f16vec3 K = f16vec3(0.2627, 0.6780, 0.0593);
|
||||
const float16_t scaleB = float16_t(0.5) / (float16_t(1.0) - K.b);
|
||||
const float16_t scaleR = float16_t(0.5) / (float16_t(1.0) - K.r);
|
||||
f16vec4 diff = a - b;
|
||||
float16_t Y = dot(diff.rgb, K);
|
||||
float16_t Cb = scaleB * (diff.b - Y);
|
||||
float16_t Cr = scaleR * (diff.r - Y);
|
||||
f16vec3 YCbCr = f16vec3(Y, Cb, Cr);
|
||||
float16_t d = length(YCbCr);
|
||||
const lvec3 K = lvec3(0.2627, 0.6780, 0.0593);
|
||||
const lfloat scaleB = lfloat(0.5) / (lfloat(1.0) - K.b);
|
||||
const lfloat scaleR = lfloat(0.5) / (lfloat(1.0) - K.r);
|
||||
lvec4 diff = a - b;
|
||||
lfloat Y = dot(diff.rgb, K);
|
||||
lfloat Cb = scaleB * (diff.b - Y);
|
||||
lfloat Cr = scaleR * (diff.r - Y);
|
||||
lvec3 YCbCr = lvec3(Y, Cb, Cr);
|
||||
lfloat d = length(YCbCr);
|
||||
if (ignore_alpha) {
|
||||
return d;
|
||||
}
|
||||
return sqrt(a.a * b.a * d * d + diff.a * diff.a);
|
||||
}
|
||||
|
||||
f16vec4 ColorDist(f16vec4 ref, f16vec4 A, f16vec4 B, f16vec4 C, f16vec4 D) {
|
||||
return f16vec4(
|
||||
lvec4 ColorDist(lvec4 ref, lvec4 A, lvec4 B, lvec4 C, lvec4 D) {
|
||||
return lvec4(
|
||||
ColorDist1(ref, A),
|
||||
ColorDist1(ref, B),
|
||||
ColorDist1(ref, C),
|
||||
|
@ -72,36 +90,36 @@ f16vec4 ColorDist(f16vec4 ref, f16vec4 A, f16vec4 B, f16vec4 C, f16vec4 D) {
|
|||
}
|
||||
|
||||
vec4 Scaleforce(sampler2D tex, vec2 tex_coord) {
|
||||
f16vec4 bl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, -1)));
|
||||
f16vec4 bc = f16vec4(textureOffset(tex, tex_coord, ivec2(0, -1)));
|
||||
f16vec4 br = f16vec4(textureOffset(tex, tex_coord, ivec2(1, -1)));
|
||||
f16vec4 cl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, 0)));
|
||||
f16vec4 cc = f16vec4(texture(tex, tex_coord));
|
||||
f16vec4 cr = f16vec4(textureOffset(tex, tex_coord, ivec2(1, 0)));
|
||||
f16vec4 tl = f16vec4(textureOffset(tex, tex_coord, ivec2(-1, 1)));
|
||||
f16vec4 tc = f16vec4(textureOffset(tex, tex_coord, ivec2(0, 1)));
|
||||
f16vec4 tr = f16vec4(textureOffset(tex, tex_coord, ivec2(1, 1)));
|
||||
lvec4 bl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, -1)));
|
||||
lvec4 bc = lvec4(textureOffset(tex, tex_coord, ivec2(0, -1)));
|
||||
lvec4 br = lvec4(textureOffset(tex, tex_coord, ivec2(1, -1)));
|
||||
lvec4 cl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, 0)));
|
||||
lvec4 cc = lvec4(texture(tex, tex_coord));
|
||||
lvec4 cr = lvec4(textureOffset(tex, tex_coord, ivec2(1, 0)));
|
||||
lvec4 tl = lvec4(textureOffset(tex, tex_coord, ivec2(-1, 1)));
|
||||
lvec4 tc = lvec4(textureOffset(tex, tex_coord, ivec2(0, 1)));
|
||||
lvec4 tr = lvec4(textureOffset(tex, tex_coord, ivec2(1, 1)));
|
||||
|
||||
f16vec4 offset_tl = ColorDist(cc, tl, tc, tr, cr);
|
||||
f16vec4 offset_br = ColorDist(cc, br, bc, bl, cl);
|
||||
lvec4 offset_tl = ColorDist(cc, tl, tc, tr, cr);
|
||||
lvec4 offset_br = ColorDist(cc, br, bc, bl, cl);
|
||||
|
||||
// Calculate how different cc is from the texels around it
|
||||
const float16_t plus_weight = float16_t(1.5);
|
||||
const float16_t cross_weight = float16_t(1.5);
|
||||
float16_t total_dist = dot(offset_tl + offset_br, f16vec4(cross_weight, plus_weight, cross_weight, plus_weight));
|
||||
const lfloat plus_weight = lfloat(1.5);
|
||||
const lfloat cross_weight = lfloat(1.5);
|
||||
lfloat total_dist = dot(offset_tl + offset_br, lvec4(cross_weight, plus_weight, cross_weight, plus_weight));
|
||||
|
||||
if (total_dist == float16_t(0.0)) {
|
||||
if (total_dist == lfloat(0.0)) {
|
||||
return cc;
|
||||
} else {
|
||||
// Add together all the distances with direction taken into account
|
||||
f16vec4 tmp = offset_tl - offset_br;
|
||||
f16vec2 total_offset = tmp.wy * plus_weight + (tmp.zz + f16vec2(-tmp.x, tmp.x)) * cross_weight;
|
||||
lvec4 tmp = offset_tl - offset_br;
|
||||
lvec2 total_offset = tmp.wy * plus_weight + (tmp.zz + lvec2(-tmp.x, tmp.x)) * cross_weight;
|
||||
|
||||
// When the image has thin points, they tend to split apart.
|
||||
// This is because the texels all around are different and total_offset reaches into clear areas.
|
||||
// This works pretty well to keep the offset in bounds for these cases.
|
||||
float16_t clamp_val = length(total_offset) / total_dist;
|
||||
f16vec2 final_offset = clamp(total_offset, -clamp_val, clamp_val) / f16vec2(textureSize(tex, 0));
|
||||
lfloat clamp_val = length(total_offset) / total_dist;
|
||||
vec2 final_offset = vec2(clamp(total_offset, -clamp_val, clamp_val)) / textureSize(tex, 0);
|
||||
|
||||
return texture(tex, tex_coord - final_offset);
|
||||
}
|
||||
|
@ -109,4 +127,4 @@ vec4 Scaleforce(sampler2D tex, vec2 tex_coord) {
|
|||
|
||||
void main() {
|
||||
frag_color = Scaleforce(input_texture, tex_coord);
|
||||
}
|
||||
}
|
|
@ -5,9 +5,7 @@
|
|||
#version 460 core
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
layout(set=0,binding=0) uniform sampler2D InputTexture;
|
||||
layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture;
|
||||
|
||||
#define YUZU_USE_FP16
|
||||
#define USE_EASU 1
|
||||
|
||||
#include "fidelityfx_fsr.comp"
|
|
@ -0,0 +1,10 @@
|
|||
// Copyright 2021 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#version 460 core
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
#define USE_EASU 1
|
||||
|
||||
#include "fidelityfx_fsr.comp"
|
|
@ -5,9 +5,7 @@
|
|||
#version 460 core
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
layout(set=0,binding=0) uniform sampler2D InputTexture;
|
||||
layout(set=0,binding=1,rgba16f) uniform image2D OutputTexture;
|
||||
|
||||
#define YUZU_USE_FP16
|
||||
#define USE_RCAS 1
|
||||
|
||||
#include "fidelityfx_fsr.comp"
|
|
@ -0,0 +1,10 @@
|
|||
// Copyright 2021 yuzu Emulator Project
|
||||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#version 460 core
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
#define USE_RCAS 1
|
||||
|
||||
#include "fidelityfx_fsr.comp"
|
|
@ -0,0 +1,7 @@
|
|||
#version 460
|
||||
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
#define YUZU_USE_FP16
|
||||
|
||||
#include "opengl_present_scaleforce.frag"
|
|
@ -0,0 +1,5 @@
|
|||
#version 460
|
||||
|
||||
#extension GL_GOOGLE_include_directive : enable
|
||||
|
||||
#include "opengl_present_scaleforce.frag"
|
Loading…
Add table
Add a link
Reference in a new issue