texture_cache: detiler: m8x1 and m8x4 shaders

This commit is contained in:
psucien 2024-06-05 16:14:22 +02:00
parent 184b7b7fc2
commit 440a60a43b
3 changed files with 118 additions and 14 deletions

View file

@ -0,0 +1,48 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450

// Detiles 8-bit single-channel texels (presumably the "m8x1" variant named in
// the commit title). Every workgroup of 16 invocations converts one 8x8 micro
// tile: it reads 16 dwords from `in_data` and issues 64 texel stores to
// `output_img`.
//
// NOTE: Current subgroup utilization is suboptimal on most GPUs, so
// it will be nice to process two tiles at once here.
layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in;

// Source buffer; main() consumes one dword (four 8-bit texels) per invocation.
layout(std430, binding = 0) buffer input_buf {
    uint in_data[];
};

// Destination image, written one 8-bit texel at a time.
layout(r8ui, binding = 1) uniform writeonly uimage2D output_img;

layout(push_constant) uniform image_info {
    // Divided by MICRO_TILE_DIM to get the number of tiles per row.
    // NOTE(review): assumed to be a non-zero multiple of MICRO_TILE_DIM,
    // otherwise tiles_per_pitch below is zero -- confirm against the caller.
    uint pitch;
} info;

#define MICRO_TILE_DIM 8
#define TEXELS_PER_ELEMENT 4

void main() {
    uint lane = gl_LocalInvocationID.x;

    // Own dword plus the neighbouring dword (index differs only in bit 0).
    // The partner is fetched straight from the buffer instead of through a
    // subgroup shuffle, so the result does not depend on how local invocations
    // are mapped onto subgroup lanes. The workgroup size is even, therefore
    // the partner index always stays inside the same tile.
    uint p0 = in_data[gl_GlobalInvocationID.x];
    uint p1 = in_data[gl_GlobalInvocationID.x ^ 1u];

    // Even lanes combine the low halves of the pair, odd lanes the high
    // halves; in both cases the even lane's half ends up in the low 16 bits.
    uint hword = lane & 1u;
    uint dst_tx = (hword == 1u)
        ? (p0 & 0xffff0000u) | (p1 >> 16)
        : (p0 & 0x0000ffffu) | (p1 << 16);

    // Position inside the 8x8 tile: `col` selects the left or right group of
    // four texels, `row` is the scanline (0..7).
    uint col = (lane >> 2) & 1u;
    uint row = (lane % TEXELS_PER_ELEMENT)
             + TEXELS_PER_ELEMENT * (lane >> 3);

    // One workgroup per tile, tiles laid out row-major across the pitch.
    uint tiles_per_pitch = info.pitch / MICRO_TILE_DIM;
    uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
    uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;

    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
    uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
    ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);

    // Emit the four packed texels, lowest byte first, left to right.
    #pragma unroll
    for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
        imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xffu));
        dst_tx >>= 8;
    }
}

View file

@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450

// One invocation per texel of an 8x8 micro tile (64 invocations per workgroup).
// No subgroup built-ins are used by this shader, so no subgroup extension is
// required.
layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

// Source buffer; main() reads one dword per invocation.
layout(std430, binding = 0) buffer input_buf {
    uint in_data[];
};

// Destination image; each dword is written as one four-channel 8-bit texel.
layout(rgba8ui, binding = 1) uniform writeonly uimage2D output_img;

layout(push_constant) uniform image_info {
    // Divided by MICRO_TILE_DIM to get the number of tiles per row.
    uint pitch;
} info;

// Inverse morton LUT, small enough to fit into K$
// Each dword packs four positions, one per byte (lowest byte first); within a
// byte the high nibble is the column and the low nibble is the row.
// Declared const: the table is never written.
const uint lut_8x4[16] = uint[16](
    0x11011000, 0x31213020,
    0x13031202, 0x33233222,
    0x51415040, 0x71617060,
    0x53435242, 0x73637262,
    0x15051404, 0x35253424,
    0x17071606, 0x37273626,
    0x55455444, 0x75657464,
    0x57475646, 0x77677666
);

#define MICRO_TILE_DIM 8
#define TEXELS_PER_ELEMENT 1
// Moves one texel from the tiled input buffer to its linear position in the
// output image. The invocation's index within the workgroup selects a byte of
// the LUT, which holds the texel's column (high nibble) and row (low nibble)
// inside the micro tile; the workgroup index selects the tile itself.
void main() {
    uint lane = gl_LocalInvocationID.x;
    uint texel = in_data[gl_GlobalInvocationID.x];

    // Four packed positions per LUT dword: pick the dword, then shift the
    // wanted byte down to bit 0.
    uint lut_word = lut_8x4[lane / 4u];
    uint packed_pos = lut_word >> (8u * (lane & 3u));
    uint tile_col = (packed_pos >> 4) & 0xfu;
    uint tile_row = packed_pos & 0xfu;

    // Tiles are laid out row-major, one workgroup per tile.
    uint tiles_in_row = info.pitch / MICRO_TILE_DIM;
    uint tile_x = gl_WorkGroupID.x % tiles_in_row;
    uint tile_y = gl_WorkGroupID.x / tiles_in_row;

    ivec2 dst_pos = ivec2(
        tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * tile_col,
        tile_y * MICRO_TILE_DIM + tile_row
    );

    // Split the dword into its four 8-bit channels, lowest byte first.
    uvec4 channels = uvec4(
        texel & 0xffu,
        (texel >> 8) & 0xffu,
        (texel >> 16) & 0xffu,
        texel >> 24
    );
    imageStore(output_img, dst_pos, channels);
}