mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2025-06-05 02:03:16 +00:00
Surface management rework (2/3) (#329)
* texture_cache: interface refactoring
* a bit of fixes and improvements
* texture_cache: macro tile extents for bpp 128
* texture_cache: detiler: prefer host memory for large buffers upload
This commit is contained in:
parent
0d6edaa0a0
commit
30198d5ffc
22 changed files with 478 additions and 322 deletions
|
@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
|||
layout(std430, binding = 0) buffer input_buf {
|
||||
uint in_data[];
|
||||
};
|
||||
layout(r32ui, binding = 1) uniform writeonly uimage2D output_img;
|
||||
layout(std430, binding = 1) buffer output_buf {
|
||||
uint out_data[];
|
||||
};
|
||||
|
||||
layout(push_constant) uniform image_info {
|
||||
uint num_levels;
|
||||
uint pitch;
|
||||
uint sizes[14];
|
||||
} info;
|
||||
|
||||
// Inverse morton LUT, small enough to fit into K$
|
||||
|
@ -31,20 +35,22 @@ uint rmort[16] = {
|
|||
#define TEXELS_PER_ELEMENT (1)
|
||||
|
||||
void main() {
|
||||
uint tile_base = gl_GlobalInvocationID.x - gl_LocalInvocationID.x; // WG*16
|
||||
uint p0 = in_data[gl_GlobalInvocationID.x];
|
||||
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
|
||||
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
|
||||
uint col = bitfieldExtract(packed_pos, 4, 4);
|
||||
uint row = bitfieldExtract(packed_pos, 0, 4);
|
||||
|
||||
uint p0 = in_data[gl_GlobalInvocationID.x];
|
||||
uint mip = 0;
|
||||
for (int m = 0; m < info.num_levels; ++m) {
|
||||
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
|
||||
}
|
||||
|
||||
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
|
||||
uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1);
|
||||
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
|
||||
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
|
||||
|
||||
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
|
||||
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
|
||||
|
||||
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
|
||||
imageStore(output_img, img_pos, uvec4(p0, 0, 0, 0));
|
||||
}
|
||||
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
|
||||
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM;
|
||||
out_data[dw_ofs_x + dw_ofs_y] = p0;
|
||||
}
|
||||
|
|
|
@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
|||
layout(std430, binding = 0) buffer input_buf {
|
||||
uint in_data[];
|
||||
};
|
||||
layout(rg32ui, binding = 1) uniform writeonly uimage2D output_img;
|
||||
layout(std430, binding = 1) buffer output_buf {
|
||||
uint out_data[];
|
||||
};
|
||||
|
||||
layout(push_constant) uniform image_info {
|
||||
uint num_levels;
|
||||
uint pitch;
|
||||
uint sizes[14];
|
||||
} info;
|
||||
|
||||
// Inverse morton LUT, small enough to fit into K$
|
||||
|
@ -30,19 +34,25 @@ uint rmort[16] = {
|
|||
#define MICRO_TILE_DIM (8)
|
||||
|
||||
void main() {
|
||||
uint block_ofs = 2 * gl_GlobalInvocationID.x;
|
||||
uint p0 = in_data[block_ofs + 0];
|
||||
uint p1 = in_data[block_ofs + 1];
|
||||
|
||||
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
|
||||
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
|
||||
uint col = bitfieldExtract(packed_pos, 4, 4);
|
||||
uint row = bitfieldExtract(packed_pos, 0, 4);
|
||||
|
||||
uint block_ofs = 2 * gl_GlobalInvocationID.x;
|
||||
uint p0 = in_data[block_ofs + 0];
|
||||
uint p1 = in_data[block_ofs + 1];
|
||||
uint mip = 0;
|
||||
for (int m = 0; m < info.num_levels; ++m) {
|
||||
mip += (gl_GlobalInvocationID.x * 8) >= info.sizes[m] ? 1 : 0;
|
||||
}
|
||||
|
||||
uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
|
||||
ivec2 img_pos = MICRO_TILE_DIM * ivec2(
|
||||
gl_WorkGroupID.x % tiles_per_pitch,
|
||||
gl_WorkGroupID.x / tiles_per_pitch
|
||||
);
|
||||
imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, 0, 0));
|
||||
}
|
||||
uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1) * 2;
|
||||
uint target_tile_x = 2 * gl_WorkGroupID.x % tiles_per_pitch;
|
||||
uint target_tile_y = 2 * gl_WorkGroupID.x / tiles_per_pitch;
|
||||
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col * 2;
|
||||
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM;
|
||||
out_data[dw_ofs_x + dw_ofs_y] = p0;
|
||||
out_data[dw_ofs_x + dw_ofs_y + 1] = p1;
|
||||
}
|
||||
|
|
|
@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
|
|||
layout(std430, binding = 0) buffer input_buf {
|
||||
uint in_data[];
|
||||
};
|
||||
layout(rgba32ui, binding = 1) uniform writeonly uimage2D output_img;
|
||||
layout(std430, binding = 1) buffer output_buf {
|
||||
uint out_data[];
|
||||
};
|
||||
|
||||
layout(push_constant) uniform image_info {
|
||||
uint num_levels;
|
||||
uint pitch;
|
||||
uint sizes[14];
|
||||
} info;
|
||||
|
||||
// Inverse morton LUT, small enough to fit into K$
|
||||
|
@ -30,21 +34,29 @@ uint rmort[16] = {
|
|||
#define MICRO_TILE_DIM (8)
|
||||
|
||||
void main() {
|
||||
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
|
||||
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
|
||||
uint col = bitfieldExtract(packed_pos, 4, 4);
|
||||
uint row = bitfieldExtract(packed_pos, 0, 4);
|
||||
|
||||
uint block_ofs = 4 * gl_GlobalInvocationID.x;
|
||||
uint p0 = in_data[block_ofs + 0];
|
||||
uint p1 = in_data[block_ofs + 1];
|
||||
uint p2 = in_data[block_ofs + 2];
|
||||
uint p3 = in_data[block_ofs + 3];
|
||||
|
||||
uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
|
||||
ivec2 img_pos = MICRO_TILE_DIM * ivec2(
|
||||
gl_WorkGroupID.x % tiles_per_pitch,
|
||||
gl_WorkGroupID.x / tiles_per_pitch
|
||||
);
|
||||
imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, p2, p3));
|
||||
}
|
||||
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
|
||||
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
|
||||
uint col = bitfieldExtract(packed_pos, 4, 4);
|
||||
uint row = bitfieldExtract(packed_pos, 0, 4);
|
||||
|
||||
uint mip = 0;
|
||||
for (int m = 0; m < info.num_levels; ++m) {
|
||||
mip += (gl_GlobalInvocationID.x * 16) >= info.sizes[m] ? 1 : 0;
|
||||
}
|
||||
|
||||
uint tiles_per_pitch = max(((info.pitch >> mip) / MICRO_TILE_DIM), 1u) * 4;
|
||||
uint target_tile_x = 4 * gl_WorkGroupID.x % tiles_per_pitch;
|
||||
uint target_tile_y = 4 * gl_WorkGroupID.x / tiles_per_pitch;
|
||||
uint dw_ofs_x = (target_tile_x * MICRO_TILE_DIM) + 4 * col;
|
||||
uint dw_ofs_y = ((target_tile_y * tiles_per_pitch) * 64u) + ((row * tiles_per_pitch) * MICRO_TILE_DIM);
|
||||
out_data[dw_ofs_x + dw_ofs_y] = p0;
|
||||
out_data[dw_ofs_x + dw_ofs_y + 1] = p1;
|
||||
out_data[dw_ofs_x + dw_ofs_y + 2] = p2;
|
||||
out_data[dw_ofs_x + dw_ofs_y + 3] = p3;
|
||||
}
|
||||
|
|
|
@ -11,10 +11,14 @@ layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
|
|||
layout(std430, binding = 0) buffer input_buf {
|
||||
uint in_data[];
|
||||
};
|
||||
layout(r8ui, binding = 1) uniform writeonly uimage2D output_img;
|
||||
layout(std430, binding = 1) buffer output_buf {
|
||||
uint out_data[];
|
||||
};
|
||||
|
||||
layout(push_constant) uniform image_info {
|
||||
uint num_levels;
|
||||
uint pitch;
|
||||
uint sizes[14];
|
||||
} info;
|
||||
|
||||
#define MICRO_TILE_DIM 8
|
||||
|
@ -32,17 +36,15 @@ void main() {
|
|||
uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT)
|
||||
+ TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3);
|
||||
|
||||
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
|
||||
uint mip = 0;
|
||||
for (int m = 0; m < info.num_levels; ++m) {
|
||||
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
|
||||
}
|
||||
|
||||
uint tiles_per_pitch = max((info.pitch >> mip) / 8, 1);
|
||||
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
|
||||
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
|
||||
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
|
||||
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
|
||||
|
||||
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
|
||||
|
||||
#pragma unroll
|
||||
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
|
||||
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff));
|
||||
dst_tx >>= 8;
|
||||
}
|
||||
uint dw_ofs_x = target_tile_x * 2 + col; // 2 = uints
|
||||
uint dw_ofs_y = (target_tile_y * MICRO_TILE_DIM + row) * tiles_per_pitch * 2; // 2 = uints
|
||||
out_data[dw_ofs_x + dw_ofs_y] = dst_tx;
|
||||
}
|
|
@ -10,10 +10,14 @@ layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
|
|||
layout(std430, binding = 0) buffer input_buf {
|
||||
uint in_data[];
|
||||
};
|
||||
layout(rg8ui, binding = 1) uniform writeonly uimage2D output_img;
|
||||
layout(std430, binding = 1) buffer output_buf {
|
||||
uint out_data[];
|
||||
};
|
||||
|
||||
layout(push_constant) uniform image_info {
|
||||
uint num_levels;
|
||||
uint pitch;
|
||||
uint sizes[14];
|
||||
} info;
|
||||
|
||||
#define MICRO_TILE_DIM 8
|
||||
|
@ -44,18 +48,14 @@ void main() {
|
|||
uint col = bitfieldExtract(packed_pos, 4, 4);
|
||||
uint row = bitfieldExtract(packed_pos, 0, 4);
|
||||
|
||||
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
|
||||
uint mip = 0u;
|
||||
for (int m = 0; m < info.num_levels; ++m) {
|
||||
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
|
||||
}
|
||||
uint tiles_per_pitch = max(((info.pitch >> mip) / 8u), 1u);
|
||||
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
|
||||
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
|
||||
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
|
||||
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
|
||||
|
||||
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
|
||||
|
||||
#pragma unroll
|
||||
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
|
||||
uint p0 = (p[ofs] >> 8) & 0xff;
|
||||
uint p1 = p[ofs] & 0xff;
|
||||
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(p1, p0, 0, 0));
|
||||
}
|
||||
uint dw_ofs_x = target_tile_x * 8 + col;
|
||||
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * 8;
|
||||
out_data[(dw_ofs_x + dw_ofs_y) / 2] = src_tx;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue