Surface management rework (2/3) (#329)

* texture_cache: interface refactoring

* a few fixes and improvements

* texture_cache: macro tile extents for bpp 128

* texture_cache: detiler: prefer host memory for large buffer uploads
This commit is contained in:
psucien 2024-07-28 17:20:42 +02:00 committed by GitHub
parent 0d6edaa0a0
commit 30198d5ffc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 478 additions and 322 deletions

View file

@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Shader resource interface for this detiling compute shader.
// Binding 0: std430 storage buffer exposing the source data as an unsized array of uints.
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
// NOTE(review): both a write-only r32ui image and a std430 buffer are declared at
// binding 1 here. This listing looks like a diff with its +/- markers stripped, so
// presumably only one of the two exists in the real shader — confirm.
layout(r32ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
// Push constants, accessed through `info`.
// num_levels bounds the mip-selection loop in main(); pitch is shifted right by the
// selected mip there; sizes[] holds up to 14 per-level thresholds compared against
// the invocation's byte offset (presumably cumulative level sizes — TODO confirm).
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
// Inverse morton LUT, small enough to fit into K$
@ -31,20 +35,22 @@ uint rmort[16] = {
#define TEXELS_PER_ELEMENT (1)
// Entry point: each invocation reads one uint from in_data and writes it to a
// position derived from the inverse-Morton LUT (rmort) and the work group index.
// NOTE(review): p0 and tiles_per_pitch are each declared twice below and an extra
// '}' appears mid-body, so this listing presumably interleaves the before/after
// versions of a diff (imageStore path vs. out_data path) — confirm against the
// real shader before relying on statement order.
void main() {
// NOTE(review): global - local equals gl_WorkGroupID.x * local_size_x; the hunk
// header shows local_size_x = 64, so "WG*16" looks stale. tile_base is not read
// anywhere in the visible lines.
uint tile_base = gl_GlobalInvocationID.x - gl_LocalInvocationID.x; // WG*16
uint p0 = in_data[gl_GlobalInvocationID.x];
// Four consecutive invocations share one 32-bit LUT entry; bit_ofs picks the byte.
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
// Bits [4,8) of the selected byte are the column, bits [0,4) the row.
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint p0 = in_data[gl_GlobalInvocationID.x];
// Count how many sizes[] thresholds the invocation's byte offset (index * 4 bytes
// per uint) has reached; the count is used as the mip level.
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
// Pitch is halved per mip level and clamped so at least one tile spans a row.
uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1);
// One work group per micro tile: split the linear group index into tile x / y.
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
// Image-store variant: 2D texel coordinate inside output_img.
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
imageStore(output_img, img_pos, uvec4(p0, 0, 0, 0));
}
// Buffer variant: linear uint index into out_data. A row is
// tiles_per_pitch * MICRO_TILE_DIM uints wide; the 64 is presumably
// MICRO_TILE_DIM * MICRO_TILE_DIM (one full tile row) — TODO confirm.
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM;
out_data[dw_ofs_x + dw_ofs_y] = p0;
}

View file

@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Shader resource interface for this detiling compute shader.
// Binding 0: std430 storage buffer exposing the source data as an unsized array of uints.
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
// NOTE(review): both a write-only rg32ui image and a std430 buffer are declared at
// binding 1 here. This listing looks like a diff with its +/- markers stripped, so
// presumably only one of the two exists in the real shader — confirm.
layout(rg32ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
// Push constants, accessed through `info`.
// num_levels bounds the mip-selection loop in main(); pitch is shifted right by the
// selected mip there; sizes[] holds up to 14 per-level thresholds compared against
// the invocation's byte offset (presumably cumulative level sizes — TODO confirm).
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
// Inverse morton LUT, small enough to fit into K$
@ -30,19 +34,25 @@ uint rmort[16] = {
#define MICRO_TILE_DIM (8)
// Entry point: each invocation reads two consecutive uints from in_data and writes
// them to a position derived from the inverse-Morton LUT (rmort) and the work
// group index.
// NOTE(review): block_ofs, p0, p1 and tiles_per_pitch are each declared twice below
// and an extra '}' appears mid-body, so this listing presumably interleaves the
// before/after versions of a diff (imageStore path vs. out_data path) — confirm
// against the real shader before relying on statement order.
void main() {
// Each invocation owns two uints (8 bytes) of input.
uint block_ofs = 2 * gl_GlobalInvocationID.x;
uint p0 = in_data[block_ofs + 0];
uint p1 = in_data[block_ofs + 1];
// Four consecutive invocations share one 32-bit LUT entry; bit_ofs picks the byte.
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
// Bits [4,8) of the selected byte are the column, bits [0,4) the row.
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint block_ofs = 2 * gl_GlobalInvocationID.x;
uint p0 = in_data[block_ofs + 0];
uint p1 = in_data[block_ofs + 1];
// Count how many sizes[] thresholds the invocation's byte offset (index * 8 bytes,
// i.e. two uints) has reached; the count is used as the mip level.
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 8) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
// Image-store variant: tile origin in texels, then the per-invocation (col, row).
ivec2 img_pos = MICRO_TILE_DIM * ivec2(
gl_WorkGroupID.x % tiles_per_pitch,
gl_WorkGroupID.x / tiles_per_pitch
);
imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, 0, 0));
}
// Buffer variant: pitch is halved per mip level, clamped to at least one tile, and
// doubled because every element occupies two uints in out_data.
uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1) * 2;
// The work group index is scaled by the same factor of two before being split
// into tile x / y ('*' and '%' / '/' evaluate left to right).
uint target_tile_x = 2 * gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = 2 * gl_WorkGroupID.x / tiles_per_pitch;
// Linear uint index into out_data; the 64 is presumably
// MICRO_TILE_DIM * MICRO_TILE_DIM (one full tile row) — TODO confirm.
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col * 2;
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM;
// The two uints of the element are stored back to back.
out_data[dw_ofs_x + dw_ofs_y] = p0;
out_data[dw_ofs_x + dw_ofs_y + 1] = p1;
}

View file

@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
// Shader resource interface for this detiling compute shader.
// Binding 0: std430 storage buffer exposing the source data as an unsized array of uints.
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
// NOTE(review): both a write-only rgba32ui image and a std430 buffer are declared at
// binding 1 here. This listing looks like a diff with its +/- markers stripped, so
// presumably only one of the two exists in the real shader — confirm.
layout(rgba32ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
// Push constants, accessed through `info`.
// num_levels bounds the mip-selection loop in main(); pitch is shifted right by the
// selected mip there; sizes[] holds up to 14 per-level thresholds compared against
// the invocation's byte offset (presumably cumulative level sizes — TODO confirm).
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
// Inverse morton LUT, small enough to fit into K$
@ -30,21 +34,29 @@ uint rmort[16] = {
#define MICRO_TILE_DIM (8)
// Entry point: each invocation reads four consecutive uints from in_data and
// writes them to a position derived from the inverse-Morton LUT (rmort) and the
// work group index.
// NOTE(review): bit_ofs, packed_pos, col, row and tiles_per_pitch are each declared
// twice below and an extra '}' appears mid-body, so this listing presumably
// interleaves the before/after versions of a diff (imageStore path vs. out_data
// path) — confirm against the real shader before relying on statement order.
void main() {
// Four consecutive invocations share one 32-bit LUT entry; bit_ofs picks the byte.
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
// Bits [4,8) of the selected byte are the column, bits [0,4) the row.
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
// Each invocation owns four uints (16 bytes) of input.
uint block_ofs = 4 * gl_GlobalInvocationID.x;
uint p0 = in_data[block_ofs + 0];
uint p1 = in_data[block_ofs + 1];
uint p2 = in_data[block_ofs + 2];
uint p3 = in_data[block_ofs + 3];
uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
// Image-store variant: tile origin in texels, then the per-invocation (col, row).
ivec2 img_pos = MICRO_TILE_DIM * ivec2(
gl_WorkGroupID.x % tiles_per_pitch,
gl_WorkGroupID.x / tiles_per_pitch
);
imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, p2, p3));
}
uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
// Count how many sizes[] thresholds the invocation's byte offset (index * 16 bytes,
// i.e. four uints) has reached; the count is used as the mip level.
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 16) >= info.sizes[m] ? 1 : 0;
}
// Buffer variant: pitch is halved per mip level, clamped to at least one tile, and
// multiplied by four because every element occupies four uints in out_data.
uint tiles_per_pitch = max(((info.pitch >> mip) / MICRO_TILE_DIM), 1u) * 4;
// The work group index is scaled by the same factor of four before being split
// into tile x / y ('*' and '%' / '/' evaluate left to right).
uint target_tile_x = 4 * gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = 4 * gl_WorkGroupID.x / tiles_per_pitch;
// Linear uint index into out_data; the 64u is presumably
// MICRO_TILE_DIM * MICRO_TILE_DIM (one full tile row) — TODO confirm.
uint dw_ofs_x = (target_tile_x * MICRO_TILE_DIM) + 4 * col;
uint dw_ofs_y = ((target_tile_y * tiles_per_pitch) * 64u) + ((row * tiles_per_pitch) * MICRO_TILE_DIM);
// The four uints of the element are stored back to back.
out_data[dw_ofs_x + dw_ofs_y] = p0;
out_data[dw_ofs_x + dw_ofs_y + 1] = p1;
out_data[dw_ofs_x + dw_ofs_y + 2] = p2;
out_data[dw_ofs_x + dw_ofs_y + 3] = p3;
}

View file

@ -11,10 +11,14 @@ layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
// Shader resource interface for this detiling compute shader.
// Binding 0: std430 storage buffer exposing the source data as an unsized array of uints.
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
// NOTE(review): both a write-only r8ui image and a std430 buffer are declared at
// binding 1 here. This listing looks like a diff with its +/- markers stripped, so
// presumably only one of the two exists in the real shader — confirm.
layout(r8ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
// Push constants, accessed through `info`.
// num_levels bounds the mip-selection loop in main(); pitch is shifted right by the
// selected mip there; sizes[] holds up to 14 per-level thresholds compared against
// the invocation's byte offset (presumably cumulative level sizes — TODO confirm).
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
#define MICRO_TILE_DIM 8
@ -32,17 +36,15 @@ void main() {
uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT)
+ TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3);
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
uint mip = 0;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = max((info.pitch >> mip) / 8, 1);
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
#pragma unroll
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff));
dst_tx >>= 8;
}
uint dw_ofs_x = target_tile_x * 2 + col; // 2 = uints
uint dw_ofs_y = (target_tile_y * MICRO_TILE_DIM + row) * tiles_per_pitch * 2; // 2 = uints
out_data[dw_ofs_x + dw_ofs_y] = dst_tx;
}

View file

@ -10,10 +10,14 @@ layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
// Shader resource interface for this detiling compute shader.
// Binding 0: std430 storage buffer exposing the source data as an unsized array of uints.
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
// NOTE(review): both a write-only rg8ui image and a std430 buffer are declared at
// binding 1 here. This listing looks like a diff with its +/- markers stripped, so
// presumably only one of the two exists in the real shader — confirm.
layout(rg8ui, binding = 1) uniform writeonly uimage2D output_img;
layout(std430, binding = 1) buffer output_buf {
uint out_data[];
};
// Push constants, accessed through `info`.
// num_levels bounds the mip-selection loop in main(); pitch is shifted right by the
// selected mip there; sizes[] holds up to 14 per-level thresholds compared against
// the invocation's byte offset (presumably cumulative level sizes — TODO confirm).
layout(push_constant) uniform image_info {
uint num_levels;
uint pitch;
uint sizes[14];
} info;
#define MICRO_TILE_DIM 8
@ -44,18 +48,14 @@ void main() {
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
uint mip = 0u;
for (int m = 0; m < info.num_levels; ++m) {
mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
}
uint tiles_per_pitch = max(((info.pitch >> mip) / 8u), 1u);
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
#pragma unroll
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
uint p0 = (p[ofs] >> 8) & 0xff;
uint p1 = p[ofs] & 0xff;
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(p1, p0, 0, 0));
}
uint dw_ofs_x = target_tile_x * 8 + col;
uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * 8;
out_data[(dw_ofs_x + dw_ofs_y) / 2] = src_tx;
}