Surface management rework (2/3) (#329)

* texture_cache: interface refactoring * a few fixes and improvements * texture_cache: macro tile extents for bpp 128 * texture_cache: detiler: prefer host memory for large buffer uploads
2025-06-05 02:03:16 +00:00 · 2024-07-28 17:20:42 +02:00 · 2024-07-28 17:20:42 +02:00 · 30198d5ffc
commit 30198d5ffc
parent 0d6edaa0a0
22 changed files with 478 additions and 322 deletions
--- a/src/video_core/host_shaders/detile_m32x1.comp
+++ b/src/video_core/host_shaders/detile_m32x1.comp
@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 layout(std430, binding = 0) buffer input_buf {
    uint in_data[]; // source words read by main()
 };
-layout(r32ui, binding = 1) uniform writeonly uimage2D output_img;
+layout(std430, binding = 1) buffer output_buf {
+    uint out_data[]; // destination words written by main(); takes over binding 1 from output_img
+};

 layout(push_constant) uniform image_info {
+    uint num_levels; // number of sizes[] entries scanned by main()
    uint pitch; // main() shifts this right by the detected level, then divides by MICRO_TILE_DIM
+    uint sizes[14]; // thresholds compared against each invocation's input byte offset
 } info;

 // Inverse morton LUT, small enough to fit into K$
@ -31,20 +35,22 @@ uint rmort[16] = {
 #define TEXELS_PER_ELEMENT  (1)

 void main() {
+    uint tile_base = gl_GlobalInvocationID.x - gl_LocalInvocationID.x; // WG * 64 (local_size_x = 64); unused in the code shown here
+    uint p0 = in_data[gl_GlobalInvocationID.x]; // one source word per invocation
    uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); // each rmort entry packs four 8-bit positions
    uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
    uint col = bitfieldExtract(packed_pos, 4, 4); // bits 4..7
    uint row = bitfieldExtract(packed_pos, 0, 4); // bits 0..3

-    uint p0 = in_data[gl_GlobalInvocationID.x];
+    uint mip = 0; // counts the sizes[] thresholds this invocation's byte offset has reached
+    for (int m = 0; m < info.num_levels; ++m) {
+        mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
+    }

-    uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
+    uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1); // pitch halves per counted level; clamped to at least one tile
    uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
    uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
-
-    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
-    uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
-
-    ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
-    imageStore(output_img, img_pos, uvec4(p0, 0, 0, 0));
-}
+    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
+    uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM; // 64 presumably = MICRO_TILE_DIM squared (8, per the removed ">> 3") - confirm
+    out_data[dw_ofs_x + dw_ofs_y] = p0; // linear word index into the output buffer
+}
--- a/src/video_core/host_shaders/detile_m32x2.comp
+++ b/src/video_core/host_shaders/detile_m32x2.comp
@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 layout(std430, binding = 0) buffer input_buf {
    uint in_data[]; // source words; main() reads two consecutive words per invocation
 };
-layout(rg32ui, binding = 1) uniform writeonly uimage2D output_img;
+layout(std430, binding = 1) buffer output_buf {
+    uint out_data[]; // destination words written by main(); takes over binding 1 from output_img
+};

 layout(push_constant) uniform image_info {
+    uint num_levels; // number of sizes[] entries scanned by main()
    uint pitch; // main() shifts this right by the detected level, then divides by MICRO_TILE_DIM
+    uint sizes[14]; // thresholds compared against each invocation's input byte offset
 } info;

 // Inverse morton LUT, small enough to fit into K$
@ -30,19 +34,25 @@ uint rmort[16] = {
 #define MICRO_TILE_DIM      (8)

 void main() {
+    uint block_ofs = 2 * gl_GlobalInvocationID.x; // two input words per invocation
+    uint p0 = in_data[block_ofs + 0];
+    uint p1 = in_data[block_ofs + 1];
+    
    uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); // each rmort entry packs four 8-bit positions
    uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
    uint col = bitfieldExtract(packed_pos, 4, 4); // bits 4..7
    uint row = bitfieldExtract(packed_pos, 0, 4); // bits 0..3

-    uint block_ofs = 2 * gl_GlobalInvocationID.x;
-    uint p0 = in_data[block_ofs + 0];
-    uint p1 = in_data[block_ofs + 1];
+    uint mip = 0; // counts the sizes[] thresholds this invocation's byte offset has reached
+    for (int m = 0; m < info.num_levels; ++m) {
+        mip += (gl_GlobalInvocationID.x * 8) >= info.sizes[m] ? 1 : 0; // 8 = bytes read per invocation (two words)
+    }

-    uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
-    ivec2 img_pos = MICRO_TILE_DIM * ivec2(
-        gl_WorkGroupID.x % tiles_per_pitch,
-        gl_WorkGroupID.x / tiles_per_pitch
-    );
-    imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, 0, 0));
-}
+    uint tiles_per_pitch = max((info.pitch >> mip) / MICRO_TILE_DIM, 1) * 2; // at least one tile, then scaled by the two words each element occupies
+    uint target_tile_x = 2 * gl_WorkGroupID.x % tiles_per_pitch; // evaluates as (2 * WG) % tiles_per_pitch
+    uint target_tile_y = 2 * gl_WorkGroupID.x / tiles_per_pitch; // evaluates as (2 * WG) / tiles_per_pitch
+    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col * 2; // column offset in words
+    uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * MICRO_TILE_DIM; // 64 = MICRO_TILE_DIM * MICRO_TILE_DIM
+    out_data[dw_ofs_x + dw_ofs_y] = p0;
+    out_data[dw_ofs_x + dw_ofs_y + 1] = p1; 
+}
--- a/src/video_core/host_shaders/detile_m32x4.comp
+++ b/src/video_core/host_shaders/detile_m32x4.comp
@ -8,10 +8,14 @@ layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
 layout(std430, binding = 0) buffer input_buf {
    uint in_data[]; // source words; main() reads four consecutive words per invocation
 };
-layout(rgba32ui, binding = 1) uniform writeonly uimage2D output_img;
+layout(std430, binding = 1) buffer output_buf {
+    uint out_data[]; // destination words written by main(); takes over binding 1 from output_img
+};

 layout(push_constant) uniform image_info {
+    uint num_levels; // number of sizes[] entries scanned by main()
    uint pitch; // main() shifts this right by the detected level, then divides by MICRO_TILE_DIM
+    uint sizes[14]; // thresholds compared against each invocation's input byte offset
 } info;

 // Inverse morton LUT, small enough to fit into K$
@ -30,21 +34,29 @@ uint rmort[16] = {
 #define MICRO_TILE_DIM      (8)

 void main() {
-    uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4);
-    uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
-    uint col = bitfieldExtract(packed_pos, 4, 4);
-    uint row = bitfieldExtract(packed_pos, 0, 4);
-
    uint block_ofs = 4 * gl_GlobalInvocationID.x; // four input words per invocation
    uint p0 = in_data[block_ofs + 0];
    uint p1 = in_data[block_ofs + 1];
    uint p2 = in_data[block_ofs + 2];
    uint p3 = in_data[block_ofs + 3];

-    uint tiles_per_pitch = (info.pitch >> 3) >> 2; // log2(MICRO_TILE_DIM) / 4
-    ivec2 img_pos = MICRO_TILE_DIM * ivec2(
-        gl_WorkGroupID.x % tiles_per_pitch,
-        gl_WorkGroupID.x / tiles_per_pitch
-    );
-    imageStore(output_img, img_pos + ivec2(col, row), uvec4(p0, p1, p2, p3));
-}
+    uint bit_ofs = 8 * (gl_LocalInvocationID.x % 4); // each rmort entry packs four 8-bit positions
+    uint packed_pos = rmort[gl_LocalInvocationID.x >> 2] >> bit_ofs;
+    uint col = bitfieldExtract(packed_pos, 4, 4); // bits 4..7
+    uint row = bitfieldExtract(packed_pos, 0, 4); // bits 0..3
+
+    uint mip = 0; // counts the sizes[] thresholds this invocation's byte offset has reached
+    for (int m = 0; m < info.num_levels; ++m) {
+        mip += (gl_GlobalInvocationID.x * 16) >= info.sizes[m] ? 1 : 0; // 16 = bytes read per invocation (four words)
+    }
+
+    uint tiles_per_pitch = max(((info.pitch >> mip) / MICRO_TILE_DIM), 1u) * 4; // at least one tile, then scaled by the four words each element occupies
+    uint target_tile_x = 4 * gl_WorkGroupID.x % tiles_per_pitch; // evaluates as (4 * WG) % tiles_per_pitch
+    uint target_tile_y = 4 * gl_WorkGroupID.x / tiles_per_pitch; // evaluates as (4 * WG) / tiles_per_pitch
+    uint dw_ofs_x = (target_tile_x * MICRO_TILE_DIM) + 4 * col; // column offset in words
+    uint dw_ofs_y = ((target_tile_y * tiles_per_pitch) * 64u) + ((row * tiles_per_pitch) * MICRO_TILE_DIM); // 64 = MICRO_TILE_DIM * MICRO_TILE_DIM
+    out_data[dw_ofs_x + dw_ofs_y] = p0;
+    out_data[dw_ofs_x + dw_ofs_y + 1] = p1; 
+    out_data[dw_ofs_x + dw_ofs_y + 2] = p2; 
+    out_data[dw_ofs_x + dw_ofs_y + 3] = p3; 
+}
--- a/src/video_core/host_shaders/detile_m8x1.comp
+++ b/src/video_core/host_shaders/detile_m8x1.comp
@ -11,10 +11,14 @@ layout (local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
 layout(std430, binding = 0) buffer input_buf {
    uint in_data[]; // source words
 };
-layout(r8ui, binding = 1) uniform writeonly uimage2D output_img;
+layout(std430, binding = 1) buffer output_buf {
+    uint out_data[]; // destination words written by main(); takes over binding 1 from output_img
+};

 layout(push_constant) uniform image_info {
+    uint num_levels; // number of sizes[] entries scanned by main()
    uint pitch; // main() shifts this right by the detected level, then divides by 8
+    uint sizes[14]; // thresholds compared against gl_GlobalInvocationID.x * 4 in main()
 } info;

 #define MICRO_TILE_DIM      8
@ -32,17 +36,15 @@ void main() {
    uint row = (gl_LocalInvocationID.x % TEXELS_PER_ELEMENT)
                + TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x >> 3);

-    uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
+    uint mip = 0;
+    for (int m = 0; m < info.num_levels; ++m) {
+        mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
+    }
+
+    uint tiles_per_pitch = max((info.pitch >> mip) / 8, 1);
    uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
    uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
-    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + TEXELS_PER_ELEMENT * col;
-    uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
-
-    ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
-
-    #pragma unroll
-    for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
-        imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(dst_tx & 0xff));
-        dst_tx >>= 8;
-    }
+    uint dw_ofs_x = target_tile_x * 2 + col; // 2 = uints
+    uint dw_ofs_y = (target_tile_y * MICRO_TILE_DIM + row) * tiles_per_pitch * 2; // 2 = uints
+    out_data[dw_ofs_x + dw_ofs_y] = dst_tx;
 }
--- a/src/video_core/host_shaders/detile_m8x2.comp
+++ b/src/video_core/host_shaders/detile_m8x2.comp
@ -10,10 +10,14 @@ layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
 layout(std430, binding = 0) buffer input_buf {
    uint in_data[]; // source words
 };
-layout(rg8ui, binding = 1) uniform writeonly uimage2D output_img;
+layout(std430, binding = 1) buffer output_buf {
+    uint out_data[]; // destination words written by main(); takes over binding 1 from output_img
+};

 layout(push_constant) uniform image_info {
+    uint num_levels; // number of sizes[] entries scanned by main()
    uint pitch; // main() shifts this right by the detected level, then divides by 8
+    uint sizes[14]; // thresholds compared against gl_GlobalInvocationID.x * 4 in main()
 } info;

 #define MICRO_TILE_DIM      8
@ -44,18 +48,14 @@ void main() {
    uint col = bitfieldExtract(packed_pos, 4, 4);
    uint row = bitfieldExtract(packed_pos, 0, 4);

-    uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
+    uint mip = 0u;
+    for (int m = 0; m < info.num_levels; ++m) {
+        mip += (gl_GlobalInvocationID.x * 4) >= info.sizes[m] ? 1 : 0;
+    }    
+    uint tiles_per_pitch = max(((info.pitch >> mip) / 8u), 1u);
    uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
    uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
-    uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
-    uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
-
-    ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
-
-    #pragma unroll
-    for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
-        uint p0 = (p[ofs] >> 8) & 0xff;
-        uint p1 =  p[ofs]       & 0xff;
-        imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(p1, p0, 0, 0));
-    }
+    uint dw_ofs_x = target_tile_x * 8 + col;
+    uint dw_ofs_y = (target_tile_y * tiles_per_pitch * 64) + row * tiles_per_pitch * 8;
+    out_data[(dw_ofs_x + dw_ofs_y) / 2] = src_tx;
 }