[video_core] Implement GPU-accelerated texture unswizzling and optimize sparse texture handling (#3246)

- [Added] a new compute shader to handle block-linear unswizzling on the GPU, reducing CPU overhead during texture uploads
- [Implemented] BlockLinearUnswizzle3DPass to take advantage of the new compute shader (not implemented for OpenGL)
- [Implemented] a texture streaming and queue system for large sparse textures to prevent hitches
- [Implemented] an aggressive garbage collection system that ejects large sparse textures to save memory (currently unused)
- [Added] user settings to adjust the streaming unswizzle system for low-end machines
- [Improved] the ASTC GPU decoding system slightly

Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: DraVee <dravee@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3246
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: DraVee <dravee@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: Forrest Keller <forrestmarkx@outlook.com>
Co-committed-by: Forrest Keller <forrestmarkx@outlook.com>
2026-04-24 03:19:00 +02:00 · 2026-01-13 19:18:08 +01:00 · 2026-01-13 19:18:08 +01:00 · ecd01e13fd
commit ecd01e13fd
parent f544004b5d
20 changed files with 1076 additions and 83 deletions
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@ -18,6 +18,7 @@ set(SHADER_FILES
    blit_color_float.frag
    block_linear_unswizzle_2d.comp
    block_linear_unswizzle_3d.comp
+    block_linear_unswizzle_3d_bcn.comp
    convert_abgr8_srgb_to_d24s8.frag
    convert_abgr8_to_d24s8.frag
    convert_abgr8_to_d32f.frag
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@ -727,70 +727,35 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, ui
 }

 uint UnquantizeTexelWeight(EncodingData val) {
-    const uint encoding = Encoding(val);
-    const uint bitlen = NumBits(val);
-    const uint bitval = BitValue(val);
-    const uint A = ReplicateBitTo7((bitval & 1));
-    uint B = 0, C = 0, D = 0;
-    uint result = 0;
-    const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
-    switch (encoding) {
-    case JUST_BITS:
-        return FastReplicateTo6(bitval, bitlen);
-    case TRIT: {
+    uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val);
+    if (encoding == JUST_BITS) {
+        return (bitlen >= 1 && bitlen <= 5)
+            ? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1)))
+            : FastReplicateTo6(bitval, bitlen);
+    } else if (encoding == TRIT || encoding == QUINT) {
+        uint B = 0, C = 0, D = 0;
+        uint b_mask = (0x3100 >> (bitlen * 4)) & 0xf;
+        uint b = (bitval >> 1) & b_mask;
        D = QuintTritValue(val);
-        switch (bitlen) {
-        case 0:
-            return bitlen_0_results[D * 2];
-        case 1: {
-            C = 50;
-            break;
+        if (encoding == TRIT) {
+            switch (bitlen) {
+            case 0: return D * 32; //0,32,64
+            case 1: C = 50; break;
+            case 2: C = 23; B = (b << 6) | (b << 2) | b; break;
+            case 3: C = 11; B = (b << 5) | b; break;
+            }
+        } else if (encoding == QUINT) {
+            switch (bitlen) {
+            case 0: return D * 16; //0, 16, 32, 48, 64
+            case 1: C = 28; break;
+            case 2: C = 13; B = (b << 6) | (b << 1); break;
+            }
        }
-        case 2: {
-            C = 23;
-            const uint b = (bitval >> 1) & 1;
-            B = (b << 6) | (b << 2) | b;
-            break;
-        }
-        case 3: {
-            C = 11;
-            const uint cb = (bitval >> 1) & 3;
-            B = (cb << 5) | cb;
-            break;
-        }
-        default:
-            break;
-        }
-        break;
+        uint A = ReplicateBitTo7(bitval & 1);
+        uint res = (A & 0x20) | (((D * C + B) ^ A) >> 2);
+        return res + (res > 32 ? 1 : 0);
    }
-    case QUINT: {
-        D = QuintTritValue(val);
-        switch (bitlen) {
-        case 0:
-            return bitlen_0_results[D];
-        case 1: {
-            C = 28;
-            break;
-        }
-        case 2: {
-            C = 13;
-            const uint b = (bitval >> 1) & 1;
-            B = (b << 6) | (b << 1);
-            break;
-        }
-        }
-        break;
-    }
-    }
-    if (encoding != JUST_BITS && bitlen > 0) {
-        result = D * C + B;
-        result ^= A;
-        result = (A & 0x20) | (result >> 2);
-    }
-    if (result > 32) {
-        result += 1;
-    }
-    return result;
+    return 0;
 }

 void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
@ -1159,10 +1124,11 @@ void DecompressBlock(ivec3 coord) {
 }

 uint SwizzleOffset(uvec2 pos) {
-    const uint x = pos.x;
-    const uint y = pos.y;
-    return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
-            ((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
+    return ((pos.x & 32u) << 3u) |
+           ((pos.y & 6u)  << 5u) |
+           ((pos.x & 16u) << 1u) |
+           ((pos.y & 1u)  << 4u) |
+           (pos.x & 15u);
 }

 void main() {
--- a/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
@ -0,0 +1,160 @@
+// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#version 430
+
+// Backend configuration: selects the extensions that provide 8/16-bit buffer
+// element types, the push-constant/uniform declaration macros, and the storage
+// buffer binding slots used by the declarations further down.
+#ifdef VULKAN
+    #extension GL_EXT_shader_16bit_storage : require
+    #extension GL_EXT_shader_8bit_storage  : require
+    #define HAS_EXTENDED_TYPES 1
+    #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+    #define END_PUSH_CONSTANTS };
+    #define UNIFORM(n)
+    #define BINDING_SWIZZLE_BUFFER 0
+    #define BINDING_INPUT_BUFFER   1
+    #define BINDING_OUTPUT_BUFFER  2
+#else
+    #extension GL_NV_gpu_shader5 : enable
+    #ifdef GL_NV_gpu_shader5
+        #define HAS_EXTENDED_TYPES 1
+    #else
+        #define HAS_EXTENDED_TYPES 0
+    #endif
+    #define BEGIN_PUSH_CONSTANTS
+    #define END_PUSH_CONSTANTS
+    #define UNIFORM(n) layout(location = n) uniform
+    #define BINDING_SWIZZLE_BUFFER 0
+    #define BINDING_INPUT_BUFFER   1
+    // The output is a storage buffer, and storage buffers share one binding
+    // namespace: reusing slot 0 would make the output alias the swizzle table.
+    #define BINDING_OUTPUT_BUFFER  2
+#endif
+
+// --- Push Constants / Uniforms ---
+#ifdef VULKAN
+// Push-constant blocks are laid out with std430 rules: a 3-component vector is
+// 16-byte aligned, and a scalar that follows it may occupy its last 4 bytes.
+layout(push_constant) uniform PushConstants {
+    uvec3 blocks_dim;           // Offset 0
+    uint bytes_per_block_log2;  // Offset 12
+
+    uvec3 origin;               // Offset 16
+    uint slice_size;            // Offset 28
+
+    uint block_size;            // Offset 32
+    uint x_shift;               // Offset 36
+    uint block_height;          // Offset 40
+    uint block_height_mask;     // Offset 44
+
+    uint block_depth;           // Offset 48
+    uint block_depth_mask;      // Offset 52
+    int _pad;                   // Offset 56
+
+    ivec3 destination;          // Offset 64 (16-byte aligned, so it cannot start at 60)
+} pc;
+#else
+// The shader body addresses every parameter as `pc.member`. Defining `pc` as an
+// empty macro would turn that into `.member`, which does not compile, so the
+// parameters are grouped in a struct and exposed through a uniform instance
+// named `pc`. Struct members receive consecutive locations in declaration
+// order, starting at the location given on the instance.
+struct PushConstants {
+    uvec3 origin;               // Location 0
+    ivec3 destination;          // Location 1
+    uint  bytes_per_block_log2; // Location 2
+    uint  slice_size;           // Location 3
+    uint  block_size;           // Location 4
+    uint  x_shift;              // Location 5
+    uint  block_height;         // Location 6
+    uint  block_height_mask;    // Location 7
+    uint  block_depth;          // Location 8
+    uint  block_depth_mask;     // Location 9
+    uvec3 blocks_dim;           // Location 10
+};
+layout(location = 0) uniform PushConstants pc;
+#endif
+
+// --- Buffers: swizzle lookup table, swizzled input (several views of one binding), linear output ---
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[]; // read by SwizzleOffset() as [y * 64 + x]
+};
+
+#if HAS_EXTENDED_TYPES
+    layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8   { uint8_t  u8data[];  }; // 1-byte elements
+    layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16  { uint16_t u16data[]; }; // 2-byte elements
+#endif
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32  { uint   u32data[];  }; // 4-byte elements
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64  { uvec2  u64data[];  }; // 8-byte elements
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4  u128data[]; }; // 16-byte elements
+
+layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer {
+    uint out_u32[]; // written by main() in 32-bit words
+};
+
+// --- Constants: workgroup shape and GOB geometry used by the address computation ---
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in; // 256 invocations per workgroup
+
+const uint GOB_SIZE_X = 64; // compared against pos.x, which main() computes in bytes
+const uint GOB_SIZE_Y = 8;  // compared against pos.y, which main() computes in block rows
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE   = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; // 512
+
+const uint GOB_SIZE_X_SHIFT = 6; // log2(GOB_SIZE_X)
+const uint GOB_SIZE_Y_SHIFT = 3; // log2(GOB_SIZE_Y)
+const uint GOB_SIZE_Z_SHIFT = 0; // log2(GOB_SIZE_Z)
+const uint GOB_SIZE_SHIFT   = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; // log2(GOB_SIZE) = 9
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u); // keeps only the in-GOB part of (x, y)
+
+// --- Helpers ---
+// Looks up the swizzled offset of a position inside its GOB.
+// Only the in-GOB part of pos (x in 0..63, y in 0..7) takes part in the lookup;
+// the table is addressed row by row with GOB_SIZE_X entries per row.
+uint SwizzleOffset(uvec2 pos) {
+    uvec2 in_gob = pos & SWIZZLE_MASK;
+    uint table_index = in_gob.y * GOB_SIZE_X + in_gob.x;
+    return swizzle_table[table_index];
+}
+
+// Loads one block of 2^bytes_per_block_log2 bytes from the input buffer.
+// `offset` is a byte offset; it is converted to an element index of the view
+// that matches the block size. The value is returned in the low components of
+// the result and the unused components are zero. Block sizes above 16 bytes
+// yield an all-zero result.
+uvec4 ReadTexel(uint offset) {
+    const uint size_log2 = pc.bytes_per_block_log2;
+    if (size_log2 == 4u) {
+        return u128data[offset >> 4u];
+    }
+    if (size_log2 == 3u) {
+        return uvec4(u64data[offset >> 3u], 0u, 0u);
+    }
+    if (size_log2 == 2u) {
+        return uvec4(u32data[offset >> 2u], 0u, 0u, 0u);
+    }
+#if HAS_EXTENDED_TYPES
+    if (size_log2 == 1u) {
+        return uvec4(u16data[offset >> 1u], 0u, 0u, 0u);
+    }
+    if (size_log2 == 0u) {
+        return uvec4(u8data[offset], 0u, 0u, 0u);
+    }
+#else
+    // Without 8/16-bit element types, pick the byte or half-word out of the
+    // 32-bit word that contains it.
+    if (size_log2 == 1u) {
+        return uvec4(bitfieldExtract(u32data[offset >> 2u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
+    }
+    if (size_log2 == 0u) {
+        return uvec4(bitfieldExtract(u32data[offset >> 2u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
+    }
+#endif
+    return uvec4(0u);
+}
+
+// Copies one block per invocation from the block-linear input buffer to a
+// tightly packed, linear output buffer.
+//
+// The output is addressed in 32-bit words with a stride of bytes_per_block / 4
+// words per block. Blocks of 4, 8 and 16 bytes are handled; for blocks smaller
+// than 4 bytes the stride evaluates to zero, so those sizes are not supported.
+void main() {
+    uvec3 block_coord = gl_GlobalInvocationID;
+    // Invocations outside the requested extent have nothing to copy.
+    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
+        return;
+    }
+
+    uint bytes_per_block = 1u << pc.bytes_per_block_log2;
+    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats)
+    // X is then scaled to a byte column; Z is taken as-is.
+    uvec3 pos;
+    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
+    pos.y = block_coord.y + (pc.origin.y >> 2u);
+    pos.z = block_coord.z + pc.origin.z;
+
+    uint swizzle = SwizzleOffset(pos.xy);
+    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
+    uint offset  = 0u;
+    // Apply block-linear offsets
+    offset += (pos.z >> pc.block_depth) * pc.slice_size;
+    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
+    offset += (block_y >> pc.block_height) * pc.block_size;
+    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
+    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
+    offset += swizzle;
+
+    uvec4 texel = ReadTexel(offset);
+
+    // Calculate linear output index
+    uint block_index = block_coord.x +
+                       (block_coord.y * pc.blocks_dim.x) +
+                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
+    uint out_idx = block_index * (bytes_per_block >> 2u);
+
+    out_u32[out_idx] = texel.x;
+    // Only write the words this block owns. A 4-byte block owns a single word;
+    // writing a second one would zero the next block's slot and race with the
+    // invocation that fills it.
+    if (pc.bytes_per_block_log2 >= 3u) {
+        out_u32[out_idx + 1u] = texel.y;
+    }
+    if (pc.bytes_per_block_log2 == 4u) {
+        out_u32[out_idx + 2u] = texel.z;
+        out_u32[out_idx + 3u] = texel.w;
+    }
+}