[video_core] Implement GPU-accelerated texture unswizzling and optimize sparse texture handling (#3246)

- [Added] a new compute shader to handle block-linear unswizzling on the GPU, reducing CPU overhead during texture uploads
- [Implemented] BlockLinearUnswizzle3DPass to take advantage of the new compute shader; not implemented for OpenGL
- [Implemented] texture streaming and queue system for large sparse textures to prevent hitches
- [Implemented] aggressive garbage collection system to evict large sparse textures and save memory (currently unused)
- [Added] user settings to adjust the streaming unswizzle system for low-end machines
- [Improved] the ASTC GPU decoding system slightly

Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: DraVee <dravee@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3246
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: DraVee <dravee@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: Forrest Keller <forrestmarkx@outlook.com>
Co-committed-by: Forrest Keller <forrestmarkx@outlook.com>
This commit is contained in:
Forrest Keller 2026-01-13 19:18:08 +01:00 committed by crueter
parent f544004b5d
commit ecd01e13fd
No known key found for this signature in database
GPG key ID: 425ACD2D4830EBC6
20 changed files with 1076 additions and 83 deletions

View file

@ -18,6 +18,7 @@ set(SHADER_FILES
blit_color_float.frag
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
block_linear_unswizzle_3d_bcn.comp
convert_abgr8_srgb_to_d24s8.frag
convert_abgr8_to_d24s8.frag
convert_abgr8_to_d32f.frag

View file

@ -727,70 +727,35 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, ui
}
uint UnquantizeTexelWeight(EncodingData val) {
const uint encoding = Encoding(val);
const uint bitlen = NumBits(val);
const uint bitval = BitValue(val);
const uint A = ReplicateBitTo7((bitval & 1));
uint B = 0, C = 0, D = 0;
uint result = 0;
const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
switch (encoding) {
case JUST_BITS:
return FastReplicateTo6(bitval, bitlen);
case TRIT: {
uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val);
if (encoding == JUST_BITS) {
return (bitlen >= 1 && bitlen <= 5)
? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1)))
: FastReplicateTo6(bitval, bitlen);
} else if (encoding == TRIT || encoding == QUINT) {
uint B = 0, C = 0, D = 0;
uint b_mask = (0x3100 >> (bitlen * 4)) & 0xf;
uint b = (bitval >> 1) & b_mask;
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D * 2];
case 1: {
C = 50;
break;
if (encoding == TRIT) {
switch (bitlen) {
case 0: return D * 32; //0,32,64
case 1: C = 50; break;
case 2: C = 23; B = (b << 6) | (b << 2) | b; break;
case 3: C = 11; B = (b << 5) | b; break;
}
} else if (encoding == QUINT) {
switch (bitlen) {
case 0: return D * 16; //0, 16, 32, 48, 64
case 1: C = 28; break;
case 2: C = 13; B = (b << 6) | (b << 1); break;
}
}
case 2: {
C = 23;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 2) | b;
break;
}
case 3: {
C = 11;
const uint cb = (bitval >> 1) & 3;
B = (cb << 5) | cb;
break;
}
default:
break;
}
break;
uint A = ReplicateBitTo7(bitval & 1);
uint res = (A & 0x20) | (((D * C + B) ^ A) >> 2);
return res + (res > 32 ? 1 : 0);
}
case QUINT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D];
case 1: {
C = 28;
break;
}
case 2: {
C = 13;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 1);
break;
}
}
break;
}
}
if (encoding != JUST_BITS && bitlen > 0) {
result = D * C + B;
result ^= A;
result = (A & 0x20) | (result >> 2);
}
if (result > 32) {
result += 1;
}
return result;
return 0;
}
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
@ -1159,10 +1124,11 @@ void DecompressBlock(ivec3 coord) {
}
uint SwizzleOffset(uvec2 pos) {
const uint x = pos.x;
const uint y = pos.y;
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
return ((pos.x & 32u) << 3u) |
((pos.y & 6u) << 5u) |
((pos.x & 16u) << 1u) |
((pos.y & 1u) << 4u) |
(pos.x & 15u);
}
void main() {

View file

@ -0,0 +1,160 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 2
#else
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 0
#endif
// --- Push Constants / Uniforms ---
#ifdef VULKAN
layout(push_constant) uniform PushConstants {
uvec3 blocks_dim; // Offset 0
uint bytes_per_block_log2; // Offset 12
uvec3 origin; // Offset 16
uint slice_size; // Offset 28
uint block_size; // Offset 32
uint x_shift; // Offset 36
uint block_height; // Offset 40
uint block_height_mask; // Offset 44
uint block_depth; // Offset 48
uint block_depth_mask; // Offset 52
int _pad; // Offset 56
ivec3 destination; // Offset 60
} pc;
#else
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
UNIFORM(10) uvec3 blocks_dim;
END_PUSH_CONSTANTS
#define pc // Map pc prefix to nothing for OpenGL compatibility
#endif
// --- Buffers ---
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer {
uint out_u32[];
};
// --- Constants ---
layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers ---
// Look up the intra-GOB byte offset for the texel at (x, y).
// Coordinates wrap at the GOB footprint (64x8 bytes) before indexing the LUT.
uint SwizzleOffset(uvec2 pos) {
    const uvec2 gob_pos = pos & SWIZZLE_MASK;
    return swizzle_table[(gob_pos.y << GOB_SIZE_X_SHIFT) | gob_pos.x];
}
// Fetch one block's worth of data from the block-linear input buffer.
// `offset` is a byte offset; the aliased storage-buffer views of
// BINDING_INPUT_BUFFER (u8/u16/u32/u64/u128) let us load at the block's
// natural width, selected by bytes_per_block_log2. Unused components of the
// returned vector are zero-filled.
uvec4 ReadTexel(uint offset) {
    uint bpl2 = pc.bytes_per_block_log2;
    switch (bpl2) {
#if HAS_EXTENDED_TYPES
    // Native 8-/16-bit loads when the storage extensions are available.
    case 0u: return uvec4(u8data[offset], 0u, 0u, 0u);
    case 1u: return uvec4(u16data[offset / 2u], 0u, 0u, 0u);
#else
    // Emulate sub-32-bit loads by extracting the byte/halfword from the
    // containing uint (bit position derived from the byte offset).
    case 0u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
    case 1u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
#endif
    case 2u: return uvec4(u32data[offset / 4u], 0u, 0u, 0u);
    case 3u: return uvec4(u64data[offset / 8u], 0u, 0u);
    case 4u: return u128data[offset / 16u];
    }
    // Unreachable for valid bytes_per_block_log2 (0..4).
    return uvec4(0u);
}
// Unswizzle one BCn block per invocation: read it from the block-linear
// (swizzled) source buffer and store it at its linear position in the
// output buffer.
void main() {
    uvec3 block_coord = gl_GlobalInvocationID;
    // The dispatch is rounded up to workgroup granularity; drop the
    // out-of-range threads.
    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
        return;
    }
    uint bytes_per_block = 1u << pc.bytes_per_block_log2;
    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats).
    // X is scaled to a byte coordinate because the GOB layout addresses
    // bytes horizontally.
    uvec3 pos;
    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
    pos.y = block_coord.y + (pc.origin.y >> 2u);
    pos.z = block_coord.z + pc.origin.z;
    uint swizzle = SwizzleOffset(pos.xy);
    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
    uint offset = 0u;
    // Apply block-linear offsets: whole depth slices, Z within a block,
    // GOB rows, Y within a block, X GOB column, then the intra-GOB swizzle.
    offset += (pos.z >> pc.block_depth) * pc.slice_size;
    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
    offset += (block_y >> pc.block_height) * pc.block_size;
    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
    offset += swizzle;
    uvec4 texel = ReadTexel(offset);
    // Calculate linear output index; each block occupies
    // bytes_per_block / 4 uints of the output buffer.
    uint block_index = block_coord.x +
                       (block_coord.y * pc.blocks_dim.x) +
                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
    uint out_idx = block_index * (bytes_per_block >> 2u);
    out_u32[out_idx] = texel.x;
    // Only blocks of 8 bytes or more carry a second uint; writing it for
    // smaller formats would overwrite the next block's output slot.
    if (pc.bytes_per_block_log2 >= 3u) {
        out_u32[out_idx + 1u] = texel.y;
    }
    if (pc.bytes_per_block_log2 == 4u) {
        out_u32[out_idx + 2u] = texel.z;
        out_u32[out_idx + 3u] = texel.w;
    }
}

View file

@ -556,7 +556,7 @@ void TextureCacheRuntime::Finish() {
glFinish();
}
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.RequestUploadBuffer(size);
}
@ -651,7 +651,8 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
}
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const SwizzleParameters> swizzles) {
std::span<const SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
switch (image.info.type) {
case ImageType::e2D:
if (IsPixelFormatASTC(image.info.format)) {

View file

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -72,7 +75,7 @@ public:
void Finish();
StagingBufferMap UploadStagingBuffer(size_t size);
StagingBufferMap UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferMap DownloadStagingBuffer(size_t size, bool deferred = false);
@ -116,7 +119,8 @@ public:
Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles);
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier();
@ -223,6 +227,8 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);

View file

@ -24,6 +24,7 @@
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@ -622,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@ -637,9 +638,292 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier);
});
}
// Descriptor interface of the 3D block-linear (BCn) unswizzle compute pass:
// binding 0 = swizzle LUT, binding 1 = swizzled source, binding 2 = linear output.
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1;
constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2;
// Set layout: three storage buffers, all visible to the compute stage only.
constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{
    {
        .binding = BL3D_BINDING_SWIZZLE_TABLE,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[]
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
    {
        .binding = BL3D_BINDING_INPUT_BUFFER,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
    {
        .binding = BL3D_BINDING_OUTPUT_BUFFER,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // linear output
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
}};
// Pool sizing hint: each descriptor set of this pass consumes exactly three
// storage buffers and nothing else.
constexpr DescriptorBankInfo BL3D_BANK_INFO{
    .uniform_buffers = 0,
    .storage_buffers = 3,
    .texture_buffers = 0,
    .image_buffers = 0,
    .textures = 0,
    .images = 0,
    .score = 3,
};
// Update template matching the set layout above; each entry reads one
// DescriptorUpdateEntry from the packed data produced by the descriptor queue.
constexpr std::array<VkDescriptorUpdateTemplateEntry, 3>
    BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
        {
            .dstBinding = BL3D_BINDING_SWIZZLE_TABLE,
            .dstArrayElement = 0,
            .descriptorCount = 1,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        },
        {
            .dstBinding = BL3D_BINDING_INPUT_BUFFER,
            .dstArrayElement = 0,
            .descriptorCount = 1,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        },
        {
            .dstBinding = BL3D_BINDING_OUTPUT_BUFFER,
            .dstArrayElement = 0,
            .descriptorCount = 1,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        }
    }};
// CPU mirror of the push-constant block declared in
// block_linear_unswizzle_3d_bcn.comp. Field order and padding must track the
// shader's declared offsets.
// NOTE(review): under std430 rules the shader's `ivec3 destination` is
// 16-byte aligned (offset 64), not 60 as annotated here — harmless today
// because the shader never reads `destination`, but confirm before using it.
struct alignas(16) BlockLinearUnswizzle3DPushConstants {
    u32 blocks_dim[3];         // Offset 0
    u32 bytes_per_block_log2;  // Offset 12
    u32 origin[3];             // Offset 16
    u32 slice_size;            // Offset 28
    u32 block_size;            // Offset 32
    u32 x_shift;               // Offset 36
    u32 block_height;          // Offset 40
    u32 block_height_mask;     // Offset 44
    u32 block_depth;           // Offset 48
    u32 block_depth_mask;      // Offset 52
    s32 _pad;                  // Offset 56
    s32 destination[3];        // Offset 60
    s32 _pad_end;              // Offset 72
};
// Vulkan guarantees at least 128 bytes of push-constant space.
static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128);
// Builds the compute pass from the precompiled BCn unswizzle SPIR-V, using
// the three-storage-buffer descriptor layout declared above and a push
// constant range sized for BlockLinearUnswizzle3DPushConstants.
BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass(
    const Device& device_, Scheduler& scheduler_,
    DescriptorPool& descriptor_pool_,
    StagingBufferPool& staging_buffer_pool_,
    ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
    : ComputePass(
          device_, descriptor_pool_,
          BL3D_DESCRIPTOR_SET_BINDINGS,
          BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY,
          BL3D_BANK_INFO,
          COMPUTE_PUSH_CONSTANT_RANGE<sizeof(BlockLinearUnswizzle3DPushConstants)>,
          BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV),
      scheduler{scheduler_},
      staging_buffer_pool{staging_buffer_pool_},
      compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}

BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default;
// Unswizzles a block-linear 3D BCn image on the GPU in Z-slice batches and
// copies each batch into the destination image.
//
// `swizzled` holds the raw guest data; `swizzles` must describe exactly one
// mip level. Slices [z_start, z_start + z_count) are processed in chunks no
// larger than the image depth so the intermediate buffer stays bounded.
void BlockLinearUnswizzle3DPass::Unswizzle(
    Image& image,
    const StagingBufferRef& swizzled,
    std::span<const VideoCommon::SwizzleParameters> swizzles,
    u32 z_start, u32 z_count)
{
    using namespace VideoCommon::Accelerated;
    const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
    if (MAX_BATCH_SLICES == 0) {
        // Nothing to upload; also prevents a zero-step (infinite) batching
        // loop below when z_count or the image depth is zero.
        return;
    }
    if (!image.has_compute_unswizzle_buffer) {
        // Lazily allocate the intermediate linear buffer, sized for one batch.
        // NOTE(review): the buffer persists for the image's lifetime; a later
        // call with a larger batch would reuse the smaller buffer — confirm
        // callers never grow z_count after the first upload.
        image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
    }
    ASSERT(swizzles.size() == 1);
    const auto& sw = swizzles[0];
    const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
    // BCn encodes 4x4 texel blocks; round dimensions up to whole blocks.
    const u32 blocks_x = (image.info.size.width + 3) / 4;
    const u32 blocks_y = (image.info.size.height + 3) / 4;
    scheduler.RequestOutsideRenderPassOperationContext();
    for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
        const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset);
        const u32 current_z_start = z_start + z_offset;
        UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
                       current_z_start, current_chunk_slices);
    }
}
// Records one batched unswizzle: dispatches the compute shader over
// [z_start, z_start + z_count) slices, then copies the linear result from the
// intermediate buffer into the destination image at the matching Z offset.
// Ends with scheduler.Finish() so the shared intermediate buffer can be
// reused by the next chunk.
void BlockLinearUnswizzle3DPass::UnswizzleChunk(
    Image& image,
    const StagingBufferRef& swizzled,
    const VideoCommon::SwizzleParameters& sw,
    const BlockLinearSwizzle3DParams& params,
    u32 blocks_x, u32 blocks_y,
    u32 z_start, u32 z_count)
{
    // Mirror of the shader's push-constant block (see
    // block_linear_unswizzle_3d_bcn.comp).
    BlockLinearUnswizzle3DPushConstants pc{};
    pc.origin[0] = params.origin[0];
    pc.origin[1] = params.origin[1];
    pc.origin[2] = z_start; // Current chunk's Z start
    pc.destination[0] = params.destination[0];
    pc.destination[1] = params.destination[1];
    pc.destination[2] = 0; // Shader writes to start of output buffer
    pc.bytes_per_block_log2 = params.bytes_per_block_log2;
    pc.slice_size = params.slice_size;
    pc.block_size = params.block_size;
    pc.x_shift = params.x_shift;
    pc.block_height = params.block_height;
    pc.block_height_mask = params.block_height_mask;
    pc.block_depth = params.block_depth;
    pc.block_depth_mask = params.block_depth_mask;
    pc.blocks_dim[0] = blocks_x;
    pc.blocks_dim[1] = blocks_y;
    pc.blocks_dim[2] = z_count; // Only process the count
    // Bind: swizzle LUT, block-linear source (offset to this mip), linear
    // destination buffer.
    compute_pass_descriptor_queue.Acquire();
    compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0,
                                            image.runtime->swizzle_table_size);
    compute_pass_descriptor_queue.AddBuffer(swizzled.buffer,
                                            sw.buffer_offset + swizzled.offset,
                                            image.guest_size_bytes - sw.buffer_offset);
    compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0,
                                            image.compute_unswizzle_buffer_size);
    const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
    const VkDescriptorSet set = descriptor_allocator.Commit();
    // One invocation per BCn block; workgroup size is 8x8x4 (shader local_size).
    const u32 gx = Common::DivCeil(blocks_x, 8u);
    const u32 gy = Common::DivCeil(blocks_y, 8u);
    const u32 gz = Common::DivCeil(z_count, 4u);
    const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
    const VkDeviceSize output_slice_size =
        static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
    const VkDeviceSize barrier_size = output_slice_size * z_count;
    // The first chunk may discard prior image contents (UNDEFINED layout);
    // later chunks must preserve the slices already copied.
    const bool is_first_chunk = (z_start == 0);
    // Capture raw handles by value: the lambda runs later on the worker thread.
    const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
    const VkImage dst_image = image.Handle();
    const VkImageAspectFlags aspect = image.AspectMask();
    const u32 image_width = image.info.size.width;
    const u32 image_height = image.info.size.height;
    scheduler.Record([this, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
                      barrier_size, is_first_chunk, out_buffer, dst_image, aspect,
                      image_width, image_height
                      ](vk::CommandBuffer cmdbuf) {
        if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
            return;
        }
        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
        cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
        cmdbuf.Dispatch(gx, gy, gz);
        // Single barrier for compute -> transfer (buffer ready, image transition)
        const VkBufferMemoryBarrier buffer_barrier{
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .buffer = out_buffer,
            .offset = 0,
            .size = barrier_size,
        };
        // Image layout transition
        const VkImageMemoryBarrier pre_barrier{
            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = is_first_chunk ? VkAccessFlags{} :
                             static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
            .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
            .oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED :
                         VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .image = dst_image,
            // NOTE(review): mip 0 / layer 0 only — relies on callers gating
            // this pass to single-level, single-layer images.
            .subresourceRange = {aspect, 0, 1, 0, 1},
        };
        // Single barrier handles both buffer and image
        cmdbuf.PipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            VK_PIPELINE_STAGE_TRANSFER_BIT,
            0,
            nullptr, buffer_barrier, pre_barrier
        );
        // Copy chunk to correct Z position in image
        const VkBufferImageCopy copy{
            .bufferOffset = 0, // Read from start of staging buffer
            .bufferRowLength = 0,
            .bufferImageHeight = 0,
            .imageSubresource = {aspect, 0, 0, 1},
            .imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
            .imageExtent = {image_width, image_height, z_count},
        };
        cmdbuf.CopyBufferToImage(out_buffer, dst_image,
                                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
        // Post-copy transition
        const VkImageMemoryBarrier post_barrier{
            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
            .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            .newLayout = VK_IMAGE_LAYOUT_GENERAL,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .image = dst_image,
            .subresourceRange = {aspect, 0, 1, 0, 1},
        };
        cmdbuf.PipelineBarrier(
            VK_PIPELINE_STAGE_TRANSFER_BIT,
            VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            0,
            nullptr, nullptr, post_barrier
        );
    });
    // Wait for the GPU so the shared intermediate buffer can be reused by the
    // next chunk.
    // NOTE(review): a full Finish() per chunk serializes CPU and GPU; a fence
    // per chunk or a double-buffered intermediate would likely be cheaper.
    scheduler.Finish();
}
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View file

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -14,6 +17,7 @@
#include "video_core/texture_cache/types.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
namespace VideoCommon {
struct SwizzleParameters;
@ -21,6 +25,8 @@ struct SwizzleParameters;
namespace Vulkan {
using VideoCommon::Accelerated::BlockLinearSwizzle3DParams;
class Device;
class StagingBufferPool;
class Scheduler;
@ -131,6 +137,34 @@ private:
MemoryAllocator& memory_allocator;
};
class BlockLinearUnswizzle3DPass final : public ComputePass {
public:
explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
~BlockLinearUnswizzle3DPass();
void Unswizzle(Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class MSAACopyPass final : public ComputePass {
public:
explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View file

@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
: device{device_}, state_tracker{state_tracker_},
master_semaphore{std::make_unique<MasterSemaphore>(device)},
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
/*// PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes
{
std::scoped_lock rl{reserve_mutex};
chunk_reserve.reserve(2048); // Prevent vector resizing
for (int i = 0; i < 1024; ++i) {
chunk_reserve.push_back(std::make_unique<CommandChunk>());
}
}*/
AcquireNewChunk();
AllocateWorkerCommandBuffer();
worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });

View file

@ -24,12 +24,14 @@
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/textures/decoders.h"
namespace Vulkan {
@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
}
}
}
bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool,
staging_buffer_pool, compute_pass_descriptor_queue);
// --- Create swizzle table buffer ---
{
auto table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_size = static_cast<VkDeviceSize>(table.size() * sizeof(table[0]));
auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload);
std::memcpy(staging.mapped_span.data(), table.data(), static_cast<size_t>(swizzle_table_size));
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = swizzle_table_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([staging_buf = staging.buffer,
dst_buf = *swizzle_table_buffer,
size = swizzle_table_size,
src_off = staging.offset](vk::CommandBuffer cmdbuf) {
const VkBufferCopy region{
.srcOffset = src_off,
.dstOffset = 0,
.size = size,
};
cmdbuf.CopyBuffer(staging_buf, dst_buf, region);
});
}
}
// Blocks the CPU until the GPU has executed all work recorded so far.
void TextureCacheRuntime::Finish() {
    scheduler.Finish();
}
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload);
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred);
}
StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
// Creates (once) the device-local intermediate buffer that receives the
// GPU-unswizzled linear BCn data before it is copied into the image.
// `max_slices` caps how many Z slices a single batch may hold; the buffer is
// sized for min(max_slices, depth) slices and kept for the image's lifetime.
void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
    if (has_compute_unswizzle_buffer)
        return;
    using VideoCore::Surface::BytesPerBlock;
    const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
    // BCn is 4x4x1 blocks; this sizing assumes the format is BCn — the only
    // path that calls this.
    const u32 block_width = 4;
    const u32 block_height = 4;
    const u32 blocks_x = (info.size.width + block_width - 1) / block_width;
    const u32 blocks_y = (info.size.height + block_height - 1) / block_height;
    const u32 blocks_z = std::min(max_slices, info.size.depth);
    // 64-bit math: large 3D textures can exceed 32 bits of block bytes.
    const u64 block_count =
        static_cast<u64>(blocks_x) *
        static_cast<u64>(blocks_y) *
        static_cast<u64>(blocks_z);
    compute_unswizzle_buffer_size = block_count * block_bytes;
    VkBufferCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .pNext = nullptr,
        .flags = 0,
        .size = compute_unswizzle_buffer_size,
        // Written by the unswizzle compute shader, read by the buffer->image copy.
        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                 VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
        .queueFamilyIndexCount = 0,
        .pQueueFamilyIndices = nullptr,
    };
    compute_unswizzle_buffer =
        runtime->memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
    has_compute_unswizzle_buffer = true;
}
void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
// TODO: Move this to another API
@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
void TextureCacheRuntime::AccelerateImageUpload(
Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
if (IsPixelFormatASTC(image.info.format)) {
return astc_decoder_pass->Assemble(image, map, swizzles);
}
if (bl3d_unswizzle_pass &&
IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1) {
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
}
ASSERT(false);
}

View file

@ -51,7 +51,7 @@ public:
void Finish();
StagingBufferRef UploadStagingBuffer(size_t size);
StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
@ -91,7 +91,8 @@ public:
}
void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>);
std::span<const VideoCommon::SwizzleParameters>,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier() {}
@ -127,6 +128,11 @@ public:
BlitImageHelper& blit_image_helper;
RenderPassCache& render_pass_cache;
std::optional<ASTCDecoderPass> astc_decoder_pass;
std::optional<BlockLinearUnswizzle3DPass> bl3d_unswizzle_pass;
vk::Buffer swizzle_table_buffer;
VkDeviceSize swizzle_table_size = 0;
std::unique_ptr<MSAACopyPass> msaa_copy_pass;
const Settings::ResolutionScalingInfo& resolution;
std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
@ -164,6 +170,8 @@ public:
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void AllocateComputeUnswizzleImage();
[[nodiscard]] VkImage Handle() const noexcept {
return *(this->*current_image);
}
@ -189,6 +197,10 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
friend class BlockLinearUnswizzle3DPass;
private:
bool BlitScaleHelper(bool scale_up);
@ -200,6 +212,12 @@ private:
vk::Image original_image;
vk::Image scaled_image;
vk::Buffer compute_unswizzle_buffer;
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
void AllocateComputeUnswizzleBuffer(u32 max_slices);
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.
vk::Image Image::*current_image{};

View file

@ -8,6 +8,7 @@
#include <limits>
#include <optional>
#include <bit>
#include <unordered_set>
#include <boost/container/small_vector.hpp>
@ -22,6 +23,7 @@
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/util.h"
#include "video_core/textures/decoders.h"
namespace VideoCommon {
@ -68,10 +70,41 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
(std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
DEFAULT_CRITICAL_MEMORY));
minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
lowmemorydevice = false;
} else {
expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
minimum_memory = 0;
lowmemorydevice = true;
}
switch (Settings::values.gpu_unzwizzle_texture_size.GetValue()) {
case Settings::GpuUnswizzleSize::VerySmall: gpu_unswizzle_maxsize = 16_MiB; break;
case Settings::GpuUnswizzleSize::Small: gpu_unswizzle_maxsize = 32_MiB; break;
case Settings::GpuUnswizzleSize::Normal: gpu_unswizzle_maxsize = 128_MiB; break;
case Settings::GpuUnswizzleSize::Large: gpu_unswizzle_maxsize = 256_MiB; break;
case Settings::GpuUnswizzleSize::VeryLarge: gpu_unswizzle_maxsize = 512_MiB; break;
default: gpu_unswizzle_maxsize = 128_MiB; break;
}
switch (Settings::values.gpu_unzwizzle_stream_size.GetValue()) {
case Settings::GpuUnswizzle::VeryLow: swizzle_chunk_size = 4_MiB; break;
case Settings::GpuUnswizzle::Low: swizzle_chunk_size = 8_MiB; break;
case Settings::GpuUnswizzle::Normal: swizzle_chunk_size = 16_MiB; break;
case Settings::GpuUnswizzle::Medium: swizzle_chunk_size = 32_MiB; break;
case Settings::GpuUnswizzle::High: swizzle_chunk_size = 64_MiB; break;
default: swizzle_chunk_size = 16_MiB;
}
switch (Settings::values.gpu_unzwizzle_chunk_size.GetValue()) {
case Settings::GpuUnswizzleChunk::VeryLow: swizzle_slices_per_batch = 32; break;
case Settings::GpuUnswizzleChunk::Low: swizzle_slices_per_batch = 64; break;
case Settings::GpuUnswizzleChunk::Normal: swizzle_slices_per_batch = 128; break;
case Settings::GpuUnswizzleChunk::Medium: swizzle_slices_per_batch = 256; break;
case Settings::GpuUnswizzleChunk::High: swizzle_slices_per_batch = 512; break;
default: swizzle_slices_per_batch = 128;
}
}
@ -88,6 +121,7 @@ void TextureCache<P>::RunGarbageCollector() {
ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 50ULL;
num_iterations = aggressive_mode ? 40 : (high_priority_mode ? 20 : 10);
};
const auto Cleanup = [this, &num_iterations, &high_priority_mode,
&aggressive_mode](ImageId image_id) {
if (num_iterations == 0) {
@ -95,20 +129,36 @@ void TextureCache<P>::RunGarbageCollector() {
}
--num_iterations;
auto& image = slot_images[image_id];
// Never delete recently allocated sparse textures (within 3 frames)
const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3;
if (is_recently_allocated && image.info.is_sparse) {
return false;
}
if (True(image.flags & ImageFlagBits::IsDecoding)) {
// This image is still being decoded, deleting it will invalidate the slot
// used by the async decoder thread.
return false;
}
if (!aggressive_mode && True(image.flags & ImageFlagBits::CostlyLoad)) {
// Prioritize large sparse textures for cleanup
const bool is_large_sparse = lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB;
if (!aggressive_mode && !is_large_sparse &&
True(image.flags & ImageFlagBits::CostlyLoad)) {
return false;
}
const bool must_download =
image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
if (!high_priority_mode && must_download) {
if (!high_priority_mode && !is_large_sparse && must_download) {
return false;
}
if (must_download) {
if (must_download && !is_large_sparse) {
auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
const auto copies = FixSmallVectorADL(FullDownloadCopies(image.info));
image.DownloadMemory(map, copies);
@ -116,11 +166,13 @@ void TextureCache<P>::RunGarbageCollector() {
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span,
swizzle_data_buffer);
}
if (True(image.flags & ImageFlagBits::Tracked)) {
UntrackImage(image, image_id);
}
UnregisterImage(image_id);
DeleteImage(image_id, image.scale_tick > frame_tick + 5);
if (total_used_memory < critical_memory) {
if (aggressive_mode) {
// Sink the aggressiveness.
@ -136,7 +188,24 @@ void TextureCache<P>::RunGarbageCollector() {
return false;
};
// Try to remove anything old enough and not high priority.
// Aggressively clear massive sparse textures
if (total_used_memory >= expected_memory) {
lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) {
auto& image = slot_images[image_id];
// Only target sparse textures that are old enough
if (lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB &&
image.allocation_tick < frame_tick - 3) {
LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)",
image.gpu_addr, image.guest_size_bytes / (1024 * 1024),
frame_tick - image.allocation_tick);
return Cleanup(image_id);
}
return false;
});
}
Configure(false);
lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, Cleanup);
@ -160,6 +229,7 @@ void TextureCache<P>::TickFrame() {
sentenced_framebuffers.Tick();
sentenced_image_view.Tick();
TickAsyncDecode();
TickAsyncUnswizzle();
runtime.TickFrame();
++frame_tick;
@ -627,7 +697,6 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
UntrackImage(image, id);
}
}
if (True(image.flags & ImageFlagBits::Remapped)) {
continue;
}
@ -1055,7 +1124,12 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
// Only upload modified images
return;
}
image.flags &= ~ImageFlagBits::CpuModified;
if( lowmemorydevice && image.info.format == PixelFormat::BC1_RGBA_UNORM && MapSizeBytes(image) >= 256_MiB ) {
return;
}
TrackImage(image, image_id);
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
@ -1067,6 +1141,16 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
QueueAsyncDecode(image, image_id);
return;
}
if (IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1 &&
MapSizeBytes(image) >= gpu_unswizzle_maxsize &&
False(image.flags & ImageFlagBits::GpuModified)) {
QueueAsyncUnswizzle(image, image_id);
return;
}
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier();
@ -1082,7 +1166,7 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(),
VideoCommon::CacheType::NoTextureCache);
const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads));
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads), 0, 0);
return;
}
@ -1311,6 +1395,20 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
texture_decode_worker.QueueWork(std::move(func));
}
template <class P>
void TextureCache<P>::QueueAsyncUnswizzle(Image& image, ImageId image_id) {
// Ignore images that already have an async decode/unswizzle in flight; the
// IsDecoding flag doubles as the "queued" marker and is cleared by
// TickAsyncUnswizzle when the last batch has been submitted.
if (False(image.flags & ImageFlagBits::IsDecoding)) {
image.flags |= ImageFlagBits::IsDecoding;
unswizzle_queue.push_back(PendingUnswizzle{
.image_id = image_id,
.info = image.info,
});
}
}
template <class P>
void TextureCache<P>::TickAsyncDecode() {
bool has_uploads{};
@ -1336,6 +1434,83 @@ void TextureCache<P>::TickAsyncDecode() {
}
}
template <class P>
void TextureCache<P>::TickAsyncUnswizzle() {
// Per-frame pump for the GPU streaming-unswizzle queue: stages at most one
// chunk of guest data and at most one compute batch per call, so very large
// textures are uploaded incrementally instead of in one hitch.
if (unswizzle_queue.empty()) {
return;
}
// Cooldown counter set to 4 when an entry completes, spacing out entries.
if(current_unswizzle_frame > 0) {
current_unswizzle_frame--;
return;
}
PendingUnswizzle& task = unswizzle_queue.front();
Image& image = slot_images[task.image_id];
if (!task.initialized) {
// First tick for this entry: allocate the (deferred-free) staging buffer
// and derive the per-slice byte count from 4x4 compressed blocks — only
// BCn images are routed here (see the check in RefreshContents).
task.total_size = MapSizeBytes(image);
task.staging_buffer = runtime.UploadStagingBuffer(task.total_size, true);
const auto& info = image.info;
const u32 bytes_per_block = BytesPerBlock(info.format);
const u32 width_blocks = Common::DivCeil(info.size.width, 4u);
const u32 height_blocks = Common::DivCeil(info.size.height, 4u);
const u32 stride = width_blocks * bytes_per_block;
const u32 aligned_height = height_blocks;
task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height;
task.last_submitted_offset = 0;
task.initialized = true;
}
// Read data
if (task.current_offset < task.total_size) {
const size_t remaining = task.total_size - task.current_offset;
size_t copy_amount = std::min(swizzle_chunk_size, remaining);
if (remaining > swizzle_chunk_size) {
// Round the chunk down to whole slices so submissions stay slice-aligned;
// if one slice exceeds the chunk budget, read a full slice anyway.
copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
if (copy_amount == 0) copy_amount = task.bytes_per_slice;
}
gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
task.staging_buffer.mapped_span.data() + task.current_offset,
copy_amount);
task.current_offset += copy_amount;
}
const bool is_final_batch = task.current_offset >= task.total_size;
// NOTE(review): bytes_per_slice comes from the linear block layout while
// total_size is the mapped (swizzled) size; this assumes total_size covers a
// whole number of slices — a trailing partial slice would never satisfy
// all_slices_submitted below. Confirm for block-linear padded sizes.
const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
if (complete_slices >= swizzle_slices_per_batch || (is_final_batch && complete_slices > 0)) {
const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const u32 slices_to_process = std::min(complete_slices, swizzle_slices_per_batch);
// Clamp to the image depth; z_start cannot exceed depth because each
// submission advances by at most (depth - z_start) slices.
const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start);
if (z_count > 0) {
const auto uploads = FullUploadSwizzles(task.info);
runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads), z_start, z_count);
task.last_submitted_offset += (static_cast<size_t>(z_count) * task.bytes_per_slice);
}
}
// Check if complete
const u32 slices_submitted = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const bool all_slices_submitted = slices_submitted >= image.info.size.depth;
if (is_final_batch && all_slices_submitted) {
// Done: release the staging buffer, clear the queued/decoding marker and
// start the inter-entry cooldown.
runtime.FreeDeferredStagingBuffer(task.staging_buffer);
image.flags &= ~ImageFlagBits::IsDecoding;
unswizzle_queue.pop_front();
// Wait 4 frames to process the next entry
current_unswizzle_frame = 4u;
}
}
template <class P>
bool TextureCache<P>::ScaleUp(Image& image) {
const bool has_copy = image.HasScaled();
@ -1374,6 +1549,39 @@ ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
}
}
ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr);
// For large sparse textures, aggressively clean up old allocations at same address
if (lowmemorydevice && info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) {
const auto alloc_it = image_allocs_table.find(gpu_addr);
if (alloc_it != image_allocs_table.end()) {
const ImageAllocId alloc_id = alloc_it->second;
auto& alloc_images = slot_image_allocs[alloc_id].images;
// Collect old images at this address that were created more than 2 frames ago
boost::container::small_vector<ImageId, 4> to_delete;
for (ImageId old_image_id : alloc_images) {
Image& old_image = slot_images[old_image_id];
if (old_image.info.is_sparse &&
old_image.gpu_addr == gpu_addr &&
old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures
to_delete.push_back(old_image_id);
}
}
// Delete old images immediately
for (ImageId old_id : to_delete) {
Image& old_image = slot_images[old_id];
LOG_DEBUG(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)",
gpu_addr, old_image.guest_size_bytes / (1024 * 1024));
if (True(old_image.flags & ImageFlagBits::Tracked)) {
UntrackImage(old_image, old_id);
}
UnregisterImage(old_id);
DeleteImage(old_id, true);
}
}
}
const ImageId image_id = JoinImages(info, gpu_addr, *cpu_addr);
const Image& image = slot_images[image_id];
// Using "image.gpu_addr" instead of "gpu_addr" is important because it might be different
@ -1389,6 +1597,27 @@ template <class P>
ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DAddr cpu_addr) {
ImageInfo new_info = info;
const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
// Proactive cleanup for large sparse texture allocations
if (lowmemorydevice && new_info.is_sparse && size_bytes >= 256_MiB) {
const u64 estimated_alloc_size = size_bytes;
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. "
"Current memory: {} MiB, Critical: {} MiB",
size_bytes / (1024 * 1024),
total_used_memory / (1024 * 1024),
critical_memory / (1024 * 1024));
RunGarbageCollector();
// If still over threshold after GC, try one more aggressive pass
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Still critically low on memory, running second GC pass");
RunGarbageCollector();
}
}
}
const bool broken_views = runtime.HasBrokenTextureViewFormats();
const bool native_bgr = runtime.HasNativeBgr();
join_overlap_ids.clear();
@ -1485,6 +1714,8 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
Image& new_image = slot_images[new_image_id];
new_image.allocation_tick = frame_tick;
if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) &&
new_info.is_sparse) {
new_image.flags |= ImageFlagBits::Sparse;

View file

@ -129,6 +129,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
using AsyncBuffer = typename P::AsyncBuffer;
using BufferType = typename P::BufferType;
// State for one image being streamed through the GPU unswizzle path across
// several frames (filled lazily and consumed by TickAsyncUnswizzle).
struct PendingUnswizzle {
    ImageId image_id;
    VideoCommon::ImageInfo info;      // Snapshot of the image info at queue time
    size_t current_offset = 0;        // Guest bytes copied into staging so far
    size_t total_size = 0;            // Total guest bytes to stream
    AsyncBuffer staging_buffer;       // Deferred-free staging buffer
    size_t last_submitted_offset = 0; // Bytes already dispatched to compute
    size_t bytes_per_slice = 0;       // Bytes per depth slice; 0 until initialized
    bool initialized = false;         // Set on the first TickAsyncUnswizzle
};
struct BlitImages {
ImageId dst_id;
ImageId src_id;
@ -433,6 +444,9 @@ private:
void TrimInactiveSamplers(size_t budget);
std::optional<size_t> QuerySamplerBudget() const;
// Enqueues a large BCn 3D image for incremental GPU unswizzling; sets the
// IsDecoding flag as the queued marker.
void QueueAsyncUnswizzle(Image& image, ImageId image_id);
// Per-frame pump: stages one chunk of guest data and dispatches at most one
// unswizzle compute batch for the front queue entry.
void TickAsyncUnswizzle();
Runtime& runtime;
Tegra::MaxwellDeviceMemoryManager& device_memory;
@ -453,6 +467,10 @@ private:
u64 minimum_memory;
u64 expected_memory;
u64 critical_memory;
// Set by the constructor's memory heuristic; gates the aggressive
// sparse-texture ejection paths in the GC and InsertImage/JoinImages.
bool lowmemorydevice = false;
// Size threshold at or above which eligible images use the async GPU
// unswizzle path (despite the name, it is a lower bound — see RefreshContents).
size_t gpu_unswizzle_maxsize = 0;
// Guest bytes read per TickAsyncUnswizzle call (from settings).
size_t swizzle_chunk_size = 0;
// Depth slices dispatched per unswizzle compute batch (from settings).
u32 swizzle_slices_per_batch = 0;
struct BufferDownload {
GPUVAddr address;
@ -508,6 +526,9 @@ private:
Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"};
std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes;
// FIFO of images pending GPU streaming unswizzle.
std::deque<PendingUnswizzle> unswizzle_queue;
// Frames to wait before servicing the next queue entry. Read by
// TickAsyncUnswizzle before any entry ever completes, so it must be
// zero-initialized (it was previously left uninitialized, an
// indeterminate-value read that could stall the first entry).
u8 current_unswizzle_frame = 0;
// Join caching
boost::container::small_vector<ImageId, 4> join_overlap_ids;
std::unordered_set<ImageId> join_overlaps_found;