[video_core] Implement GPU-accelerated texture unswizzling and optimize sparse texture handling (#3246)

- [Added] a new compute shader to handle block-linear unswizzling on the GPU, reducing CPU overhead during texture uploads
- [Implemented] BlockLinearUnswizzle3DPass to take advantage of the new compute shader; not implemented for OpenGL
- [Implemented] texture streaming and queue system for large sparse textures to prevent hitches
- [Implemented] aggressive garbage collection system to evict large sparse textures and save memory (currently unused)
- [Added] user settings to adjust the streaming unswizzle system for low-end machines
- [Improved] the ASTC GPU decoding system slightly

Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
Co-authored-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: DraVee <dravee@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3246
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: DraVee <dravee@eden-emu.dev>
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
Co-authored-by: Forrest Keller <forrestmarkx@outlook.com>
Co-committed-by: Forrest Keller <forrestmarkx@outlook.com>
This commit is contained in:
Forrest Keller 2026-01-13 19:18:08 +01:00 committed by crueter
parent f544004b5d
commit ecd01e13fd
No known key found for this signature in database
GPG key ID: 425ACD2D4830EBC6
20 changed files with 1076 additions and 83 deletions

View file

@ -18,6 +18,7 @@ set(SHADER_FILES
blit_color_float.frag
block_linear_unswizzle_2d.comp
block_linear_unswizzle_3d.comp
block_linear_unswizzle_3d_bcn.comp
convert_abgr8_srgb_to_d24s8.frag
convert_abgr8_to_d24s8.frag
convert_abgr8_to_d32f.frag

View file

@ -727,70 +727,35 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, ui
}
uint UnquantizeTexelWeight(EncodingData val) {
const uint encoding = Encoding(val);
const uint bitlen = NumBits(val);
const uint bitval = BitValue(val);
const uint A = ReplicateBitTo7((bitval & 1));
uint B = 0, C = 0, D = 0;
uint result = 0;
const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
switch (encoding) {
case JUST_BITS:
return FastReplicateTo6(bitval, bitlen);
case TRIT: {
uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val);
if (encoding == JUST_BITS) {
return (bitlen >= 1 && bitlen <= 5)
? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1)))
: FastReplicateTo6(bitval, bitlen);
} else if (encoding == TRIT || encoding == QUINT) {
uint B = 0, C = 0, D = 0;
uint b_mask = (0x3100 >> (bitlen * 4)) & 0xf;
uint b = (bitval >> 1) & b_mask;
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D * 2];
case 1: {
C = 50;
break;
if (encoding == TRIT) {
switch (bitlen) {
case 0: return D * 32; //0,32,64
case 1: C = 50; break;
case 2: C = 23; B = (b << 6) | (b << 2) | b; break;
case 3: C = 11; B = (b << 5) | b; break;
}
} else if (encoding == QUINT) {
switch (bitlen) {
case 0: return D * 16; //0, 16, 32, 48, 64
case 1: C = 28; break;
case 2: C = 13; B = (b << 6) | (b << 1); break;
}
}
case 2: {
C = 23;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 2) | b;
break;
}
case 3: {
C = 11;
const uint cb = (bitval >> 1) & 3;
B = (cb << 5) | cb;
break;
}
default:
break;
}
break;
uint A = ReplicateBitTo7(bitval & 1);
uint res = (A & 0x20) | (((D * C + B) ^ A) >> 2);
return res + (res > 32 ? 1 : 0);
}
case QUINT: {
D = QuintTritValue(val);
switch (bitlen) {
case 0:
return bitlen_0_results[D];
case 1: {
C = 28;
break;
}
case 2: {
C = 13;
const uint b = (bitval >> 1) & 1;
B = (b << 6) | (b << 1);
break;
}
}
break;
}
}
if (encoding != JUST_BITS && bitlen > 0) {
result = D * C + B;
result ^= A;
result = (A & 0x20) | (result >> 2);
}
if (result > 32) {
result += 1;
}
return result;
return 0;
}
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
@ -1159,10 +1124,11 @@ void DecompressBlock(ivec3 coord) {
}
uint SwizzleOffset(uvec2 pos) {
const uint x = pos.x;
const uint y = pos.y;
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
return ((pos.x & 32u) << 3u) |
((pos.y & 6u) << 5u) |
((pos.x & 16u) << 1u) |
((pos.y & 1u) << 4u) |
(pos.x & 15u);
}
void main() {

View file

@ -0,0 +1,160 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#version 430
#ifdef VULKAN
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 2
#else
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 0
#endif
// --- Push Constants / Uniforms ---
#ifdef VULKAN
layout(push_constant) uniform PushConstants {
uvec3 blocks_dim; // Offset 0
uint bytes_per_block_log2; // Offset 12
uvec3 origin; // Offset 16
uint slice_size; // Offset 28
uint block_size; // Offset 32
uint x_shift; // Offset 36
uint block_height; // Offset 40
uint block_height_mask; // Offset 44
uint block_depth; // Offset 48
uint block_depth_mask; // Offset 52
int _pad; // Offset 56
ivec3 destination; // Offset 60
} pc;
#else
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uvec3 origin;
UNIFORM(1) ivec3 destination;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint slice_size;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(8) uint block_depth;
UNIFORM(9) uint block_depth_mask;
UNIFORM(10) uvec3 blocks_dim;
END_PUSH_CONSTANTS
#define pc // Map pc prefix to nothing for OpenGL compatibility
#endif
// --- Buffers ---
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer {
uint out_u32[];
};
// --- Constants ---
layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
// --- Helpers ---
// Look up the intra-GOB byte offset for the texel at (x, y).
// Coordinates wrap at the GOB footprint (64x8 bytes) before indexing the LUT.
uint SwizzleOffset(uvec2 pos) {
    const uvec2 gob_pos = pos & SWIZZLE_MASK;
    return swizzle_table[(gob_pos.y << GOB_SIZE_X_SHIFT) | gob_pos.x];
}
// Fetch one block's worth of data from the block-linear input buffer.
// `offset` is a byte offset; the aliased storage-buffer views of
// BINDING_INPUT_BUFFER (u8/u16/u32/u64/u128) let us load at the block's
// natural width, selected by bytes_per_block_log2. Unused components of the
// returned vector are zero-filled.
uvec4 ReadTexel(uint offset) {
    uint bpl2 = pc.bytes_per_block_log2;
    switch (bpl2) {
#if HAS_EXTENDED_TYPES
    // Native 8-/16-bit loads when the storage extensions are available.
    case 0u: return uvec4(u8data[offset], 0u, 0u, 0u);
    case 1u: return uvec4(u16data[offset / 2u], 0u, 0u, 0u);
#else
    // Emulate sub-32-bit loads by extracting the byte/halfword from the
    // containing uint (bit position derived from the byte offset).
    case 0u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
    case 1u: return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
#endif
    case 2u: return uvec4(u32data[offset / 4u], 0u, 0u, 0u);
    case 3u: return uvec4(u64data[offset / 8u], 0u, 0u);
    case 4u: return u128data[offset / 16u];
    }
    // Unreachable for valid bytes_per_block_log2 (0..4).
    return uvec4(0u);
}
// Unswizzle one BCn block per invocation: read it from the block-linear
// (swizzled) source buffer and store it at its linear position in the
// output buffer.
void main() {
    uvec3 block_coord = gl_GlobalInvocationID;
    // The dispatch is rounded up to workgroup granularity; drop the
    // out-of-range threads.
    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
        return;
    }
    uint bytes_per_block = 1u << pc.bytes_per_block_log2;
    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats).
    // X is scaled to a byte coordinate because the GOB layout addresses
    // bytes horizontally.
    uvec3 pos;
    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
    pos.y = block_coord.y + (pc.origin.y >> 2u);
    pos.z = block_coord.z + pc.origin.z;
    uint swizzle = SwizzleOffset(pos.xy);
    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
    uint offset = 0u;
    // Apply block-linear offsets: whole depth slices, Z within a block,
    // GOB rows, Y within a block, X GOB column, then the intra-GOB swizzle.
    offset += (pos.z >> pc.block_depth) * pc.slice_size;
    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
    offset += (block_y >> pc.block_height) * pc.block_size;
    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
    offset += swizzle;
    uvec4 texel = ReadTexel(offset);
    // Calculate linear output index; each block occupies
    // bytes_per_block / 4 uints of the output buffer.
    uint block_index = block_coord.x +
                       (block_coord.y * pc.blocks_dim.x) +
                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
    uint out_idx = block_index * (bytes_per_block >> 2u);
    out_u32[out_idx] = texel.x;
    // Only blocks of 8 bytes or more carry a second uint; writing it for
    // smaller formats would overwrite the next block's output slot.
    if (pc.bytes_per_block_log2 >= 3u) {
        out_u32[out_idx + 1u] = texel.y;
    }
    if (pc.bytes_per_block_log2 == 4u) {
        out_u32[out_idx + 2u] = texel.z;
        out_u32[out_idx + 3u] = texel.w;
    }
}

View file

@ -556,7 +556,7 @@ void TextureCacheRuntime::Finish() {
glFinish();
}
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.RequestUploadBuffer(size);
}
@ -651,7 +651,8 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
}
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const SwizzleParameters> swizzles) {
std::span<const SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
switch (image.info.type) {
case ImageType::e2D:
if (IsPixelFormatASTC(image.info.format)) {

View file

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -72,7 +75,7 @@ public:
void Finish();
StagingBufferMap UploadStagingBuffer(size_t size);
StagingBufferMap UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferMap DownloadStagingBuffer(size_t size, bool deferred = false);
@ -116,7 +119,8 @@ public:
Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const StagingBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles);
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier();
@ -223,6 +227,8 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);

View file

@ -24,6 +24,7 @@
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
@ -622,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@ -637,9 +638,292 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier);
});
}
// Descriptor interface of the 3D block-linear (BCn) unswizzle compute pass:
// binding 0 = swizzle LUT, binding 1 = swizzled source, binding 2 = linear output.
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1;
constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2;
// Set layout: three storage buffers, all visible to the compute stage only.
constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{
    {
        .binding = BL3D_BINDING_SWIZZLE_TABLE,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[]
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
    {
        .binding = BL3D_BINDING_INPUT_BUFFER,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
    {
        .binding = BL3D_BINDING_OUTPUT_BUFFER,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // linear output
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
}};
// Pool sizing hint: each descriptor set of this pass consumes exactly three
// storage buffers and nothing else.
constexpr DescriptorBankInfo BL3D_BANK_INFO{
    .uniform_buffers = 0,
    .storage_buffers = 3,
    .texture_buffers = 0,
    .image_buffers = 0,
    .textures = 0,
    .images = 0,
    .score = 3,
};
// Update template matching the set layout above; each entry reads one
// DescriptorUpdateEntry from the packed data produced by the descriptor queue.
constexpr std::array<VkDescriptorUpdateTemplateEntry, 3>
    BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
        {
            .dstBinding = BL3D_BINDING_SWIZZLE_TABLE,
            .dstArrayElement = 0,
            .descriptorCount = 1,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        },
        {
            .dstBinding = BL3D_BINDING_INPUT_BUFFER,
            .dstArrayElement = 0,
            .descriptorCount = 1,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        },
        {
            .dstBinding = BL3D_BINDING_OUTPUT_BUFFER,
            .dstArrayElement = 0,
            .descriptorCount = 1,
            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        }
    }};
// CPU mirror of the push-constant block declared in
// block_linear_unswizzle_3d_bcn.comp. Field order and padding must track the
// shader's declared offsets.
// NOTE(review): under std430 rules the shader's `ivec3 destination` is
// 16-byte aligned (offset 64), not 60 as annotated here — harmless today
// because the shader never reads `destination`, but confirm before using it.
struct alignas(16) BlockLinearUnswizzle3DPushConstants {
    u32 blocks_dim[3];         // Offset 0
    u32 bytes_per_block_log2;  // Offset 12
    u32 origin[3];             // Offset 16
    u32 slice_size;            // Offset 28
    u32 block_size;            // Offset 32
    u32 x_shift;               // Offset 36
    u32 block_height;          // Offset 40
    u32 block_height_mask;     // Offset 44
    u32 block_depth;           // Offset 48
    u32 block_depth_mask;      // Offset 52
    s32 _pad;                  // Offset 56
    s32 destination[3];        // Offset 60
    s32 _pad_end;              // Offset 72
};
// Vulkan guarantees at least 128 bytes of push-constant space.
static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128);
// Builds the compute pass from the precompiled BCn unswizzle SPIR-V, using
// the three-storage-buffer descriptor layout declared above and a push
// constant range sized for BlockLinearUnswizzle3DPushConstants.
BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass(
    const Device& device_, Scheduler& scheduler_,
    DescriptorPool& descriptor_pool_,
    StagingBufferPool& staging_buffer_pool_,
    ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
    : ComputePass(
          device_, descriptor_pool_,
          BL3D_DESCRIPTOR_SET_BINDINGS,
          BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY,
          BL3D_BANK_INFO,
          COMPUTE_PUSH_CONSTANT_RANGE<sizeof(BlockLinearUnswizzle3DPushConstants)>,
          BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV),
      scheduler{scheduler_},
      staging_buffer_pool{staging_buffer_pool_},
      compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}

BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default;
// Unswizzles a block-linear 3D BCn image on the GPU in Z-slice batches and
// copies each batch into the destination image.
//
// `swizzled` holds the raw guest data; `swizzles` must describe exactly one
// mip level. Slices [z_start, z_start + z_count) are processed in chunks no
// larger than the image depth so the intermediate buffer stays bounded.
void BlockLinearUnswizzle3DPass::Unswizzle(
    Image& image,
    const StagingBufferRef& swizzled,
    std::span<const VideoCommon::SwizzleParameters> swizzles,
    u32 z_start, u32 z_count)
{
    using namespace VideoCommon::Accelerated;
    const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
    if (MAX_BATCH_SLICES == 0) {
        // Nothing to upload; also prevents a zero-step (infinite) batching
        // loop below when z_count or the image depth is zero.
        return;
    }
    if (!image.has_compute_unswizzle_buffer) {
        // Lazily allocate the intermediate linear buffer, sized for one batch.
        // NOTE(review): the buffer persists for the image's lifetime; a later
        // call with a larger batch would reuse the smaller buffer — confirm
        // callers never grow z_count after the first upload.
        image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
    }
    ASSERT(swizzles.size() == 1);
    const auto& sw = swizzles[0];
    const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
    // BCn encodes 4x4 texel blocks; round dimensions up to whole blocks.
    const u32 blocks_x = (image.info.size.width + 3) / 4;
    const u32 blocks_y = (image.info.size.height + 3) / 4;
    scheduler.RequestOutsideRenderPassOperationContext();
    for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
        const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset);
        const u32 current_z_start = z_start + z_offset;
        UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
                       current_z_start, current_chunk_slices);
    }
}
// Records one batched unswizzle: dispatches the compute shader over
// [z_start, z_start + z_count) slices, then copies the linear result from the
// intermediate buffer into the destination image at the matching Z offset.
// Ends with scheduler.Finish() so the shared intermediate buffer can be
// reused by the next chunk.
void BlockLinearUnswizzle3DPass::UnswizzleChunk(
    Image& image,
    const StagingBufferRef& swizzled,
    const VideoCommon::SwizzleParameters& sw,
    const BlockLinearSwizzle3DParams& params,
    u32 blocks_x, u32 blocks_y,
    u32 z_start, u32 z_count)
{
    // Mirror of the shader's push-constant block (see
    // block_linear_unswizzle_3d_bcn.comp).
    BlockLinearUnswizzle3DPushConstants pc{};
    pc.origin[0] = params.origin[0];
    pc.origin[1] = params.origin[1];
    pc.origin[2] = z_start; // Current chunk's Z start
    pc.destination[0] = params.destination[0];
    pc.destination[1] = params.destination[1];
    pc.destination[2] = 0; // Shader writes to start of output buffer
    pc.bytes_per_block_log2 = params.bytes_per_block_log2;
    pc.slice_size = params.slice_size;
    pc.block_size = params.block_size;
    pc.x_shift = params.x_shift;
    pc.block_height = params.block_height;
    pc.block_height_mask = params.block_height_mask;
    pc.block_depth = params.block_depth;
    pc.block_depth_mask = params.block_depth_mask;
    pc.blocks_dim[0] = blocks_x;
    pc.blocks_dim[1] = blocks_y;
    pc.blocks_dim[2] = z_count; // Only process the count
    // Bind: swizzle LUT, block-linear source (offset to this mip), linear
    // destination buffer.
    compute_pass_descriptor_queue.Acquire();
    compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0,
                                            image.runtime->swizzle_table_size);
    compute_pass_descriptor_queue.AddBuffer(swizzled.buffer,
                                            sw.buffer_offset + swizzled.offset,
                                            image.guest_size_bytes - sw.buffer_offset);
    compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0,
                                            image.compute_unswizzle_buffer_size);
    const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
    const VkDescriptorSet set = descriptor_allocator.Commit();
    // One invocation per BCn block; workgroup size is 8x8x4 (shader local_size).
    const u32 gx = Common::DivCeil(blocks_x, 8u);
    const u32 gy = Common::DivCeil(blocks_y, 8u);
    const u32 gz = Common::DivCeil(z_count, 4u);
    const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
    const VkDeviceSize output_slice_size =
        static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
    const VkDeviceSize barrier_size = output_slice_size * z_count;
    // The first chunk may discard prior image contents (UNDEFINED layout);
    // later chunks must preserve the slices already copied.
    const bool is_first_chunk = (z_start == 0);
    // Capture raw handles by value: the lambda runs later on the worker thread.
    const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
    const VkImage dst_image = image.Handle();
    const VkImageAspectFlags aspect = image.AspectMask();
    const u32 image_width = image.info.size.width;
    const u32 image_height = image.info.size.height;
    scheduler.Record([this, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
                      barrier_size, is_first_chunk, out_buffer, dst_image, aspect,
                      image_width, image_height
                      ](vk::CommandBuffer cmdbuf) {
        if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
            return;
        }
        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
        cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
        cmdbuf.Dispatch(gx, gy, gz);
        // Single barrier for compute -> transfer (buffer ready, image transition)
        const VkBufferMemoryBarrier buffer_barrier{
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .buffer = out_buffer,
            .offset = 0,
            .size = barrier_size,
        };
        // Image layout transition
        const VkImageMemoryBarrier pre_barrier{
            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = is_first_chunk ? VkAccessFlags{} :
                             static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
            .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
            .oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED :
                         VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .image = dst_image,
            // NOTE(review): mip 0 / layer 0 only — relies on callers gating
            // this pass to single-level, single-layer images.
            .subresourceRange = {aspect, 0, 1, 0, 1},
        };
        // Single barrier handles both buffer and image
        cmdbuf.PipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            VK_PIPELINE_STAGE_TRANSFER_BIT,
            0,
            nullptr, buffer_barrier, pre_barrier
        );
        // Copy chunk to correct Z position in image
        const VkBufferImageCopy copy{
            .bufferOffset = 0, // Read from start of staging buffer
            .bufferRowLength = 0,
            .bufferImageHeight = 0,
            .imageSubresource = {aspect, 0, 0, 1},
            .imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
            .imageExtent = {image_width, image_height, z_count},
        };
        cmdbuf.CopyBufferToImage(out_buffer, dst_image,
                                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
        // Post-copy transition
        const VkImageMemoryBarrier post_barrier{
            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
            .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            .newLayout = VK_IMAGE_LAYOUT_GENERAL,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .image = dst_image,
            .subresourceRange = {aspect, 0, 1, 0, 1},
        };
        cmdbuf.PipelineBarrier(
            VK_PIPELINE_STAGE_TRANSFER_BIT,
            VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            0,
            nullptr, nullptr, post_barrier
        );
    });
    // Wait for the GPU so the shared intermediate buffer can be reused by the
    // next chunk.
    // NOTE(review): a full Finish() per chunk serializes CPU and GPU; a fence
    // per chunk or a double-buffered intermediate would likely be cheaper.
    scheduler.Finish();
}
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View file

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@ -14,6 +17,7 @@
#include "video_core/texture_cache/types.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/texture_cache/accelerated_swizzle.h"
namespace VideoCommon {
struct SwizzleParameters;
@ -21,6 +25,8 @@ struct SwizzleParameters;
namespace Vulkan {
using VideoCommon::Accelerated::BlockLinearSwizzle3DParams;
class Device;
class StagingBufferPool;
class Scheduler;
@ -131,6 +137,34 @@ private:
MemoryAllocator& memory_allocator;
};
class BlockLinearUnswizzle3DPass final : public ComputePass {
public:
explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
~BlockLinearUnswizzle3DPass();
void Unswizzle(Image& image,
const StagingBufferRef& swizzled,
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count);
void UnswizzleChunk(
Image& image,
const StagingBufferRef& swizzled,
const VideoCommon::SwizzleParameters& sw,
const BlockLinearSwizzle3DParams& params,
u32 blocks_x, u32 blocks_y,
u32 z_start, u32 z_count);
private:
Scheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class MSAACopyPass final : public ComputePass {
public:
explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_,

View file

@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
: device{device_}, state_tracker{state_tracker_},
master_semaphore{std::make_unique<MasterSemaphore>(device)},
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
/*// PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes
{
std::scoped_lock rl{reserve_mutex};
chunk_reserve.reserve(2048); // Prevent vector resizing
for (int i = 0; i < 1024; ++i) {
chunk_reserve.push_back(std::make_unique<CommandChunk>());
}
}*/
AcquireNewChunk();
AllocateWorkerCommandBuffer();
worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });

View file

@ -24,12 +24,14 @@
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include "video_core/textures/decoders.h"
namespace Vulkan {
@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
}
}
}
bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool,
staging_buffer_pool, compute_pass_descriptor_queue);
// --- Create swizzle table buffer ---
{
auto table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_size = static_cast<VkDeviceSize>(table.size() * sizeof(table[0]));
auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload);
std::memcpy(staging.mapped_span.data(), table.data(), static_cast<size_t>(swizzle_table_size));
VkBufferCreateInfo ci{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.size = swizzle_table_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
};
swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([staging_buf = staging.buffer,
dst_buf = *swizzle_table_buffer,
size = swizzle_table_size,
src_off = staging.offset](vk::CommandBuffer cmdbuf) {
const VkBufferCopy region{
.srcOffset = src_off,
.dstOffset = 0,
.size = size,
};
cmdbuf.CopyBuffer(staging_buf, dst_buf, region);
});
}
}
// Blocks the CPU until the GPU has executed all work recorded so far.
void TextureCacheRuntime::Finish() {
    scheduler.Finish();
}
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload);
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred);
}
StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
Image::~Image() = default;
// Creates (once) the device-local intermediate buffer that receives the
// GPU-unswizzled linear BCn data before it is copied into the image.
// `max_slices` caps how many Z slices a single batch may hold; the buffer is
// sized for min(max_slices, depth) slices and kept for the image's lifetime.
void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
    if (has_compute_unswizzle_buffer)
        return;
    using VideoCore::Surface::BytesPerBlock;
    const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
    // BCn is 4x4x1 blocks; this sizing assumes the format is BCn — the only
    // path that calls this.
    const u32 block_width = 4;
    const u32 block_height = 4;
    const u32 blocks_x = (info.size.width + block_width - 1) / block_width;
    const u32 blocks_y = (info.size.height + block_height - 1) / block_height;
    const u32 blocks_z = std::min(max_slices, info.size.depth);
    // 64-bit math: large 3D textures can exceed 32 bits of block bytes.
    const u64 block_count =
        static_cast<u64>(blocks_x) *
        static_cast<u64>(blocks_y) *
        static_cast<u64>(blocks_z);
    compute_unswizzle_buffer_size = block_count * block_bytes;
    VkBufferCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .pNext = nullptr,
        .flags = 0,
        .size = compute_unswizzle_buffer_size,
        // Written by the unswizzle compute shader, read by the buffer->image copy.
        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                 VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
        .queueFamilyIndexCount = 0,
        .pQueueFamilyIndices = nullptr,
    };
    compute_unswizzle_buffer =
        runtime->memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
    has_compute_unswizzle_buffer = true;
}
void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
std::span<const VideoCommon::BufferImageCopy> copies) {
// TODO: Move this to another API
@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
void TextureCacheRuntime::AccelerateImageUpload(
Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
std::span<const VideoCommon::SwizzleParameters> swizzles,
u32 z_start, u32 z_count) {
if (IsPixelFormatASTC(image.info.format)) {
return astc_decoder_pass->Assemble(image, map, swizzles);
}
if (bl3d_unswizzle_pass &&
IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1) {
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
}
ASSERT(false);
}

View file

@ -51,7 +51,7 @@ public:
void Finish();
StagingBufferRef UploadStagingBuffer(size_t size);
StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false);
StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
@ -91,7 +91,8 @@ public:
}
void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>);
std::span<const VideoCommon::SwizzleParameters>,
u32 z_start, u32 z_count);
void InsertUploadMemoryBarrier() {}
@ -127,6 +128,11 @@ public:
BlitImageHelper& blit_image_helper;
RenderPassCache& render_pass_cache;
std::optional<ASTCDecoderPass> astc_decoder_pass;
std::optional<BlockLinearUnswizzle3DPass> bl3d_unswizzle_pass;
vk::Buffer swizzle_table_buffer;
VkDeviceSize swizzle_table_size = 0;
std::unique_ptr<MSAACopyPass> msaa_copy_pass;
const Settings::ResolutionScalingInfo& resolution;
std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
@ -164,6 +170,8 @@ public:
void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies);
void AllocateComputeUnswizzleImage();
[[nodiscard]] VkImage Handle() const noexcept {
return *(this->*current_image);
}
@ -189,6 +197,10 @@ public:
bool ScaleDown(bool ignore = false);
u64 allocation_tick;
friend class BlockLinearUnswizzle3DPass;
private:
bool BlitScaleHelper(bool scale_up);
@ -200,6 +212,12 @@ private:
vk::Image original_image;
vk::Image scaled_image;
vk::Buffer compute_unswizzle_buffer;
VkDeviceSize compute_unswizzle_buffer_size = 0;
bool has_compute_unswizzle_buffer = false;
void AllocateComputeUnswizzleBuffer(u32 max_slices);
// Use a pointer to field because it is relative, so that the object can be
// moved without breaking the reference.
vk::Image Image::*current_image{};

View file

@ -8,6 +8,7 @@
#include <limits>
#include <optional>
#include <bit>
#include <unordered_set>
#include <boost/container/small_vector.hpp>
@ -22,6 +23,7 @@
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/util.h"
#include "video_core/textures/decoders.h"
namespace VideoCommon {
@ -68,10 +70,41 @@ TextureCache<P>::TextureCache(Runtime& runtime_, Tegra::MaxwellDeviceMemoryManag
(std::max)((std::min)(device_local_memory - min_vacancy_critical, min_spacing_critical),
DEFAULT_CRITICAL_MEMORY));
minimum_memory = static_cast<u64>((device_local_memory - mem_threshold) / 2);
lowmemorydevice = false;
} else {
expected_memory = DEFAULT_EXPECTED_MEMORY + 512_MiB;
critical_memory = DEFAULT_CRITICAL_MEMORY + 1_GiB;
minimum_memory = 0;
lowmemorydevice = true;
}
switch (Settings::values.gpu_unzwizzle_texture_size.GetValue()) {
case Settings::GpuUnswizzleSize::VerySmall: gpu_unswizzle_maxsize = 16_MiB; break;
case Settings::GpuUnswizzleSize::Small: gpu_unswizzle_maxsize = 32_MiB; break;
case Settings::GpuUnswizzleSize::Normal: gpu_unswizzle_maxsize = 128_MiB; break;
case Settings::GpuUnswizzleSize::Large: gpu_unswizzle_maxsize = 256_MiB; break;
case Settings::GpuUnswizzleSize::VeryLarge: gpu_unswizzle_maxsize = 512_MiB; break;
default: gpu_unswizzle_maxsize = 128_MiB; break;
}
switch (Settings::values.gpu_unzwizzle_stream_size.GetValue()) {
case Settings::GpuUnswizzle::VeryLow: swizzle_chunk_size = 4_MiB; break;
case Settings::GpuUnswizzle::Low: swizzle_chunk_size = 8_MiB; break;
case Settings::GpuUnswizzle::Normal: swizzle_chunk_size = 16_MiB; break;
case Settings::GpuUnswizzle::Medium: swizzle_chunk_size = 32_MiB; break;
case Settings::GpuUnswizzle::High: swizzle_chunk_size = 64_MiB; break;
default: swizzle_chunk_size = 16_MiB;
}
switch (Settings::values.gpu_unzwizzle_chunk_size.GetValue()) {
case Settings::GpuUnswizzleChunk::VeryLow: swizzle_slices_per_batch = 32; break;
case Settings::GpuUnswizzleChunk::Low: swizzle_slices_per_batch = 64; break;
case Settings::GpuUnswizzleChunk::Normal: swizzle_slices_per_batch = 128; break;
case Settings::GpuUnswizzleChunk::Medium: swizzle_slices_per_batch = 256; break;
case Settings::GpuUnswizzleChunk::High: swizzle_slices_per_batch = 512; break;
default: swizzle_slices_per_batch = 128;
}
}
@ -88,6 +121,7 @@ void TextureCache<P>::RunGarbageCollector() {
ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 50ULL;
num_iterations = aggressive_mode ? 40 : (high_priority_mode ? 20 : 10);
};
const auto Cleanup = [this, &num_iterations, &high_priority_mode,
&aggressive_mode](ImageId image_id) {
if (num_iterations == 0) {
@ -95,20 +129,36 @@ void TextureCache<P>::RunGarbageCollector() {
}
--num_iterations;
auto& image = slot_images[image_id];
// Never delete recently allocated sparse textures (within 3 frames)
const bool is_recently_allocated = image.allocation_tick >= frame_tick - 3;
if (is_recently_allocated && image.info.is_sparse) {
return false;
}
if (True(image.flags & ImageFlagBits::IsDecoding)) {
// This image is still being decoded, deleting it will invalidate the slot
// used by the async decoder thread.
return false;
}
if (!aggressive_mode && True(image.flags & ImageFlagBits::CostlyLoad)) {
// Prioritize large sparse textures for cleanup
const bool is_large_sparse = lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB;
if (!aggressive_mode && !is_large_sparse &&
True(image.flags & ImageFlagBits::CostlyLoad)) {
return false;
}
const bool must_download =
image.IsSafeDownload() && False(image.flags & ImageFlagBits::BadOverlap);
if (!high_priority_mode && must_download) {
if (!high_priority_mode && !is_large_sparse && must_download) {
return false;
}
if (must_download) {
if (must_download && !is_large_sparse) {
auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
const auto copies = FixSmallVectorADL(FullDownloadCopies(image.info));
image.DownloadMemory(map, copies);
@ -116,11 +166,13 @@ void TextureCache<P>::RunGarbageCollector() {
SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span,
swizzle_data_buffer);
}
if (True(image.flags & ImageFlagBits::Tracked)) {
UntrackImage(image, image_id);
}
UnregisterImage(image_id);
DeleteImage(image_id, image.scale_tick > frame_tick + 5);
if (total_used_memory < critical_memory) {
if (aggressive_mode) {
// Sink the aggressiveness.
@ -136,7 +188,24 @@ void TextureCache<P>::RunGarbageCollector() {
return false;
};
// Try to remove anything old enough and not high priority.
// Aggressively clear massive sparse textures
if (total_used_memory >= expected_memory) {
lru_cache.ForEachItemBelow(frame_tick, [&](ImageId image_id) {
auto& image = slot_images[image_id];
// Only target sparse textures that are old enough
if (lowmemorydevice &&
image.info.is_sparse &&
image.guest_size_bytes >= 256_MiB &&
image.allocation_tick < frame_tick - 3) {
LOG_DEBUG(HW_GPU, "GC targeting old sparse texture at 0x{:X} ({} MiB, age: {} frames)",
image.gpu_addr, image.guest_size_bytes / (1024 * 1024),
frame_tick - image.allocation_tick);
return Cleanup(image_id);
}
return false;
});
}
Configure(false);
lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, Cleanup);
@ -160,6 +229,7 @@ void TextureCache<P>::TickFrame() {
sentenced_framebuffers.Tick();
sentenced_image_view.Tick();
TickAsyncDecode();
TickAsyncUnswizzle();
runtime.TickFrame();
++frame_tick;
@ -627,7 +697,6 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
UntrackImage(image, id);
}
}
if (True(image.flags & ImageFlagBits::Remapped)) {
continue;
}
@ -1055,7 +1124,12 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
// Only upload modified images
return;
}
image.flags &= ~ImageFlagBits::CpuModified;
if( lowmemorydevice && image.info.format == PixelFormat::BC1_RGBA_UNORM && MapSizeBytes(image) >= 256_MiB ) {
return;
}
TrackImage(image, image_id);
if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
@ -1067,6 +1141,16 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
QueueAsyncDecode(image, image_id);
return;
}
if (IsPixelFormatBCn(image.info.format) &&
image.info.type == ImageType::e3D &&
image.info.resources.levels == 1 &&
image.info.resources.layers == 1 &&
MapSizeBytes(image) >= gpu_unswizzle_maxsize &&
False(image.flags & ImageFlagBits::GpuModified)) {
QueueAsyncUnswizzle(image, image_id);
return;
}
auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier();
@ -1082,7 +1166,7 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
gpu_memory->ReadBlock(gpu_addr, mapped_span.data(), mapped_span.size_bytes(),
VideoCommon::CacheType::NoTextureCache);
const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads));
runtime.AccelerateImageUpload(image, staging, FixSmallVectorADL(uploads), 0, 0);
return;
}
@ -1311,6 +1395,20 @@ void TextureCache<P>::QueueAsyncDecode(Image& image, ImageId image_id) {
texture_decode_worker.QueueWork(std::move(func));
}
template <class P>
void TextureCache<P>::QueueAsyncUnswizzle(Image& image, ImageId image_id) {
// Ignore images that already have an async decode/unswizzle in flight; the
// IsDecoding flag doubles as the "queued" marker and is cleared by
// TickAsyncUnswizzle when the last batch has been submitted.
if (False(image.flags & ImageFlagBits::IsDecoding)) {
image.flags |= ImageFlagBits::IsDecoding;
unswizzle_queue.push_back(PendingUnswizzle{
.image_id = image_id,
.info = image.info,
});
}
}
template <class P>
void TextureCache<P>::TickAsyncDecode() {
bool has_uploads{};
@ -1336,6 +1434,83 @@ void TextureCache<P>::TickAsyncDecode() {
}
}
template <class P>
void TextureCache<P>::TickAsyncUnswizzle() {
// Per-frame pump for the GPU streaming-unswizzle queue: stages at most one
// chunk of guest data and at most one compute batch per call, so very large
// textures are uploaded incrementally instead of in one hitch.
if (unswizzle_queue.empty()) {
return;
}
// Cooldown counter set to 4 when an entry completes, spacing out entries.
if(current_unswizzle_frame > 0) {
current_unswizzle_frame--;
return;
}
PendingUnswizzle& task = unswizzle_queue.front();
Image& image = slot_images[task.image_id];
if (!task.initialized) {
// First tick for this entry: allocate the (deferred-free) staging buffer
// and derive the per-slice byte count from 4x4 compressed blocks — only
// BCn images are routed here (see the check in RefreshContents).
task.total_size = MapSizeBytes(image);
task.staging_buffer = runtime.UploadStagingBuffer(task.total_size, true);
const auto& info = image.info;
const u32 bytes_per_block = BytesPerBlock(info.format);
const u32 width_blocks = Common::DivCeil(info.size.width, 4u);
const u32 height_blocks = Common::DivCeil(info.size.height, 4u);
const u32 stride = width_blocks * bytes_per_block;
const u32 aligned_height = height_blocks;
task.bytes_per_slice = static_cast<size_t>(stride) * aligned_height;
task.last_submitted_offset = 0;
task.initialized = true;
}
// Read data
if (task.current_offset < task.total_size) {
const size_t remaining = task.total_size - task.current_offset;
size_t copy_amount = std::min(swizzle_chunk_size, remaining);
if (remaining > swizzle_chunk_size) {
// Round the chunk down to whole slices so submissions stay slice-aligned;
// if one slice exceeds the chunk budget, read a full slice anyway.
copy_amount = (copy_amount / task.bytes_per_slice) * task.bytes_per_slice;
if (copy_amount == 0) copy_amount = task.bytes_per_slice;
}
gpu_memory->ReadBlock(image.gpu_addr + task.current_offset,
task.staging_buffer.mapped_span.data() + task.current_offset,
copy_amount);
task.current_offset += copy_amount;
}
const bool is_final_batch = task.current_offset >= task.total_size;
// NOTE(review): bytes_per_slice comes from the linear block layout while
// total_size is the mapped (swizzled) size; this assumes total_size covers a
// whole number of slices — a trailing partial slice would never satisfy
// all_slices_submitted below. Confirm for block-linear padded sizes.
const size_t bytes_ready = task.current_offset - task.last_submitted_offset;
const u32 complete_slices = static_cast<u32>(bytes_ready / task.bytes_per_slice);
if (complete_slices >= swizzle_slices_per_batch || (is_final_batch && complete_slices > 0)) {
const u32 z_start = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const u32 slices_to_process = std::min(complete_slices, swizzle_slices_per_batch);
// Clamp to the image depth; z_start cannot exceed depth because each
// submission advances by at most (depth - z_start) slices.
const u32 z_count = std::min(slices_to_process, image.info.size.depth - z_start);
if (z_count > 0) {
const auto uploads = FullUploadSwizzles(task.info);
runtime.AccelerateImageUpload(image, task.staging_buffer, FixSmallVectorADL(uploads), z_start, z_count);
task.last_submitted_offset += (static_cast<size_t>(z_count) * task.bytes_per_slice);
}
}
// Check if complete
const u32 slices_submitted = static_cast<u32>(task.last_submitted_offset / task.bytes_per_slice);
const bool all_slices_submitted = slices_submitted >= image.info.size.depth;
if (is_final_batch && all_slices_submitted) {
// Done: release the staging buffer, clear the queued/decoding marker and
// start the inter-entry cooldown.
runtime.FreeDeferredStagingBuffer(task.staging_buffer);
image.flags &= ~ImageFlagBits::IsDecoding;
unswizzle_queue.pop_front();
// Wait 4 frames to process the next entry
current_unswizzle_frame = 4u;
}
}
template <class P>
bool TextureCache<P>::ScaleUp(Image& image) {
const bool has_copy = image.HasScaled();
@ -1374,6 +1549,39 @@ ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
}
}
ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr);
// For large sparse textures, aggressively clean up old allocations at same address
if (lowmemorydevice && info.is_sparse && CalculateGuestSizeInBytes(info) >= 256_MiB) {
const auto alloc_it = image_allocs_table.find(gpu_addr);
if (alloc_it != image_allocs_table.end()) {
const ImageAllocId alloc_id = alloc_it->second;
auto& alloc_images = slot_image_allocs[alloc_id].images;
// Collect old images at this address that were created more than 2 frames ago
boost::container::small_vector<ImageId, 4> to_delete;
for (ImageId old_image_id : alloc_images) {
Image& old_image = slot_images[old_image_id];
if (old_image.info.is_sparse &&
old_image.gpu_addr == gpu_addr &&
old_image.allocation_tick < frame_tick - 2) { // Try not to delete fresh textures
to_delete.push_back(old_image_id);
}
}
// Delete old images immediately
for (ImageId old_id : to_delete) {
Image& old_image = slot_images[old_id];
LOG_DEBUG(HW_GPU, "Immediately deleting old sparse texture at 0x{:X} ({} MiB)",
gpu_addr, old_image.guest_size_bytes / (1024 * 1024));
if (True(old_image.flags & ImageFlagBits::Tracked)) {
UntrackImage(old_image, old_id);
}
UnregisterImage(old_id);
DeleteImage(old_id, true);
}
}
}
const ImageId image_id = JoinImages(info, gpu_addr, *cpu_addr);
const Image& image = slot_images[image_id];
// Using "image.gpu_addr" instead of "gpu_addr" is important because it might be different
@ -1389,6 +1597,27 @@ template <class P>
ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DAddr cpu_addr) {
ImageInfo new_info = info;
const size_t size_bytes = CalculateGuestSizeInBytes(new_info);
// Proactive cleanup for large sparse texture allocations
if (lowmemorydevice && new_info.is_sparse && size_bytes >= 256_MiB) {
const u64 estimated_alloc_size = size_bytes;
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Large sparse texture allocation ({} MiB) - running aggressive GC. "
"Current memory: {} MiB, Critical: {} MiB",
size_bytes / (1024 * 1024),
total_used_memory / (1024 * 1024),
critical_memory / (1024 * 1024));
RunGarbageCollector();
// If still over threshold after GC, try one more aggressive pass
if (total_used_memory + estimated_alloc_size >= critical_memory) {
LOG_DEBUG(HW_GPU, "Still critically low on memory, running second GC pass");
RunGarbageCollector();
}
}
}
const bool broken_views = runtime.HasBrokenTextureViewFormats();
const bool native_bgr = runtime.HasNativeBgr();
join_overlap_ids.clear();
@ -1485,6 +1714,8 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, DA
const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr);
Image& new_image = slot_images[new_image_id];
new_image.allocation_tick = frame_tick;
if (!gpu_memory->IsContinuousRange(new_image.gpu_addr, new_image.guest_size_bytes) &&
new_info.is_sparse) {
new_image.flags |= ImageFlagBits::Sparse;

View file

@ -129,6 +129,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
using AsyncBuffer = typename P::AsyncBuffer;
using BufferType = typename P::BufferType;
// State for one image being streamed through the GPU unswizzle path across
// several frames (filled lazily and consumed by TickAsyncUnswizzle).
struct PendingUnswizzle {
    ImageId image_id;
    VideoCommon::ImageInfo info;      // Snapshot of the image info at queue time
    size_t current_offset = 0;        // Guest bytes copied into staging so far
    size_t total_size = 0;            // Total guest bytes to stream
    AsyncBuffer staging_buffer;       // Deferred-free staging buffer
    size_t last_submitted_offset = 0; // Bytes already dispatched to compute
    size_t bytes_per_slice = 0;       // Bytes per depth slice; 0 until initialized
    bool initialized = false;         // Set on the first TickAsyncUnswizzle
};
struct BlitImages {
ImageId dst_id;
ImageId src_id;
@ -433,6 +444,9 @@ private:
void TrimInactiveSamplers(size_t budget);
std::optional<size_t> QuerySamplerBudget() const;
// Enqueues a large BCn 3D image for incremental GPU unswizzling; sets the
// IsDecoding flag as the queued marker.
void QueueAsyncUnswizzle(Image& image, ImageId image_id);
// Per-frame pump: stages one chunk of guest data and dispatches at most one
// unswizzle compute batch for the front queue entry.
void TickAsyncUnswizzle();
Runtime& runtime;
Tegra::MaxwellDeviceMemoryManager& device_memory;
@ -453,6 +467,10 @@ private:
u64 minimum_memory;
u64 expected_memory;
u64 critical_memory;
// Set by the constructor's memory heuristic; gates the aggressive
// sparse-texture ejection paths in the GC and InsertImage/JoinImages.
bool lowmemorydevice = false;
// Size threshold at or above which eligible images use the async GPU
// unswizzle path (despite the name, it is a lower bound — see RefreshContents).
size_t gpu_unswizzle_maxsize = 0;
// Guest bytes read per TickAsyncUnswizzle call (from settings).
size_t swizzle_chunk_size = 0;
// Depth slices dispatched per unswizzle compute batch (from settings).
u32 swizzle_slices_per_batch = 0;
struct BufferDownload {
GPUVAddr address;
@ -508,6 +526,9 @@ private:
Common::ThreadWorker texture_decode_worker{1, "TextureDecoder"};
std::vector<std::unique_ptr<AsyncDecodeContext>> async_decodes;
// FIFO of images pending GPU streaming unswizzle.
std::deque<PendingUnswizzle> unswizzle_queue;
// Frames to wait before servicing the next queue entry. Read by
// TickAsyncUnswizzle before any entry ever completes, so it must be
// zero-initialized (it was previously left uninitialized, an
// indeterminate-value read that could stall the first entry).
u8 current_unswizzle_frame = 0;
// Join caching
boost::container::small_vector<ImageId, 4> join_overlap_ids;
std::unordered_set<ImageId> join_overlaps_found;