mirror of
https://git.eden-emu.dev/eden-emu/eden
synced 2026-04-24 03:19:00 +02:00
[video_core] Implement GPU-accelerated texture unswizzling and optimize sparse texture handling (#3246)
- [Added] a new compute shader to handle block-linear unswizzling on the GPU, reducing CPU overhead during texture uploads - [Implemented] BlockLinearUnswizzle3DPass to take advantage of the new compute shader, unimplemented for OpenGL - [Implemented] texture streaming and queue system for large sparse textures to prevent hitches - [Implemented] aggressive garbage collection system to eject large sparse textures to save on memory (Unused) - [Added] user settings to adjust the streaming unswizzle system for low-end machines - [Improved] slightly the ASTC GPU decoding system Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com> Co-authored-by: CamilleLaVey <camillelavey99@gmail.com> Co-authored-by: DraVee <dravee@eden-emu.dev> Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3246 Reviewed-by: Maufeat <sahyno1996@gmail.com> Reviewed-by: MaranBr <maranbr@eden-emu.dev> Reviewed-by: DraVee <dravee@eden-emu.dev> Reviewed-by: CamilleLaVey <camillelavey99@gmail.com> Co-authored-by: Forrest Keller <forrestmarkx@outlook.com> Co-committed-by: Forrest Keller <forrestmarkx@outlook.com>
This commit is contained in:
parent
f544004b5d
commit
ecd01e13fd
20 changed files with 1076 additions and 83 deletions
|
|
@ -18,6 +18,7 @@ set(SHADER_FILES
|
|||
blit_color_float.frag
|
||||
block_linear_unswizzle_2d.comp
|
||||
block_linear_unswizzle_3d.comp
|
||||
block_linear_unswizzle_3d_bcn.comp
|
||||
convert_abgr8_srgb_to_d24s8.frag
|
||||
convert_abgr8_to_d24s8.frag
|
||||
convert_abgr8_to_d32f.frag
|
||||
|
|
|
|||
|
|
@ -727,70 +727,35 @@ void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode, ui
|
|||
}
|
||||
|
||||
// NOTE(review): this span is taken from a diff hunk whose header reads "-727,70 +727,35" and
// which is rendered without +/- markers. Statements of two versions of the function therefore
// appear interleaved (e.g. `encoding`, `bitlen`, `bitval`, `A`, `B`, `C`, `D` are each declared
// twice), and the text is not compilable as shown. One version dispatches on `encoding` with a
// switch and accumulates into `result`; the other uses an if/else chain and returns `res`.
// Judging by the name, both turn one decoded weight entry into an unquantized texel weight.
uint UnquantizeTexelWeight(EncodingData val) {
|
||||
const uint encoding = Encoding(val);
|
||||
const uint bitlen = NumBits(val);
|
||||
const uint bitval = BitValue(val);
|
||||
const uint A = ReplicateBitTo7((bitval & 1));
|
||||
uint B = 0, C = 0, D = 0;
|
||||
uint result = 0;
|
||||
const uint bitlen_0_results[5] = {0, 16, 32, 48, 64};
|
||||
switch (encoding) {
|
||||
case JUST_BITS:
|
||||
return FastReplicateTo6(bitval, bitlen);
|
||||
case TRIT: {
|
||||
uint encoding = Encoding(val), bitlen = NumBits(val), bitval = BitValue(val);
|
||||
if (encoding == JUST_BITS) {
|
||||
// For 1..5 bits the value is rescaled to 0..64 with round-to-nearest instead of calling
// FastReplicateTo6. NOTE(review): for bitlen = 4, bitval = 8 this expression evaluates to 34;
// confirm that this matches what the format's weight unquantization rule requires.
return (bitlen >= 1 && bitlen <= 5)
|
||||
? uint(floor(0.5f + float(bitval) * 64.0f / float((1 << bitlen) - 1)))
|
||||
: FastReplicateTo6(bitval, bitlen);
|
||||
} else if (encoding == TRIT || encoding == QUINT) {
|
||||
uint B = 0, C = 0, D = 0;
|
||||
// b_mask evaluates to 0, 0, 1, 3 for bitlen 0, 1, 2, 3, so `b` holds the bits of bitval above
// bit 0 that the B terms below consume.
uint b_mask = (0x3100 >> (bitlen * 4)) & 0xf;
|
||||
uint b = (bitval >> 1) & b_mask;
|
||||
D = QuintTritValue(val);
|
||||
switch (bitlen) {
|
||||
case 0:
|
||||
return bitlen_0_results[D * 2];
|
||||
case 1: {
|
||||
C = 50;
|
||||
break;
|
||||
if (encoding == TRIT) {
|
||||
switch (bitlen) {
|
||||
case 0: return D * 32; //0,32,64
|
||||
case 1: C = 50; break;
|
||||
case 2: C = 23; B = (b << 6) | (b << 2) | b; break;
|
||||
case 3: C = 11; B = (b << 5) | b; break;
|
||||
}
|
||||
} else if (encoding == QUINT) {
|
||||
switch (bitlen) {
|
||||
case 0: return D * 16; //0, 16, 32, 48, 64
|
||||
case 1: C = 28; break;
|
||||
case 2: C = 13; B = (b << 6) | (b << 1); break;
|
||||
}
|
||||
}
|
||||
case 2: {
|
||||
C = 23;
|
||||
const uint b = (bitval >> 1) & 1;
|
||||
B = (b << 6) | (b << 2) | b;
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
C = 11;
|
||||
const uint cb = (bitval >> 1) & 3;
|
||||
B = (cb << 5) | cb;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
// Same combination as the three `result` statements further down, folded into one expression;
// values above 32 are then bumped by one.
uint A = ReplicateBitTo7(bitval & 1);
|
||||
uint res = (A & 0x20) | (((D * C + B) ^ A) >> 2);
|
||||
return res + (res > 32 ? 1 : 0);
|
||||
}
|
||||
case QUINT: {
|
||||
D = QuintTritValue(val);
|
||||
switch (bitlen) {
|
||||
case 0:
|
||||
return bitlen_0_results[D];
|
||||
case 1: {
|
||||
C = 28;
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
C = 13;
|
||||
const uint b = (bitval >> 1) & 1;
|
||||
B = (b << 6) | (b << 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (encoding != JUST_BITS && bitlen > 0) {
|
||||
result = D * C + B;
|
||||
result ^= A;
|
||||
result = (A & 0x20) | (result >> 2);
|
||||
}
|
||||
if (result > 32) {
|
||||
result += 1;
|
||||
}
|
||||
return result;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
|
||||
|
|
@ -1159,10 +1124,11 @@ void DecompressBlock(ivec3 coord) {
|
|||
}
|
||||
|
||||
// Maps a 2D position to an offset built from the low 6 bits of pos.x and the low 3 bits of pos.y.
// NOTE(review): this span comes from a diff hunk rendered without +/- markers, so it appears to
// show both the removed body (locals x/y with modulo and divide) and the added body (masks and
// shifts) back to back. The two return expressions produce the same value:
//   x bit 5 -> bit 8, y bits 1..2 -> bits 6..7, x bit 4 -> bit 5, y bit 0 -> bit 4,
//   x bits 0..3 -> bits 0..3.
uint SwizzleOffset(uvec2 pos) {
|
||||
const uint x = pos.x;
|
||||
const uint y = pos.y;
|
||||
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 +
|
||||
((x % 32) / 16) * 32 + (y % 2) * 16 + (x % 16);
|
||||
return ((pos.x & 32u) << 3u) |
|
||||
((pos.y & 6u) << 5u) |
|
||||
((pos.x & 16u) << 1u) |
|
||||
((pos.y & 1u) << 4u) |
|
||||
(pos.x & 15u);
|
||||
}
|
||||
|
||||
void main() {
|
||||
|
|
|
|||
160
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
Normal file
160
src/video_core/host_shaders/block_linear_unswizzle_3d_bcn.comp
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#version 430

// Backend configuration. When VULKAN is defined the Vulkan settings are used; otherwise the
// shader is configured for OpenGL.
//   HAS_EXTENDED_TYPES     1 when 8/16-bit buffer element types may be declared.
//   BEGIN/END_PUSH_CONSTANTS, UNIFORM(n)
//                          wrappers for declaring parameters as a push-constant block
//                          (Vulkan) or as explicitly located uniforms (OpenGL).
//   BINDING_*_BUFFER       storage-buffer binding indices used by the buffer declarations.
#ifdef VULKAN

#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_OUTPUT_BUFFER 2

#else

// Small integer types are only available on OpenGL when the NV extension is present.
#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
// The output is a storage buffer, exactly like the swizzle table, so both live in the same
// binding namespace. It previously used binding 0 here, which made the write-only output block
// alias the read-only swizzle table; use a distinct index, matching the Vulkan branch.
#define BINDING_OUTPUT_BUFFER 2

#endif
|
||||
|
||||
// --- Push Constants / Uniforms ---
|
||||
// Dispatch parameters, addressed everywhere in this shader as `pc.<member>`.
#ifdef VULKAN

// Push-constant blocks are laid out with std430 rules: a 3-component vector is aligned to
// 16 bytes but occupies 12, so a following scalar can fill the remaining 4 bytes.
layout(push_constant) uniform PushConstants {
    uvec3 blocks_dim;          // Offset 0
    uint bytes_per_block_log2; // Offset 12

    uvec3 origin;              // Offset 16
    uint slice_size;           // Offset 28

    uint block_size;           // Offset 32
    uint x_shift;              // Offset 36
    uint block_height;         // Offset 40
    uint block_height_mask;    // Offset 44

    uint block_depth;          // Offset 48
    uint block_depth_mask;     // Offset 52
    int _pad;                  // Offset 56

    // Offset 60 is implicit padding: ivec3 needs 16-byte alignment, so this member starts at 64,
    // not 60. NOTE(review): confirm the host-side structure uses the same offset.
    ivec3 destination;         // Offset 64
} pc;

#else

// OpenGL has no push constants. The parameters are grouped in a structure-typed uniform named
// `pc` so that `pc.<member>` is valid here too (defining `pc` as an empty macro would turn
// `pc.member` into `.member`, which does not parse). Members of a located structure uniform
// receive consecutive locations, so the order below keeps every parameter at the location it
// was given before.
struct PushConstants {
    uvec3 origin;              // location 0
    ivec3 destination;         // location 1
    uint bytes_per_block_log2; // location 2
    uint slice_size;           // location 3
    uint block_size;           // location 4
    uint x_shift;              // location 5
    uint block_height;         // location 6
    uint block_height_mask;    // location 7
    uint block_depth;          // location 8
    uint block_depth_mask;     // location 9
    uvec3 blocks_dim;          // location 10
};
layout(location = 0) uniform PushConstants pc;

#endif
|
||||
|
||||
// --- Buffers ---
|
||||
// Lookup table indexed by SwizzleOffset() with (y * 64 + x).
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { uint swizzle_table[]; };

// The input binding is declared several times, once per element width, so that ReadTexel() can
// fetch a whole block with a single access of the matching size.
#if HAS_EXTENDED_TYPES
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 {
    uint8_t u8data[];
};
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 {
    uint16_t u16data[];
};
#endif
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 {
    uint u32data[];
};
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 {
    uvec2 u64data[];
};
layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 {
    uvec4 u128data[];
};

// Destination of main(), written as 32-bit words.
layout(binding = BINDING_OUTPUT_BUFFER, std430) writeonly buffer OutputBuffer { uint out_u32[]; };
|
||||
|
||||
// --- Constants ---
|
||||
// 8 x 8 x 4 invocations per workgroup; main() handles one block per invocation.
layout(local_size_x = 8, local_size_y = 8, local_size_z = 4) in;

// GOB dimensions, expressed first as log2 values and then as the sizes derived from them.
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;

const uint GOB_SIZE_X = 1u << GOB_SIZE_X_SHIFT; // 64
const uint GOB_SIZE_Y = 1u << GOB_SIZE_Y_SHIFT; // 8
const uint GOB_SIZE_Z = 1u << GOB_SIZE_Z_SHIFT; // 1
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;

// Keeps only the in-GOB part of an (x, y) position: (63, 7).
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1u, GOB_SIZE_Y - 1u);
|
||||
|
||||
// --- Helpers ---
|
||||
// Looks up the swizzled offset for a position inside a GOB.
// Only the low 6 bits of pos.x and the low 3 bits of pos.y are used; the table is read
// row-major with 64 entries per row.
uint SwizzleOffset(uvec2 pos) {
    uvec2 in_gob = pos & SWIZZLE_MASK;
    uint table_index = in_gob.x + in_gob.y * 64u;
    return swizzle_table[table_index];
}
|
||||
|
||||
// Fetches one block of 1 << pc.bytes_per_block_log2 bytes located `offset` bytes into the input
// buffer. The data is returned in the low components of the result and the remaining components
// are zero. A size outside 1..16 bytes yields an all-zero vector.
uvec4 ReadTexel(uint offset) {
    uint size_log2 = pc.bytes_per_block_log2;

    if (size_log2 == 0u) {
#if HAS_EXTENDED_TYPES
        return uvec4(u8data[offset], 0u, 0u, 0u);
#else
        // No byte-sized type available: pick the byte out of its containing 32-bit word.
        return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 24u), 8), 0u, 0u, 0u);
#endif
    }
    if (size_log2 == 1u) {
#if HAS_EXTENDED_TYPES
        return uvec4(u16data[offset / 2u], 0u, 0u, 0u);
#else
        // No 16-bit type available: pick the half-word out of its containing 32-bit word.
        return uvec4(bitfieldExtract(u32data[offset / 4u], int((offset * 8u) & 16u), 16), 0u, 0u, 0u);
#endif
    }
    if (size_log2 == 2u) {
        return uvec4(u32data[offset / 4u], 0u, 0u, 0u);
    }
    if (size_log2 == 3u) {
        return uvec4(u64data[offset / 8u], 0u, 0u);
    }
    if (size_log2 == 4u) {
        return u128data[offset / 16u];
    }
    return uvec4(0u);
}
|
||||
|
||||
// Copies one block per invocation from its block-linear location in the input buffer to a
// tightly packed, x-major position in the output buffer.
void main() {
    uvec3 block_coord = gl_GlobalInvocationID;
    // Workgroups may overhang the region; surplus invocations have nothing to do.
    if (any(greaterThanEqual(block_coord, pc.blocks_dim))) {
        return;
    }

    uint bytes_per_block = 1u << pc.bytes_per_block_log2;

    // Origin is in pixels, divide by 4 for block-space (e.g. BCn formats).
    // pos.x is scaled to bytes; pos.y and pos.z stay in rows and slices.
    uvec3 pos;
    pos.x = (block_coord.x + (pc.origin.x >> 2u)) * bytes_per_block;
    pos.y = block_coord.y + (pc.origin.y >> 2u);
    pos.z = block_coord.z + pc.origin.z;

    uint swizzle = SwizzleOffset(pos.xy);
    uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;

    // Apply block-linear offsets, from the coarsest unit down to the position inside the GOB.
    uint offset = 0u;
    offset += (pos.z >> pc.block_depth) * pc.slice_size;
    offset += (pos.z & pc.block_depth_mask) << (GOB_SIZE_SHIFT + pc.block_height);
    offset += (block_y >> pc.block_height) * pc.block_size;
    offset += (block_y & pc.block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << pc.x_shift;
    offset += swizzle;

    uvec4 texel = ReadTexel(offset);

    // Calculate linear output index, in 32-bit words.
    uint block_index = block_coord.x +
                       (block_coord.y * pc.blocks_dim.x) +
                       (block_coord.z * pc.blocks_dim.x * pc.blocks_dim.y);
    uint out_idx = block_index * (bytes_per_block >> 2u);

    out_u32[out_idx] = texel.x;
    // Only blocks of 8 bytes or more own a second word. Writing it unconditionally would let a
    // 4-byte block overwrite the first word of the next block with zero.
    // NOTE(review): blocks smaller than 4 bytes give a word stride of 0 and are not handled by
    // this output scheme.
    if (pc.bytes_per_block_log2 >= 3u) {
        out_u32[out_idx + 1u] = texel.y;
    }
    if (pc.bytes_per_block_log2 == 4u) {
        out_u32[out_idx + 2u] = texel.z;
        out_u32[out_idx + 3u] = texel.w;
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue