mirror of
https://git.eden-emu.dev/eden-emu/eden
synced 2026-04-21 05:58:59 +02:00
[video_core] Implement GPU-accelerated texture unswizzling and optimize sparse texture handling (#3246)
- [Added] a new compute shader to handle block-linear unswizzling on the GPU, reducing CPU overhead during texture uploads - [Implemented] BlockLinearUnswizzle3DPass to take advantage of the new compute shader, unimplemented for OpenGL - [Implemented] texture streaming and queue system for large sparse textures to prevent hitches - [Implemented] aggressive garbage collection system to eject large sparse textures to save on memory (Unused) - [Added] user settings to adjust the streaming unswizzle system for low-end machines - [Improved] slightly the ASTC GPU decoding system Co-authored-by: Caio Oliveira <caiooliveirafarias0@gmail.com> Co-authored-by: CamilleLaVey <camillelavey99@gmail.com> Co-authored-by: DraVee <dravee@eden-emu.dev> Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3246 Reviewed-by: Maufeat <sahyno1996@gmail.com> Reviewed-by: MaranBr <maranbr@eden-emu.dev> Reviewed-by: DraVee <dravee@eden-emu.dev> Reviewed-by: CamilleLaVey <camillelavey99@gmail.com> Co-authored-by: Forrest Keller <forrestmarkx@outlook.com> Co-committed-by: Forrest Keller <forrestmarkx@outlook.com>
This commit is contained in:
parent
f544004b5d
commit
ecd01e13fd
20 changed files with 1076 additions and 83 deletions
|
|
@ -24,6 +24,7 @@
|
|||
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
|
||||
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
|
||||
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
|
||||
#include "video_core/host_shaders/block_linear_unswizzle_3d_bcn_comp_spv.h"
|
||||
#include "video_core/renderer_vulkan/vk_compute_pass.h"
|
||||
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
|
||||
#include "video_core/renderer_vulkan/vk_scheduler.h"
|
||||
|
|
@ -622,7 +623,7 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
|
|||
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
|
||||
.pNext = nullptr,
|
||||
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
|
||||
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
|
||||
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
|
||||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
|
|
@ -637,9 +638,292 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
|
|||
},
|
||||
};
|
||||
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, image_barrier);
|
||||
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, image_barrier);
|
||||
});
|
||||
}
|
||||
|
||||
constexpr u32 BL3D_BINDING_SWIZZLE_TABLE = 0;
|
||||
constexpr u32 BL3D_BINDING_INPUT_BUFFER = 1;
|
||||
constexpr u32 BL3D_BINDING_OUTPUT_BUFFER = 2;
|
||||
|
||||
constexpr std::array<VkDescriptorSetLayoutBinding, 3> BL3D_DESCRIPTOR_SET_BINDINGS{{
|
||||
{
|
||||
.binding = BL3D_BINDING_SWIZZLE_TABLE,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // swizzle_table[]
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.pImmutableSamplers = nullptr,
|
||||
},
|
||||
{
|
||||
.binding = BL3D_BINDING_INPUT_BUFFER,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, // block-linear input
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.pImmutableSamplers = nullptr,
|
||||
},
|
||||
{
|
||||
.binding = BL3D_BINDING_OUTPUT_BUFFER,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.pImmutableSamplers = nullptr,
|
||||
},
|
||||
}};
|
||||
|
||||
constexpr DescriptorBankInfo BL3D_BANK_INFO{
|
||||
.uniform_buffers = 0,
|
||||
.storage_buffers = 3,
|
||||
.texture_buffers = 0,
|
||||
.image_buffers = 0,
|
||||
.textures = 0,
|
||||
.images = 0,
|
||||
.score = 3,
|
||||
};
|
||||
|
||||
constexpr std::array<VkDescriptorUpdateTemplateEntry, 3>
|
||||
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
|
||||
{
|
||||
.dstBinding = BL3D_BINDING_SWIZZLE_TABLE,
|
||||
.dstArrayElement = 0,
|
||||
.descriptorCount = 1,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.offset = BL3D_BINDING_SWIZZLE_TABLE * sizeof(DescriptorUpdateEntry),
|
||||
.stride = sizeof(DescriptorUpdateEntry),
|
||||
},
|
||||
{
|
||||
.dstBinding = BL3D_BINDING_INPUT_BUFFER,
|
||||
.dstArrayElement = 0,
|
||||
.descriptorCount = 1,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.offset = BL3D_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
|
||||
.stride = sizeof(DescriptorUpdateEntry),
|
||||
},
|
||||
{
|
||||
.dstBinding = BL3D_BINDING_OUTPUT_BUFFER,
|
||||
.dstArrayElement = 0,
|
||||
.descriptorCount = 1,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.offset = BL3D_BINDING_OUTPUT_BUFFER * sizeof(DescriptorUpdateEntry),
|
||||
.stride = sizeof(DescriptorUpdateEntry),
|
||||
}
|
||||
}};
|
||||
|
||||
struct alignas(16) BlockLinearUnswizzle3DPushConstants {
|
||||
u32 blocks_dim[3]; // Offset 0
|
||||
u32 bytes_per_block_log2; // Offset 12
|
||||
|
||||
u32 origin[3]; // Offset 16
|
||||
u32 slice_size; // Offset 28
|
||||
|
||||
u32 block_size; // Offset 32
|
||||
u32 x_shift; // Offset 36
|
||||
u32 block_height; // Offset 40
|
||||
u32 block_height_mask; // Offset 44
|
||||
|
||||
u32 block_depth; // Offset 48
|
||||
u32 block_depth_mask; // Offset 52
|
||||
s32 _pad; // Offset 56
|
||||
|
||||
s32 destination[3]; // Offset 60
|
||||
s32 _pad_end; // Offset 72
|
||||
};
|
||||
static_assert(sizeof(BlockLinearUnswizzle3DPushConstants) <= 128);
|
||||
|
||||
BlockLinearUnswizzle3DPass::BlockLinearUnswizzle3DPass(
|
||||
const Device& device_, Scheduler& scheduler_,
|
||||
DescriptorPool& descriptor_pool_,
|
||||
StagingBufferPool& staging_buffer_pool_,
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
|
||||
: ComputePass(
|
||||
device_, descriptor_pool_,
|
||||
BL3D_DESCRIPTOR_SET_BINDINGS,
|
||||
BL3D_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY,
|
||||
BL3D_BANK_INFO,
|
||||
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(BlockLinearUnswizzle3DPushConstants)>,
|
||||
BLOCK_LINEAR_UNSWIZZLE_3D_BCN_COMP_SPV),
|
||||
scheduler{scheduler_},
|
||||
staging_buffer_pool{staging_buffer_pool_},
|
||||
compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
|
||||
|
||||
BlockLinearUnswizzle3DPass::~BlockLinearUnswizzle3DPass() = default;
|
||||
|
||||
// God have mercy on my soul
|
||||
void BlockLinearUnswizzle3DPass::Unswizzle(
|
||||
Image& image,
|
||||
const StagingBufferRef& swizzled,
|
||||
std::span<const VideoCommon::SwizzleParameters> swizzles,
|
||||
u32 z_start, u32 z_count)
|
||||
{
|
||||
using namespace VideoCommon::Accelerated;
|
||||
|
||||
const u32 MAX_BATCH_SLICES = std::min(z_count, image.info.size.depth);
|
||||
|
||||
if (!image.has_compute_unswizzle_buffer) {
|
||||
// Allocate exactly what this batch needs
|
||||
image.AllocateComputeUnswizzleBuffer(MAX_BATCH_SLICES);
|
||||
}
|
||||
|
||||
ASSERT(swizzles.size() == 1);
|
||||
const auto& sw = swizzles[0];
|
||||
const auto params = MakeBlockLinearSwizzle3DParams(sw, image.info);
|
||||
|
||||
const u32 blocks_x = (image.info.size.width + 3) / 4;
|
||||
const u32 blocks_y = (image.info.size.height + 3) / 4;
|
||||
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
for (u32 z_offset = 0; z_offset < z_count; z_offset += MAX_BATCH_SLICES) {
|
||||
const u32 current_chunk_slices = std::min(MAX_BATCH_SLICES, z_count - z_offset);
|
||||
const u32 current_z_start = z_start + z_offset;
|
||||
|
||||
UnswizzleChunk(image, swizzled, sw, params, blocks_x, blocks_y,
|
||||
current_z_start, current_chunk_slices);
|
||||
}
|
||||
}
|
||||
|
||||
void BlockLinearUnswizzle3DPass::UnswizzleChunk(
|
||||
Image& image,
|
||||
const StagingBufferRef& swizzled,
|
||||
const VideoCommon::SwizzleParameters& sw,
|
||||
const BlockLinearSwizzle3DParams& params,
|
||||
u32 blocks_x, u32 blocks_y,
|
||||
u32 z_start, u32 z_count)
|
||||
{
|
||||
BlockLinearUnswizzle3DPushConstants pc{};
|
||||
pc.origin[0] = params.origin[0];
|
||||
pc.origin[1] = params.origin[1];
|
||||
pc.origin[2] = z_start; // Current chunk's Z start
|
||||
|
||||
pc.destination[0] = params.destination[0];
|
||||
pc.destination[1] = params.destination[1];
|
||||
pc.destination[2] = 0; // Shader writes to start of output buffer
|
||||
|
||||
pc.bytes_per_block_log2 = params.bytes_per_block_log2;
|
||||
pc.slice_size = params.slice_size;
|
||||
pc.block_size = params.block_size;
|
||||
pc.x_shift = params.x_shift;
|
||||
pc.block_height = params.block_height;
|
||||
pc.block_height_mask = params.block_height_mask;
|
||||
pc.block_depth = params.block_depth;
|
||||
pc.block_depth_mask = params.block_depth_mask;
|
||||
|
||||
pc.blocks_dim[0] = blocks_x;
|
||||
pc.blocks_dim[1] = blocks_y;
|
||||
pc.blocks_dim[2] = z_count; // Only process the count
|
||||
|
||||
compute_pass_descriptor_queue.Acquire();
|
||||
compute_pass_descriptor_queue.AddBuffer(*image.runtime->swizzle_table_buffer, 0,
|
||||
image.runtime->swizzle_table_size);
|
||||
compute_pass_descriptor_queue.AddBuffer(swizzled.buffer,
|
||||
sw.buffer_offset + swizzled.offset,
|
||||
image.guest_size_bytes - sw.buffer_offset);
|
||||
compute_pass_descriptor_queue.AddBuffer(*image.compute_unswizzle_buffer, 0,
|
||||
image.compute_unswizzle_buffer_size);
|
||||
|
||||
const void* descriptor_data = compute_pass_descriptor_queue.UpdateData();
|
||||
const VkDescriptorSet set = descriptor_allocator.Commit();
|
||||
|
||||
const u32 gx = Common::DivCeil(blocks_x, 8u);
|
||||
const u32 gy = Common::DivCeil(blocks_y, 8u);
|
||||
const u32 gz = Common::DivCeil(z_count, 4u);
|
||||
|
||||
const u32 bytes_per_block = 1u << pc.bytes_per_block_log2;
|
||||
const VkDeviceSize output_slice_size =
|
||||
static_cast<VkDeviceSize>(blocks_x) * blocks_y * bytes_per_block;
|
||||
const VkDeviceSize barrier_size = output_slice_size * z_count;
|
||||
|
||||
const bool is_first_chunk = (z_start == 0);
|
||||
|
||||
const VkBuffer out_buffer = *image.compute_unswizzle_buffer;
|
||||
const VkImage dst_image = image.Handle();
|
||||
const VkImageAspectFlags aspect = image.AspectMask();
|
||||
const u32 image_width = image.info.size.width;
|
||||
const u32 image_height = image.info.size.height;
|
||||
|
||||
scheduler.Record([this, set, descriptor_data, pc, gx, gy, gz, z_start, z_count,
|
||||
barrier_size, is_first_chunk, out_buffer, dst_image, aspect,
|
||||
image_width, image_height
|
||||
](vk::CommandBuffer cmdbuf) {
|
||||
|
||||
if (dst_image == VK_NULL_HANDLE || out_buffer == VK_NULL_HANDLE) {
|
||||
return;
|
||||
}
|
||||
|
||||
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
|
||||
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
|
||||
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
|
||||
cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(pc), &pc);
|
||||
cmdbuf.Dispatch(gx, gy, gz);
|
||||
|
||||
// Single barrier for compute -> transfer (buffer ready, image transition)
|
||||
const VkBufferMemoryBarrier buffer_barrier{
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
|
||||
.pNext = nullptr,
|
||||
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
|
||||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.buffer = out_buffer,
|
||||
.offset = 0,
|
||||
.size = barrier_size,
|
||||
};
|
||||
|
||||
// Image layout transition
|
||||
const VkImageMemoryBarrier pre_barrier{
|
||||
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
|
||||
.pNext = nullptr,
|
||||
.srcAccessMask = is_first_chunk ? VkAccessFlags{} :
|
||||
static_cast<VkAccessFlags>(VK_ACCESS_TRANSFER_WRITE_BIT),
|
||||
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
|
||||
.oldLayout = is_first_chunk ? VK_IMAGE_LAYOUT_UNDEFINED :
|
||||
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
|
||||
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
|
||||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.image = dst_image,
|
||||
.subresourceRange = {aspect, 0, 1, 0, 1},
|
||||
};
|
||||
|
||||
// Single barrier handles both buffer and image
|
||||
cmdbuf.PipelineBarrier(
|
||||
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
0,
|
||||
nullptr, buffer_barrier, pre_barrier
|
||||
);
|
||||
|
||||
// Copy chunk to correct Z position in image
|
||||
const VkBufferImageCopy copy{
|
||||
.bufferOffset = 0, // Read from start of staging buffer
|
||||
.bufferRowLength = 0,
|
||||
.bufferImageHeight = 0,
|
||||
.imageSubresource = {aspect, 0, 0, 1},
|
||||
.imageOffset = {0, 0, static_cast<s32>(z_start)}, // Write to correct Z
|
||||
.imageExtent = {image_width, image_height, z_count},
|
||||
};
|
||||
cmdbuf.CopyBufferToImage(out_buffer, dst_image,
|
||||
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
|
||||
|
||||
// Post-copy transition
|
||||
const VkImageMemoryBarrier post_barrier{
|
||||
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
|
||||
.pNext = nullptr,
|
||||
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
|
||||
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
|
||||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.image = dst_image,
|
||||
.subresourceRange = {aspect, 0, 1, 0, 1},
|
||||
};
|
||||
|
||||
cmdbuf.PipelineBarrier(
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
|
||||
0,
|
||||
nullptr, nullptr, post_barrier
|
||||
);
|
||||
});
|
||||
scheduler.Finish();
|
||||
}
|
||||
|
||||
MSAACopyPass::MSAACopyPass(const Device& device_, Scheduler& scheduler_,
|
||||
|
|
|
|||
|
|
@ -1,3 +1,6 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
|
|
@ -14,6 +17,7 @@
|
|||
#include "video_core/texture_cache/types.h"
|
||||
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
|
||||
#include "video_core/vulkan_common/vulkan_wrapper.h"
|
||||
#include "video_core/texture_cache/accelerated_swizzle.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
struct SwizzleParameters;
|
||||
|
|
@ -21,6 +25,8 @@ struct SwizzleParameters;
|
|||
|
||||
namespace Vulkan {
|
||||
|
||||
using VideoCommon::Accelerated::BlockLinearSwizzle3DParams;
|
||||
|
||||
class Device;
|
||||
class StagingBufferPool;
|
||||
class Scheduler;
|
||||
|
|
@ -131,6 +137,34 @@ private:
|
|||
MemoryAllocator& memory_allocator;
|
||||
};
|
||||
|
||||
class BlockLinearUnswizzle3DPass final : public ComputePass {
|
||||
public:
|
||||
explicit BlockLinearUnswizzle3DPass(const Device& device_, Scheduler& scheduler_,
|
||||
DescriptorPool& descriptor_pool_,
|
||||
StagingBufferPool& staging_buffer_pool_,
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
|
||||
~BlockLinearUnswizzle3DPass();
|
||||
|
||||
void Unswizzle(Image& image,
|
||||
const StagingBufferRef& swizzled,
|
||||
std::span<const VideoCommon::SwizzleParameters> swizzles,
|
||||
u32 z_start, u32 z_count);
|
||||
|
||||
void UnswizzleChunk(
|
||||
Image& image,
|
||||
const StagingBufferRef& swizzled,
|
||||
const VideoCommon::SwizzleParameters& sw,
|
||||
const BlockLinearSwizzle3DParams& params,
|
||||
u32 blocks_x, u32 blocks_y,
|
||||
u32 z_start, u32 z_count);
|
||||
|
||||
private:
|
||||
Scheduler& scheduler;
|
||||
StagingBufferPool& staging_buffer_pool;
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
|
||||
};
|
||||
|
||||
|
||||
class MSAACopyPass final : public ComputePass {
|
||||
public:
|
||||
explicit MSAACopyPass(const Device& device_, Scheduler& scheduler_,
|
||||
|
|
|
|||
|
|
@ -43,6 +43,16 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_)
|
|||
: device{device_}, state_tracker{state_tracker_},
|
||||
master_semaphore{std::make_unique<MasterSemaphore>(device)},
|
||||
command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} {
|
||||
|
||||
/*// PRE-OPTIMIZATION: Warm up the pool to prevent mid-frame spikes
|
||||
{
|
||||
std::scoped_lock rl{reserve_mutex};
|
||||
chunk_reserve.reserve(2048); // Prevent vector resizing
|
||||
for (int i = 0; i < 1024; ++i) {
|
||||
chunk_reserve.push_back(std::make_unique<CommandChunk>());
|
||||
}
|
||||
}*/
|
||||
|
||||
AcquireNewChunk();
|
||||
AllocateWorkerCommandBuffer();
|
||||
worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); });
|
||||
|
|
|
|||
|
|
@ -24,12 +24,14 @@
|
|||
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
|
||||
#include "video_core/renderer_vulkan/vk_scheduler.h"
|
||||
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
|
||||
#include "video_core/surface.h"
|
||||
#include "video_core/texture_cache/formatter.h"
|
||||
#include "video_core/texture_cache/samples_helper.h"
|
||||
#include "video_core/texture_cache/util.h"
|
||||
#include "video_core/vulkan_common/vulkan_device.h"
|
||||
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
|
||||
#include "video_core/vulkan_common/vulkan_wrapper.h"
|
||||
#include "video_core/textures/decoders.h"
|
||||
|
||||
namespace Vulkan {
|
||||
|
||||
|
|
@ -878,14 +880,51 @@ TextureCacheRuntime::TextureCacheRuntime(const Device& device_, Scheduler& sched
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
bl3d_unswizzle_pass.emplace(device, scheduler, descriptor_pool,
|
||||
staging_buffer_pool, compute_pass_descriptor_queue);
|
||||
|
||||
// --- Create swizzle table buffer ---
|
||||
{
|
||||
auto table = Tegra::Texture::MakeSwizzleTable();
|
||||
|
||||
swizzle_table_size = static_cast<VkDeviceSize>(table.size() * sizeof(table[0]));
|
||||
|
||||
auto staging = staging_buffer_pool.Request(swizzle_table_size, MemoryUsage::Upload);
|
||||
std::memcpy(staging.mapped_span.data(), table.data(), static_cast<size_t>(swizzle_table_size));
|
||||
|
||||
VkBufferCreateInfo ci{
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
|
||||
.size = swizzle_table_size,
|
||||
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
|
||||
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
|
||||
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
|
||||
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
|
||||
};
|
||||
swizzle_table_buffer = memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
|
||||
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
scheduler.Record([staging_buf = staging.buffer,
|
||||
dst_buf = *swizzle_table_buffer,
|
||||
size = swizzle_table_size,
|
||||
src_off = staging.offset](vk::CommandBuffer cmdbuf) {
|
||||
|
||||
const VkBufferCopy region{
|
||||
.srcOffset = src_off,
|
||||
.dstOffset = 0,
|
||||
.size = size,
|
||||
};
|
||||
cmdbuf.CopyBuffer(staging_buf, dst_buf, region);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void TextureCacheRuntime::Finish() {
|
||||
scheduler.Finish();
|
||||
}
|
||||
|
||||
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
|
||||
return staging_buffer_pool.Request(size, MemoryUsage::Upload);
|
||||
StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size, bool deferred) {
|
||||
return staging_buffer_pool.Request(size, MemoryUsage::Upload, deferred);
|
||||
}
|
||||
|
||||
StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) {
|
||||
|
|
@ -1581,6 +1620,46 @@ Image::Image(const VideoCommon::NullImageParams& params) : VideoCommon::ImageBas
|
|||
|
||||
Image::~Image() = default;
|
||||
|
||||
void Image::AllocateComputeUnswizzleBuffer(u32 max_slices) {
|
||||
if (has_compute_unswizzle_buffer)
|
||||
return;
|
||||
|
||||
using VideoCore::Surface::BytesPerBlock;
|
||||
|
||||
const u32 block_bytes = BytesPerBlock(info.format); // 8 for BC1, 16 for BC6H
|
||||
const u32 block_width = 4;
|
||||
const u32 block_height = 4;
|
||||
|
||||
// BCn is 4x4x1 blocks
|
||||
const u32 blocks_x = (info.size.width + block_width - 1) / block_width;
|
||||
const u32 blocks_y = (info.size.height + block_height - 1) / block_height;
|
||||
const u32 blocks_z = std::min(max_slices, info.size.depth);
|
||||
|
||||
const u64 block_count =
|
||||
static_cast<u64>(blocks_x) *
|
||||
static_cast<u64>(blocks_y) *
|
||||
static_cast<u64>(blocks_z);
|
||||
|
||||
compute_unswizzle_buffer_size = block_count * block_bytes;
|
||||
|
||||
VkBufferCreateInfo ci{
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
|
||||
.pNext = nullptr,
|
||||
.flags = 0,
|
||||
.size = compute_unswizzle_buffer_size,
|
||||
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
|
||||
VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
|
||||
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
|
||||
.queueFamilyIndexCount = 0,
|
||||
.pQueueFamilyIndices = nullptr,
|
||||
};
|
||||
|
||||
compute_unswizzle_buffer =
|
||||
runtime->memory_allocator.CreateBuffer(ci, MemoryUsage::DeviceLocal);
|
||||
|
||||
has_compute_unswizzle_buffer = true;
|
||||
}
|
||||
|
||||
void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset,
|
||||
std::span<const VideoCommon::BufferImageCopy> copies) {
|
||||
// TODO: Move this to another API
|
||||
|
|
@ -2397,10 +2476,22 @@ void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime,
|
|||
|
||||
void TextureCacheRuntime::AccelerateImageUpload(
|
||||
Image& image, const StagingBufferRef& map,
|
||||
std::span<const VideoCommon::SwizzleParameters> swizzles) {
|
||||
std::span<const VideoCommon::SwizzleParameters> swizzles,
|
||||
u32 z_start, u32 z_count) {
|
||||
|
||||
if (IsPixelFormatASTC(image.info.format)) {
|
||||
return astc_decoder_pass->Assemble(image, map, swizzles);
|
||||
}
|
||||
|
||||
if (bl3d_unswizzle_pass &&
|
||||
IsPixelFormatBCn(image.info.format) &&
|
||||
image.info.type == ImageType::e3D &&
|
||||
image.info.resources.levels == 1 &&
|
||||
image.info.resources.layers == 1) {
|
||||
|
||||
return bl3d_unswizzle_pass->Unswizzle(image, map, swizzles, z_start, z_count);
|
||||
}
|
||||
|
||||
ASSERT(false);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ public:
|
|||
|
||||
void Finish();
|
||||
|
||||
StagingBufferRef UploadStagingBuffer(size_t size);
|
||||
StagingBufferRef UploadStagingBuffer(size_t size, bool deferred = false);
|
||||
|
||||
StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false);
|
||||
|
||||
|
|
@ -91,7 +91,8 @@ public:
|
|||
}
|
||||
|
||||
void AccelerateImageUpload(Image&, const StagingBufferRef&,
|
||||
std::span<const VideoCommon::SwizzleParameters>);
|
||||
std::span<const VideoCommon::SwizzleParameters>,
|
||||
u32 z_start, u32 z_count);
|
||||
|
||||
void InsertUploadMemoryBarrier() {}
|
||||
|
||||
|
|
@ -127,6 +128,11 @@ public:
|
|||
BlitImageHelper& blit_image_helper;
|
||||
RenderPassCache& render_pass_cache;
|
||||
std::optional<ASTCDecoderPass> astc_decoder_pass;
|
||||
|
||||
std::optional<BlockLinearUnswizzle3DPass> bl3d_unswizzle_pass;
|
||||
vk::Buffer swizzle_table_buffer;
|
||||
VkDeviceSize swizzle_table_size = 0;
|
||||
|
||||
std::unique_ptr<MSAACopyPass> msaa_copy_pass;
|
||||
const Settings::ResolutionScalingInfo& resolution;
|
||||
std::array<std::vector<VkFormat>, VideoCore::Surface::MaxPixelFormat> view_formats;
|
||||
|
|
@ -164,6 +170,8 @@ public:
|
|||
void DownloadMemory(const StagingBufferRef& map,
|
||||
std::span<const VideoCommon::BufferImageCopy> copies);
|
||||
|
||||
void AllocateComputeUnswizzleImage();
|
||||
|
||||
[[nodiscard]] VkImage Handle() const noexcept {
|
||||
return *(this->*current_image);
|
||||
}
|
||||
|
|
@ -189,6 +197,10 @@ public:
|
|||
|
||||
bool ScaleDown(bool ignore = false);
|
||||
|
||||
u64 allocation_tick;
|
||||
|
||||
friend class BlockLinearUnswizzle3DPass;
|
||||
|
||||
private:
|
||||
bool BlitScaleHelper(bool scale_up);
|
||||
|
||||
|
|
@ -200,6 +212,12 @@ private:
|
|||
vk::Image original_image;
|
||||
vk::Image scaled_image;
|
||||
|
||||
vk::Buffer compute_unswizzle_buffer;
|
||||
VkDeviceSize compute_unswizzle_buffer_size = 0;
|
||||
bool has_compute_unswizzle_buffer = false;
|
||||
|
||||
void AllocateComputeUnswizzleBuffer(u32 max_slices);
|
||||
|
||||
// Use a pointer to field because it is relative, so that the object can be
|
||||
// moved without breaking the reference.
|
||||
vk::Image Image::*current_image{};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue