diff --git a/src/common/settings.h b/src/common/settings.h index e10f5105a1..08005bb7a4 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -591,6 +591,8 @@ struct Values { SwitchableSetting gpu_unswizzle_enabled{linkage, false, "gpu_unswizzle_enabled", Category::RendererHacks}; + SwitchableSetting legacy_descriptor_indices{linkage, true, "legacy_descriptor_indices", Category::RendererHacks}; /* when set, the texture pass and the SPIR-V image emitter keep their pre-patch descriptor indexing */ + SwitchableSetting dyna_state{linkage, #if defined(ANDROID) ExtendedDynamicState::Disabled, diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index c2511942d9..7745a73fb9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -14,6 +14,38 @@ namespace Shader::Backend::SPIRV { namespace { +class DescriptorIndex { /* Holds the SPIR-V id used to index a descriptor array (a constant for immediate indices, the defined value otherwise) and applies the NonUniform decoration when the index is dynamic and the profile reports non-uniform sampled-image array indexing. */ +public: + explicit DescriptorIndex(EmitContext& ctx, const IR::Value& index) + : id{index.IsImmediate() ? ctx.Const(index.U32()) : ctx.Def(index)}, + is_non_uniform{ctx.profile.support_sampled_image_array_nonuniform_indexing && + !index.IsImmediate()} { + if (!is_non_uniform) { /* uniform or unsupported: emit no extension, capability or decoration */ + return; + } + if (ctx.profile.supported_spirv < 0x00010500) { /* SPV_EXT_descriptor_indexing is core only from SPIR-V 1.5, so 1.4 and older modules must declare the extension */ + ctx.AddExtension("SPV_EXT_descriptor_indexing"); + } + ctx.AddCapability(spv::Capability::ShaderNonUniform); + ctx.AddCapability(spv::Capability::SampledImageArrayNonUniformIndexing); + Decorate(ctx, id); /* NOTE(review): the index id is decorated on every construction; confirm an index value shared by several texture operations does not receive NonUniform more than once */ + } + + Id Value() const { /* SPIR-V id to pass as the access-chain index */ + return id; + } + + void Decorate(EmitContext& ctx, Id object) const { /* decorates object with NonUniform; no-op when the index is not non-uniform */ + if (is_non_uniform) { + ctx.Decorate(object, spv::Decoration::NonUniform); + } + } + +private: + Id id; + bool is_non_uniform; +}; + class ImageOperands { public: [[maybe_unused]] static constexpr bool ImageSampleOffsetAllowed = false; @@ -189,8 +221,17 @@ private: Id Texture(EmitContext& ctx, IR::TextureInstInfo info, [[maybe_unused]] const IR::Value& index) { const TextureDefinition& def{ctx.textures.at(info.descriptor_index)}; if (def.count > 1) { - const 
Id pointer{ctx.OpAccessChain(def.pointer_type, def.id, ctx.Def(index))}; - return ctx.OpLoad(def.sampled_type, pointer); + if (Settings::values.legacy_descriptor_indices.GetValue()) { /* legacy path: same statements as the removed lines. NOTE(review): none of the visible hunks for this file adds an include for common/settings.h; confirm Settings is already visible here */ + const Id pointer{ctx.OpAccessChain(def.pointer_type, def.id, ctx.Def(index))}; + return ctx.OpLoad(def.sampled_type, pointer); + } else { /* new path: index goes through DescriptorIndex, which may tag results NonUniform */ + const DescriptorIndex idx{ctx, index}; + const Id pointer{ctx.OpAccessChain(def.pointer_type, def.id, idx.Value())}; + idx.Decorate(ctx, pointer); /* decorate the access chain ... */ + const Id object{ctx.OpLoad(def.sampled_type, pointer)}; + idx.Decorate(ctx, object); /* ... and the loaded sampled image */ + return object; + } } else { return ctx.OpLoad(def.sampled_type, def.id); } @@ -208,9 +249,20 @@ Id TextureImage(EmitContext& ctx, IR::TextureInstInfo info, const IR::Value& ind } else { const TextureDefinition& def{ctx.textures.at(info.descriptor_index)}; if (def.count > 1) { - const Id idx{index.IsImmediate() ? ctx.Const(index.U32()) : ctx.Def(index)}; - const Id ptr{ctx.OpAccessChain(def.pointer_type, def.id, idx)}; - return ctx.OpImage(def.image_type, ctx.OpLoad(def.sampled_type, ptr)); + if (Settings::values.legacy_descriptor_indices.GetValue()) { /* legacy path: same statements as the removed lines */ + const Id idx{index.IsImmediate() ? 
ctx.Const(index.U32()) : ctx.Def(index)}; + const Id ptr{ctx.OpAccessChain(def.pointer_type, def.id, idx)}; + return ctx.OpImage(def.image_type, ctx.OpLoad(def.sampled_type, ptr)); + } else { /* new path: the access chain, the loaded sampled image and the extracted image are each passed to DescriptorIndex::Decorate */ + const DescriptorIndex idx{ctx, index}; + const Id ptr{ctx.OpAccessChain(def.pointer_type, def.id, idx.Value())}; + idx.Decorate(ctx, ptr); + const Id object{ctx.OpLoad(def.sampled_type, ptr)}; + idx.Decorate(ctx, object); + const Id image{ctx.OpImage(def.image_type, object)}; + idx.Decorate(ctx, image); + return image; + } } return ctx.OpImage(def.image_type, ctx.OpLoad(def.sampled_type, def.id)); } diff --git a/src/shader_recompiler/host_translate_info.h b/src/shader_recompiler/host_translate_info.h index 7e7127c5b8..4a9261d5f5 100644 --- a/src/shader_recompiler/host_translate_info.h +++ b/src/shader_recompiler/host_translate_info.h @@ -20,6 +20,9 @@ struct HostTranslateInfo { bool support_snorm_render_buffer{}; ///< True when the device supports SNORM render buffers bool support_viewport_index_layer{}; ///< True when the device supports gl_Layer in VS u32 min_ssbo_alignment{}; ///< Minimum alignment supported by the device for SSBOs + u32 max_per_stage_descriptor_sampled_images{1024}; ///< Maximum sampled-image descriptors per shader stage + u32 max_per_stage_resources{4096}; ///< Maximum resources per shader stage + u32 max_descriptor_set_sampled_images{1024}; ///< Maximum sampled-image descriptors per descriptor set bool support_geometry_shader_passthrough{}; ///< True when the device supports geometry ///< passthrough shaders bool support_conditional_barrier{}; ///< True when the device supports barriers in conditional diff --git a/src/shader_recompiler/ir_opt/texture_pass.cpp b/src/shader_recompiler/ir_opt/texture_pass.cpp index 20b8591072..2366996292 100644 --- a/src/shader_recompiler/ir_opt/texture_pass.cpp +++ b/src/shader_recompiler/ir_opt/texture_pass.cpp @@ -12,6 +12,7 @@ #include #include +#include "common/settings.h" #include 
"shader_recompiler/frontend/ir/basic_block.h" #include "shader_recompiler/frontend/ir/breadth_first_search.h" @@ -32,6 +33,71 @@ using TextureInstVector = boost::container::small_vector; constexpr u32 DESCRIPTOR_SIZE = 8; constexpr u32 DESCRIPTOR_SIZE_SHIFT = static_cast(std::countr_zero(DESCRIPTOR_SIZE)); +constexpr u32 DYNAMIC_DESCRIPTOR_CBUF_BYTES = 16 * 1024; /* byte span assumed when sizing a dynamically indexed descriptor array; NOTE(review): confirm 16 KiB matches the guest constant-buffer size */ +constexpr u32 MAX_DYNAMIC_DESCRIPTOR_COUNT = 1024; /* upper bound on the element count of one dynamically indexed array */ + +u32 DynamicDescriptorSizeShift(const IR::U32& dynamic_offset) { /* Returns the immediate shift amount when dynamic_offset is produced by a ShiftLeftLogical32 whose shift is an immediate in [DESCRIPTOR_SIZE_SHIFT, 31); otherwise returns DESCRIPTOR_SIZE_SHIFT. */ + const IR::Inst* const inst{dynamic_offset.InstRecursive()}; + if (!inst || inst->GetOpcode() != IR::Opcode::ShiftLeftLogical32) { + return DESCRIPTOR_SIZE_SHIFT; + } + const IR::Value shift{inst->Arg(1)}; + if (!shift.IsImmediate()) { + return DESCRIPTOR_SIZE_SHIFT; + } + const u32 size_shift{shift.U32()}; + return size_shift >= DESCRIPTOR_SIZE_SHIFT && size_shift < 31 ? size_shift + : DESCRIPTOR_SIZE_SHIFT; +} + +u32 DynamicDescriptorCount(u32 base_offset, u32 size_shift) { /* Number of descriptors, spaced 1 << size_shift bytes apart, that fit between base_offset and DYNAMIC_DESCRIPTOR_CBUF_BYTES; the result is within [1, MAX_DYNAMIC_DESCRIPTOR_COUNT]. */ + if (size_shift >= 31 || base_offset >= DYNAMIC_DESCRIPTOR_CBUF_BYTES) { + return 1; + } + const u32 stride{1U << size_shift}; + const u32 available{DYNAMIC_DESCRIPTOR_CBUF_BYTES - base_offset}; + if (available < DESCRIPTOR_SIZE) { + return 1; + } + const u32 available_count{1U + (available - DESCRIPTOR_SIZE) / stride}; /* the last element needs only DESCRIPTOR_SIZE bytes, not a full stride */ + return std::min(MAX_DYNAMIC_DESCRIPTOR_COUNT, available_count); +} + +u32 SaturatingSub(u32 lhs, u32 rhs) { /* lhs - rhs, clamped to zero instead of wrapping */ + return lhs > rhs ? 
lhs - rhs : 0; +} + +template +u32 StaticDescriptorCount(const Descriptors& descriptors) { /* Sums desc.count over the entries whose count is at most one, so arrays are skipped. */ + u32 count{}; + for (const auto& desc : descriptors) { + if (desc.count <= 1) { + count += desc.count; + } + } + return count; +} + +u32 DynamicSampledTextureCap(const Info& info, const HostTranslateInfo& host_info, + u32 dynamic_arrays) { /* Per-array element cap: the sampled-image budget and the per-stage resource budget left after the descriptors already recorded in info, each divided evenly by dynamic_arrays; the result is within [1, MAX_DYNAMIC_DESCRIPTOR_COUNT]. */ + if (dynamic_arrays == 0) { /* nothing to split; also avoids dividing by zero below */ + return MAX_DYNAMIC_DESCRIPTOR_COUNT; + } + const u32 sampled_static_count{StaticDescriptorCount(info.texture_buffer_descriptors) + + StaticDescriptorCount(info.texture_descriptors)}; + const u32 resource_static_count{ + NumDescriptors(info.constant_buffer_descriptors) + + NumDescriptors(info.storage_buffers_descriptors) + sampled_static_count + + NumDescriptors(info.image_buffer_descriptors) + NumDescriptors(info.image_descriptors)}; + const u32 sampled_limit{std::min(host_info.max_per_stage_descriptor_sampled_images, + host_info.max_descriptor_set_sampled_images)}; /* the stricter of the per-stage and per-set limits */ + const u32 sampled_budget{SaturatingSub(sampled_limit, sampled_static_count)}; + const u32 resource_budget{SaturatingSub(host_info.max_per_stage_resources, + resource_static_count)}; + const u32 sampled_cap{sampled_budget / dynamic_arrays}; + const u32 resource_cap{resource_budget / dynamic_arrays}; + return std::max(1U, std::min({MAX_DYNAMIC_DESCRIPTOR_COUNT, sampled_cap, resource_cap})); /* never below one element, even when a budget is exhausted */ +} IR::Opcode IndexedInstruction(const IR::Inst& inst) { switch (inst.GetOpcode()) { @@ -109,6 +175,39 @@ IR::Opcode IndexedInstruction(const IR::Inst& inst) { } } +bool IsStorageImageOpcode(IR::Opcode opcode) { /* True for ImageRead, ImageWrite and the 32-bit image atomics listed in the switch. */ + switch (opcode) { + case IR::Opcode::ImageRead: + case IR::Opcode::ImageAtomicIAdd32: + case IR::Opcode::ImageAtomicSMin32: + case IR::Opcode::ImageAtomicUMin32: + case IR::Opcode::ImageAtomicSMax32: + case IR::Opcode::ImageAtomicUMax32: + case IR::Opcode::ImageAtomicInc32: + case IR::Opcode::ImageAtomicDec32: + case IR::Opcode::ImageAtomicAnd32: + case IR::Opcode::ImageAtomicOr32: + case IR::Opcode::ImageAtomicXor32: + case 
IR::Opcode::ImageAtomicExchange32: + case IR::Opcode::ImageWrite: + return true; + default: + return false; + } +} + +u32 DynamicSampledTextureArrayCount(const TextureInstVector& to_replace) { /* Counts the entries of to_replace whose constant-buffer range has more than one element, whose indexed opcode is not a storage-image opcode and whose type is not a buffer texture. NOTE(review): this counts instructions, so several instructions using the same array each add one; confirm that is intended. */ + u32 count{}; + for (const TextureInst& inst : to_replace) { + const auto flags{inst.inst->Flags()}; + if (inst.cbuf.count > 1 && !IsStorageImageOpcode(IndexedInstruction(*inst.inst)) && + flags.type != TextureType::Buffer) { + ++count; + } + } + return count; +} + bool IsBindless(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::BindlessImageSampleImplicitLod: @@ -354,6 +453,7 @@ std::optional TryGetConstBuffer(const IR::Inst* inst, Environme } else { return std::nullopt; } + const u32 size_shift{DynamicDescriptorSizeShift(dynamic_offset)}; /* element stride of the dynamic index, as a shift */ return ConstBufferAddr{ .index = index.U32(), .offset = base_offset, @@ -362,7 +462,7 @@ std::optional TryGetConstBuffer(const IR::Inst* inst, Environme .secondary_offset = 0, .secondary_shift_left = 0, .dynamic_offset = dynamic_offset, - .count = 8, + .count = Settings::values.legacy_descriptor_indices.GetValue() ? 8 : DynamicDescriptorCount(base_offset, size_shift), /* legacy mode keeps the previous fixed count of 8 */ .has_secondary = false, }; } @@ -589,6 +689,8 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo program.info.texture_descriptors, program.info.image_descriptors, }; + const u32 sampled_dynamic_cap{ + DynamicSampledTextureCap(program.info, host_info, DynamicSampledTextureArrayCount(to_replace))}; /* NOTE(review): evaluated before the loop below adds descriptors; confirm program.info already holds the descriptors the cap is meant to subtract */ for (TextureInst& texture_inst : to_replace) { // TODO: Handle arrays IR::Inst* const inst{texture_inst.inst}; @@ -632,6 +734,10 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo break; } u32 index; + u32 size_shift = cbuf.count > 1 ? 
DynamicDescriptorSizeShift(cbuf.dynamic_offset) : DESCRIPTOR_SIZE_SHIFT; + if (Settings::values.legacy_descriptor_indices.GetValue()) + size_shift = DESCRIPTOR_SIZE_SHIFT; /* legacy mode always uses the fixed descriptor stride */ + u32 count = cbuf.count; switch (inst->GetOpcode()) { case IR::Opcode::ImageRead: case IR::Opcode::ImageAtomicIAdd32: @@ -660,8 +766,8 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo .is_integer = is_integer, .cbuf_index = cbuf.index, .cbuf_offset = cbuf.offset, - .count = cbuf.count, - .size_shift = DESCRIPTOR_SIZE_SHIFT, + .count = count, + .size_shift = size_shift, }); } else { index = descriptors.Add(ImageDescriptor{ @@ -672,8 +778,8 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo .is_integer = is_integer, .cbuf_index = cbuf.index, .cbuf_offset = cbuf.offset, - .count = cbuf.count, - .size_shift = DESCRIPTOR_SIZE_SHIFT, + .count = count, + .size_shift = size_shift, }); } break; @@ -688,10 +794,11 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo .secondary_cbuf_index = cbuf.secondary_index, .secondary_cbuf_offset = cbuf.secondary_offset, .secondary_shift_left = cbuf.secondary_shift_left, - .count = cbuf.count, - .size_shift = DESCRIPTOR_SIZE_SHIFT, + .count = count, + .size_shift = size_shift, }); } else { + count = std::min(count, sampled_dynamic_cap); /* only this TextureDescriptor branch is capped; NOTE(review): the cap is applied in legacy mode too, so a legacy count of 8 can shrink; confirm */ index = descriptors.Add(TextureDescriptor{ .type = flags.type, .is_depth = flags.is_depth != 0, @@ -703,8 +810,8 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo .secondary_cbuf_index = cbuf.secondary_index, .secondary_cbuf_offset = cbuf.secondary_offset, .secondary_shift_left = cbuf.secondary_shift_left, - .count = cbuf.count, - .size_shift = DESCRIPTOR_SIZE_SHIFT, + .count = count, + .size_shift = size_shift, }); } break; @@ -712,12 +819,11 @@ void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo flags.descriptor_index.Assign(index); inst->SetFlags(flags); - if (cbuf.count > 
1) { + if (count > 1) { const auto insert_point{IR::Block::InstructionList::s_iterator_to(*inst)}; IR::IREmitter ir{*texture_inst.block, insert_point}; - const IR::U32 shift{ir.Imm32(DESCRIPTOR_SIZE_SHIFT)}; - inst->SetArg(0, ir.UMin(ir.ShiftRightLogical(cbuf.dynamic_offset, shift), - ir.Imm32(DESCRIPTOR_SIZE - 1))); + const IR::U32 shift{ir.Imm32(size_shift)}; + inst->SetArg(0, ir.UMin(ir.ShiftRightLogical(cbuf.dynamic_offset, shift), ir.Imm32(count - 1))); /* index = dynamic_offset >> size_shift, clamped to the last element of the (possibly capped) array */ } else { inst->SetArg(0, IR::Value{}); } diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 35fcad3843..ff19f0710f 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -48,6 +48,7 @@ struct Profile { bool support_scaled_attributes{}; bool support_multi_viewport{}; bool support_geometry_streams{}; + bool support_sampled_image_array_nonuniform_indexing{}; /* value-initialised to false; read by DescriptorIndex before it emits NonUniform */ bool warp_size_potentially_larger_than_guest{}; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 1a62324c95..490dd7acfe 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -127,9 +127,8 @@ void ComputePipeline::Configure(Tegra::Engines::KeplerCompute& kepler_compute, texture_cache.SynchronizeComputeDescriptors(); - static constexpr size_t max_elements = 64; - boost::container::static_vector views; - boost::container::static_vector samplers; + boost::container::small_vector views; + boost::container::small_vector samplers; /* small_vector replaces the removed static_vector, whose capacity was fixed at max_elements = 64 */ const auto& qmd{kepler_compute.launch_description}; const auto& cbufs{qmd.const_buffer_config}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index f927c69d48..5345bdc306 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -416,6 +416,8 @@ 
PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, .support_scaled_attributes = !device.MustEmulateScaledFormats(), .support_multi_viewport = device.SupportsMultiViewport(), .support_geometry_streams = device.AreTransformFeedbackGeometryStreamsSupported(), + .support_sampled_image_array_nonuniform_indexing = + device.IsSampledImageArrayNonUniformIndexingSupported(), /* forwards the device feature bit to the shader profile */ .warp_size_potentially_larger_than_guest = device.IsWarpSizePotentiallyBiggerThanGuest(), @@ -450,6 +452,9 @@ PipelineCache::PipelineCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, .support_snorm_render_buffer = true, .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(), .min_ssbo_alignment = static_cast(device.GetStorageBufferAlignment()), + .max_per_stage_descriptor_sampled_images = device.GetMaxPerStageDescriptorSampledImages(), + .max_per_stage_resources = device.GetMaxPerStageResources(), + .max_descriptor_set_sampled_images = device.GetMaxDescriptorSetSampledImages(), /* these three limits override the HostTranslateInfo defaults with values queried from the device */ .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(), .support_conditional_barrier = device.SupportsConditionalBarriers(), }; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index a8a89aee89..13fe1b371f 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -33,6 +33,7 @@ VK_DEFINE_HANDLE(VmaAllocator) FEATURE(KHR, VariablePointer, VARIABLE_POINTERS, variable_pointer) #define FOR_EACH_VK_FEATURE_1_2(FEATURE) \ + FEATURE(EXT, DescriptorIndexing, DESCRIPTOR_INDEXING, descriptor_indexing) \ FEATURE(EXT, HostQueryReset, HOST_QUERY_RESET, host_query_reset) \ FEATURE(KHR, 8BitStorage, 8BIT_STORAGE, bit8_storage) \ FEATURE(KHR, TimelineSemaphore, TIMELINE_SEMAPHORE, timeline_semaphore) @@ -335,6 +336,18 @@ public: return properties.properties.limits.maxDescriptorSetUniformBuffersDynamic; } + u32 
GetMaxPerStageDescriptorSampledImages() const { /* Returns the device limit maxPerStageDescriptorSampledImages. */ + return properties.properties.limits.maxPerStageDescriptorSampledImages; + } + + u32 GetMaxPerStageResources() const { /* Returns the device limit maxPerStageResources. */ + return properties.properties.limits.maxPerStageResources; + } + + u32 GetMaxDescriptorSetSampledImages() const { /* Returns the device limit maxDescriptorSetSampledImages. */ + return properties.properties.limits.maxDescriptorSetSampledImages; + } + /// Returns float control properties of the device. const VkPhysicalDeviceFloatControlsPropertiesKHR& FloatControlProperties() const { return properties.float_controls; @@ -355,6 +368,10 @@ public: return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY; } + bool IsSampledImageArrayNonUniformIndexingSupported() const { /* Returns the shaderSampledImageArrayNonUniformIndexing bit of the descriptor-indexing feature struct. */ + return features.descriptor_indexing.shaderSampledImageArrayNonUniformIndexing; + } + /// Returns true if the device supports float64 natively. bool IsFloat64Supported() const { return features.features.shaderFloat64;