diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 3844a8e2f9..d6d44e66b4 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -122,7 +122,35 @@ void DmaPusher::ProcessCommands(std::span commands) { dma_state.is_last_call = true; index += max_write; } else if (dma_state.method_count) { - auto const command_header = commands[index]; //can copy + if (!dma_state.non_incrementing && !dma_increment_once && + dma_state.method >= non_puller_methods) { + auto subchannel = subchannels[dma_state.subchannel]; + const u32 available = u32(std::min( + index + dma_state.method_count, commands.size()) - index); + u32 batch = 0; + u32 method = dma_state.method; + while (batch < available) { + const bool needs_exec = + (method < Engines::EngineInterface::EXECUTION_MASK_TABLE_SIZE) + ? subchannel->execution_mask[method] + : subchannel->execution_mask_default; + if (needs_exec) break; + batch++; + method++; + } + if (batch > 0) { + auto& sink = subchannel->method_sink; + sink.reserve(sink.size() + batch); + for (u32 j = 0; j < batch; j++) { + sink.emplace_back(dma_state.method + j, commands[index + j].argument); + } + dma_state.method += batch; + dma_state.method_count -= batch; + index += batch; + continue; + } + } + auto const command_header = commands[index]; dma_state.dma_word_offset = u32(index * sizeof(u32)); dma_state.is_last_call = dma_state.method_count <= 1; CallMethod(command_header.argument); @@ -181,7 +209,11 @@ void DmaPusher::CallMethod(u32 argument) const { }); } else { auto subchannel = subchannels[dma_state.subchannel]; - if (!subchannel->execution_mask[dma_state.method]) { + const bool needs_execution = + (dma_state.method < Engines::EngineInterface::EXECUTION_MASK_TABLE_SIZE) + ? 
subchannel->execution_mask[dma_state.method] + : subchannel->execution_mask_default; + if (!needs_execution) { subchannel->method_sink.emplace_back(dma_state.method, argument); } else { subchannel->ConsumeSink(); diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h index bf3bd66aca..292f0a5738 100644 --- a/src/video_core/engines/engine_interface.h +++ b/src/video_core/engines/engine_interface.h @@ -6,9 +6,9 @@ #pragma once -#include -#include -#include +#include + +#include #include "common/common_types.h" @@ -41,8 +41,11 @@ public: ConsumeSinkImpl(); } - std::bitset<(std::numeric_limits::max)()> execution_mask{}; - std::vector> method_sink{}; + static constexpr size_t EXECUTION_MASK_TABLE_SIZE = 0xE00; + + std::array execution_mask{}; + bool execution_mask_default{}; + boost::container::small_vector, 64> method_sink{}; bool current_dirty{}; GPUVAddr current_dma_segment; diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index b442c5cc76..11f60ef32b 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -26,7 +26,7 @@ Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager regs.src.depth = 1; regs.dst.depth = 1; - execution_mask.reset(); + execution_mask.fill(0); execution_mask[FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1] = true; } diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 7b4efeb1e0..d6ee80f6e2 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -18,7 +18,7 @@ namespace Tegra::Engines { KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_) : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} { - execution_mask.reset(); + execution_mask.fill(0); execution_mask[KEPLER_COMPUTE_REG_INDEX(exec_upload)] = true; 
execution_mask[KEPLER_COMPUTE_REG_INDEX(data_upload)] = true; execution_mask[KEPLER_COMPUTE_REG_INDEX(launch)] = true; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 5d4c4720d3..013a644c1b 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -22,7 +22,7 @@ KeplerMemory::~KeplerMemory() = default; void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { upload_state.BindRasterizer(rasterizer_); - execution_mask.reset(); + execution_mask.fill(0); execution_mask[KEPLERMEMORY_REG_INDEX(exec)] = true; execution_mask[KEPLERMEMORY_REG_INDEX(data)] = true; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 6d9ebd6296..88869917fd 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -4,8 +4,14 @@ // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include + +#if defined(_MSC_VER) && !defined(__clang__) +#include +#endif + #include "common/assert.h" #include "common/bit_util.h" #include "common/scope_exit.h" @@ -22,6 +28,16 @@ namespace Tegra::Engines { +namespace { +inline void PrefetchLine(const void* addr) { +#if defined(_MSC_VER) && !defined(__clang__) + _mm_prefetch(static_cast(addr), _MM_HINT_T0); +#else + __builtin_prefetch(addr, 0, 1); +#endif +} +} // namespace + /// First register id that is actually a Macro call. 
constexpr u32 MacroRegistersStart = 0xE00; @@ -37,9 +53,10 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_) { dirty.flags.flip(); InitializeRegisterDefaults(); - execution_mask.reset(); - for (size_t i = 0; i < execution_mask.size(); i++) + execution_mask.fill(0); + for (size_t i = 0; i < EXECUTION_MASK_TABLE_SIZE; i++) execution_mask[i] = IsMethodExecutable(u32(i)); + execution_mask_default = true; } Maxwell3D::~Maxwell3D() = default; @@ -298,18 +315,44 @@ u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) { } void Maxwell3D::ConsumeSinkImpl() { + std::stable_sort(method_sink.begin(), method_sink.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + + const auto sink_size = method_sink.size(); const auto control = shadow_state.shadow_ram_control; if (control == Regs::ShadowRamControl::Track || control == Regs::ShadowRamControl::TrackWithFilter) { - for (auto [method, value] : method_sink) { + for (size_t i = 0; i < sink_size; ++i) { + const auto [method, value] = method_sink[i]; + if (i + 1 < sink_size) { + const u32 next = method_sink[i + 1].first; + PrefetchLine(&regs.reg_array[next]); + PrefetchLine(&shadow_state.reg_array[next]); + PrefetchLine(&dirty.tables[0][next]); + } shadow_state.reg_array[method] = value; ProcessDirtyRegisters(method, value); } } else if (control == Regs::ShadowRamControl::Replay) { - for (auto [method, value] : method_sink) + for (size_t i = 0; i < sink_size; ++i) { + const auto [method, value] = method_sink[i]; + if (i + 1 < sink_size) { + const u32 next = method_sink[i + 1].first; + PrefetchLine(&regs.reg_array[next]); + PrefetchLine(&shadow_state.reg_array[next]); + PrefetchLine(&dirty.tables[0][next]); + } ProcessDirtyRegisters(method, shadow_state.reg_array[method]); + } } else { - for (auto [method, value] : method_sink) + for (size_t i = 0; i < sink_size; ++i) { + const auto [method, value] = method_sink[i]; + if (i + 1 < sink_size) { + const u32 next = method_sink[i + 
1].first; + PrefetchLine(&regs.reg_array[next]); + PrefetchLine(&dirty.tables[0][next]); + } ProcessDirtyRegisters(method, value); + } } method_sink.clear(); } diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 089d118a09..c99039cda7 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -23,7 +23,7 @@ using namespace Texture; MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_) : system{system_}, memory_manager{memory_manager_} { - execution_mask.reset(); + execution_mask.fill(0); execution_mask[offsetof(Regs, launch_dma) / sizeof(u32)] = true; }