diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 3844a8e2f9..d6d44e66b4 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -122,7 +122,35 @@ void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
             dma_state.is_last_call = true;
             index += max_write;
         } else if (dma_state.method_count) {
-            auto const command_header = commands[index]; //can copy
+            if (!dma_state.non_incrementing && !dma_increment_once &&
+                dma_state.method >= non_puller_methods) { // batching path, incrementing runs only
+                auto subchannel = subchannels[dma_state.subchannel];
+                const u32 available = u32(std::min<size_t>(
+                    index + dma_state.method_count, commands.size()) - index);
+                u32 batch = 0; // length of the leading run that can go straight to the sink
+                u32 method = dma_state.method;
+                while (batch < available) {
+                    const bool needs_exec =
+                        (method < Engines::EngineInterface::EXECUTION_MASK_TABLE_SIZE)
+                            ? subchannel->execution_mask[method]
+                            : subchannel->execution_mask_default;
+                    if (needs_exec) break; // run ends at the first method flagged for execution
+                    batch++;
+                    method++;
+                }
+                if (batch > 0) { // queue the whole run, then step the DMA state past it
+                    auto& sink = subchannel->method_sink;
+                    sink.reserve(sink.size() + batch);
+                    for (u32 j = 0; j < batch; j++) {
+                        sink.emplace_back(dma_state.method + j, commands[index + j].argument);
+                    }
+                    dma_state.method += batch;
+                    dma_state.method_count -= batch;
+                    index += batch;
+                    continue; // skips the single-word CallMethod path below for these words
+                }
+            }
+            auto const command_header = commands[index];
             dma_state.dma_word_offset = u32(index * sizeof(u32));
             dma_state.is_last_call = dma_state.method_count <= 1;
             CallMethod(command_header.argument);
@@ -181,7 +209,11 @@ void DmaPusher::CallMethod(u32 argument) const {
         });
     } else {
         auto subchannel = subchannels[dma_state.subchannel];
-        if (!subchannel->execution_mask[dma_state.method]) {
+        const bool needs_execution =
+            (dma_state.method < Engines::EngineInterface::EXECUTION_MASK_TABLE_SIZE)
+                ? subchannel->execution_mask[dma_state.method]
+                : subchannel->execution_mask_default; // methods past the table share one flag
+        if (!needs_execution) {
             subchannel->method_sink.emplace_back(dma_state.method, argument);
         } else {
             subchannel->ConsumeSink();
diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h
index bf3bd66aca..292f0a5738 100644
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -6,9 +6,9 @@
 
 #pragma once
 
-#include <bitset>
-#include <limits>
-#include <vector>
+#include <array>
+
+#include <boost/container/small_vector.hpp>
 
 #include "common/common_types.h"
 
@@ -41,8 +41,11 @@ public:
         ConsumeSinkImpl();
     }
 
-    std::bitset<(std::numeric_limits<u16>::max)()> execution_mask{};
-    std::vector<std::pair<u32, u32>> method_sink{};
+    static constexpr size_t EXECUTION_MASK_TABLE_SIZE = 0xE00; // entries in execution_mask
+
+    std::array<u8, EXECUTION_MASK_TABLE_SIZE> execution_mask{}; // non-zero: method is not sunk
+    bool execution_mask_default{}; // used for methods >= EXECUTION_MASK_TABLE_SIZE
+    boost::container::small_vector<std::pair<u32, u32>, 64> method_sink{}; // (method, argument)
     bool current_dirty{};
     GPUVAddr current_dma_segment;
 
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index b442c5cc76..11f60ef32b 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -26,7 +26,7 @@ Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager
     regs.src.depth = 1;
     regs.dst.depth = 1;
 
-    execution_mask.reset();
+    execution_mask.fill(0);
     execution_mask[FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1] = true;
 }
 
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index 7b4efeb1e0..d6ee80f6e2 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -18,7 +18,7 @@ namespace Tegra::Engines {
 
 KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_)
     : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {
-    execution_mask.reset();
+    execution_mask.fill(0);
     execution_mask[KEPLER_COMPUTE_REG_INDEX(exec_upload)] = true;
     execution_mask[KEPLER_COMPUTE_REG_INDEX(data_upload)] = true;
     execution_mask[KEPLER_COMPUTE_REG_INDEX(launch)] = true;
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 5d4c4720d3..013a644c1b 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -22,7 +22,7 @@ KeplerMemory::~KeplerMemory() = default;
 void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
     upload_state.BindRasterizer(rasterizer_);
 
-    execution_mask.reset();
+    execution_mask.fill(0); // zero every entry; exec and data are set again just below
     execution_mask[KEPLERMEMORY_REG_INDEX(exec)] = true;
     execution_mask[KEPLERMEMORY_REG_INDEX(data)] = true;
 }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 6d9ebd6296..88869917fd 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -4,8 +4,14 @@
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <algorithm>
 #include <cstring>
 #include <optional>
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#endif
+
 #include "common/assert.h"
 #include "common/bit_util.h"
 #include "common/scope_exit.h"
@@ -22,6 +28,16 @@
 
 namespace Tegra::Engines {
 
+namespace {
+inline void PrefetchLine(const void* addr) { // hint: pull the cache line at addr closer to the CPU
+#if defined(_MSC_VER) && !defined(__clang__)
+    _mm_prefetch(static_cast<const char*>(addr), _MM_HINT_T0);
+#else
+    __builtin_prefetch(addr, 0, 1); // read, locality 1. NOTE(review): MSVC path asks for T0
+#endif
+}
+} // namespace
+
 /// First register id that is actually a Macro call.
 constexpr u32 MacroRegistersStart = 0xE00;
 
@@ -37,9 +53,10 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
 {
     dirty.flags.flip();
     InitializeRegisterDefaults();
-    execution_mask.reset();
-    for (size_t i = 0; i < execution_mask.size(); i++)
+    execution_mask.fill(0);
+    for (size_t i = 0; i < EXECUTION_MASK_TABLE_SIZE; i++)
         execution_mask[i] = IsMethodExecutable(u32(i));
+    execution_mask_default = true;
 }
 
 Maxwell3D::~Maxwell3D() = default;
@@ -298,18 +315,50 @@ u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
 }
 
 void Maxwell3D::ConsumeSinkImpl() {
+    // Order the queued writes by register index; the sort is stable, so repeated writes to the
+    // same register keep their submission order. A sink that is already ordered is left as is,
+    // which skips the temporary buffer std::stable_sort may allocate.
+    const auto by_method = [](const auto& a, const auto& b) { return a.first < b.first; };
+    if (!std::is_sorted(method_sink.begin(), method_sink.end(), by_method)) {
+        std::stable_sort(method_sink.begin(), method_sink.end(), by_method);
+    }
+
+    const auto sink_size = method_sink.size();
+    // Prefetches the register state used by the entry after index i; a no-op on the last entry.
+    // with_shadow additionally covers shadow_state.reg_array, which only the tracking and replay
+    // loops touch.
+    const auto prefetch_next = [&](size_t i, bool with_shadow) {
+        if (i + 1 >= sink_size) {
+            return;
+        }
+        const u32 next = method_sink[i + 1].first;
+        PrefetchLine(&regs.reg_array[next]);
+        if (with_shadow) {
+            PrefetchLine(&shadow_state.reg_array[next]);
+        }
+        PrefetchLine(&dirty.tables[0][next]);
+    };
+
     const auto control = shadow_state.shadow_ram_control;
     if (control == Regs::ShadowRamControl::Track || control == Regs::ShadowRamControl::TrackWithFilter) {
-        for (auto [method, value] : method_sink) {
+        for (size_t i = 0; i < sink_size; ++i) {
+            const auto [method, value] = method_sink[i];
+            prefetch_next(i, true);
             shadow_state.reg_array[method] = value;
             ProcessDirtyRegisters(method, value);
         }
     } else if (control == Regs::ShadowRamControl::Replay) {
-        for (auto [method, value] : method_sink)
+        for (size_t i = 0; i < sink_size; ++i) {
+            const auto [method, value] = method_sink[i];
+            prefetch_next(i, true);
             ProcessDirtyRegisters(method, shadow_state.reg_array[method]);
+        }
     } else {
-        for (auto [method, value] : method_sink)
+        for (size_t i = 0; i < sink_size; ++i) {
+            const auto [method, value] = method_sink[i];
+            prefetch_next(i, false);
             ProcessDirtyRegisters(method, value);
+        }
     }
     method_sink.clear();
 }
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 089d118a09..c99039cda7 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -23,7 +23,7 @@ using namespace Texture;
 
 MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
     : system{system_}, memory_manager{memory_manager_} {
-    execution_mask.reset();
+    execution_mask.fill(0); // zero every entry; only launch_dma is set just below
     execution_mask[offsetof(Regs, launch_dma) / sizeof(u32)] = true;
 }