[scheduler, dma, maxwell] Reduce CPU stalls in the GPU command processing pipeline through multiple targeted optimizations (#3296)

- Scheduler: Reduced lock scope to allow parallel command preparation across channels
- DmaPusher: Added command prefetching (16-command lookahead) to improve cache hit rate
- Maxwell3D: Pre-allocated macro parameter vectors to eliminate dynamic allocations and unrolled the dirty register tracking loop for better cache locality
- MacroEngine: Added last-executed macro cache to skip hash table lookups on the hot path

Co-authored-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3296
Reviewed-by: Maufeat <sahyno1996@gmail.com>
Reviewed-by: DraVee <dravee@eden-emu.dev>
Co-authored-by: CamilleLaVey <camillelavey99@gmail.com>
Co-committed-by: CamilleLaVey <camillelavey99@gmail.com>
2026-04-15 02:28:56 +02:00 · 2026-01-18 03:45:18 +01:00 · 2026-01-18 03:45:18 +01:00 · 51cc1bc6be
commit 51cc1bc6be
parent 6ec6ca7c37
4 changed files with 100 additions and 30 deletions
--- a/src/video_core/control/scheduler.cpp
+++ b/src/video_core/control/scheduler.cpp
@@ -17,11 +17,16 @@ Scheduler::Scheduler(GPU& gpu_) : gpu{gpu_} {}
 Scheduler::~Scheduler() = default;

 void Scheduler::Push(s32 channel, CommandList&& entries) {
-    std::unique_lock lk(scheduling_guard);
-    auto it = channels.find(channel);
-    ASSERT(it != channels.end());
-    auto& channel_state = it->second;
-    gpu.BindChannel(channel_state->bind_id);
+    std::shared_ptr<ChannelState> channel_state;
+    {
+        std::unique_lock lk(scheduling_guard);
+        auto it = channels.find(channel);
+        ASSERT(it != channels.end());
+        channel_state = it->second;
+        gpu.BindChannel(channel_state->bind_id);
+    }
+    // Push/dispatch run after scheduling_guard is released (to reduce contention), via the shared_ptr copy.
+    // NOTE(review): BindChannel ran under the lock; confirm a concurrent Push cannot rebind before dispatch.
    channel_state->dma_pusher->Push(std::move(entries));
    channel_state->dma_pusher->DispatchCalls();
 }