[core/core_timing] better MWAITX and WAITPKG delays (#3984)

This implements MWAITX and WAITPKG extensions (umonitor, mwait) for CPUs that support them.

Reduces wait times and bypasses the OS timing facilities, which are slow (notably on Windows). Generally it should answer within 0.2 to 0.5 microseconds (since most requests wait for that long).

Also does a general rework of static ctors and stuff

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3984
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
This commit is contained in:
lizzie 2026-05-30 21:59:10 +02:00 committed by crueter
parent ff7bbaea7d
commit 7c32cf03a1
No known key found for this signature in database
GPG key ID: 425ACD2D4830EBC6
19 changed files with 477 additions and 533 deletions

View file

@ -4,6 +4,8 @@
// SPDX-FileCopyrightText: 2014 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <chrono>
#include <limits>
#include <string>
#include <thread>
@ -18,24 +20,35 @@
#elif defined(_WIN32)
#include <windows.h>
#include "common/string_util.h"
#include "common/windows/timer_resolution.h"
#else
#if defined(__FreeBSD__)
#include <sys/cpuset.h>
#include <sys/_cpuset.h>
#include <pthread_np.h>
// Compatibility with CPUset
#define cpu_set_t cpuset_t
#elif defined(__DragonFly__) || defined(__OpenBSD__) || defined(__Bitrig__)
#include <pthread_np.h>
#endif
#include <pthread.h>
#include <sched.h>
#endif
#ifndef _WIN32
#include <unistd.h>
#endif
#ifdef __FreeBSD__
# define cpu_set_t cpuset_t
#include "common/cpu_features.h"
#ifdef ARCHITECTURE_x86_64
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include "common/x64/rdtsc.h"
#endif
#include "core/core_timing.h"
namespace Common {
@ -144,4 +157,93 @@ void PinCurrentThreadToPerformanceCore(size_t core_id) {
}
}
#ifdef ARCHITECTURE_x86_64
// On Linux and other UNIX systems, a futex would nominally be used to cover the cost of waiting.
// The idea is that it is intuitively cheaper to use a direct instruction than a full futex call;
// the underlying libc++ implementation uses pthread_cond_timedwait, which MAY invoke a futex.
// Let's pretend the OS is too expensive to jump into, and avoid ANY context switches;
// this should *IN THEORY* lower CPU usage while just waiting for stuff.
// On Windows the minimal quantum resolution is about 500us, and the normal CRT cond var is 1.5ms(?),
// so we may as well avoid that too.
// Let's just give ALL platforms (almost) the same mechanisms for when they have MONITORX/MWAITX OR WAITPKG.
#ifdef __clang__
__attribute__((target("waitpkg,mwaitx")))
#elif defined(__GNUC__)
#pragma GCC target("waitpkg")
#pragma GCC target("mwaitx")
#endif
/// Waits until the event is set or until `time` has elapsed, whichever comes first.
/// On success the event flag is consumed (cleared) before returning.
/// @param time Maximum duration to wait.
/// @returns true if the event was observed set (and was cleared), false if the wait timed out.
bool Event::WaitFor(const std::chrono::nanoseconds time) {
#ifdef _WIN32
    auto const start = Common::X64::FencedRDTSC();
    auto const& caps = Common::g_cpu_caps;
    // Absolute deadline expressed in TSC ticks.
    auto const end = start + Common::g_wall_clock.NsToTicks(time);
    if (caps.monitorx) {
        while (true) {
            // Arm the monitor first, then re-check the condition before waiting;
            // the wait must be conditional to avoid racing with a concurrent store.
            _mm_monitorx(reinterpret_cast<u64*>(std::addressof(is_set)), 0, 0);
            if (!is_set.load()) {
                // RDTSC may be fenced here due to the atomic load
                auto const now = _rdtsc();
                if (end <= now)
                    return false; // timeout
                // See here: https://github.com/torvalds/linux/blob/948a64995aca6820abefd17f1a4258f5835c5ad9/arch/x86/lib/delay.c#L93
                // MWAITX accepts a 32-bit timer holding the number of cycles to wait for,
                // NOT an absolute TSC value, it's just a delta.
                // Clamp in 64 bits first: converting the delta to u32 before clamping
                // would silently wrap for waits longer than 2^32 cycles.
                u32 const cycles = static_cast<u32>(std::min<u64>(
                    (std::numeric_limits<u32>::max)(), static_cast<u64>(end) - static_cast<u64>(now)));
                // BIT[1] = use a timer
                // Hint = 0: Use C1 state when sleepy (means slower wakeup but better savings)
                _mm_mwaitx(1 << 1, 0u, cycles);
                // A wake-up with the flag still clear is not necessarily a timeout (the timer
                // may have been clamped, or the core woke for an unrelated reason), so go
                // around again and let the deadline check above decide.
                if (!is_set.load())
                    continue;
            }
            bool expected = true;
            // acq_rel: acquire what the setter published, in addition to releasing the clear.
            // A weak CAS may fail spuriously; the enclosing loop retries.
            if (is_set.compare_exchange_weak(expected, false, std::memory_order_acq_rel))
                return true;
        }
    } else if (caps.waitpkg) {
        // #UD If CPUID.7.0:ECX.WAITPKG[bit 5]=0.
        while (true) {
            _umonitor(std::addressof(is_set));
            if (!is_set.load()) {
                if (end <= _rdtsc())
                    return false; // timeout
                // umwait takes an ABSOLUTE TSC deadline. Its return value only reports whether
                // the OS-imposed maximum wait expired, not whether our deadline passed or the
                // monitored line was written, so it is ignored and the state is re-checked.
                static_cast<void>(_umwait(0, end));
                if (!is_set.load())
                    continue;
            }
            bool expected = true;
            if (is_set.compare_exchange_weak(expected, false, std::memory_order_acq_rel))
                return true;
        }
    } else {
        // No hardware wait support: poll, sleeping one timer tick between checks.
        while (!is_set.load() && end > _rdtsc())
            Common::Windows::SleepForOneTick();
        if (!is_set.load())
            return false; // deadline passed without the event being set
        Reset();
        return true;
    }
#else
    std::unique_lock lk{mutex};
    // wait_for returns false when the predicate is still unsatisfied at timeout.
    if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
        return false;
    is_set = false;
    return true;
#endif
}
#else
/// Waits until the event is set or until `time` has elapsed, whichever comes first.
/// On success the event flag is cleared before returning.
/// @param time Maximum duration to wait.
/// @returns true if the event was observed set (and was cleared), false if the wait timed out.
bool Event::WaitFor(const std::chrono::nanoseconds time) {
#ifdef _WIN32
    auto const end = Common::g_wall_clock.GetTimeNS() + time;
    // Poll the flag, sleeping one timer tick between checks, until set or past the deadline.
    while (!is_set.load() && end > Common::g_wall_clock.GetTimeNS())
        Common::Windows::SleepForOneTick();
    // Report a timeout as false, matching the condition-variable path below.
    if (!is_set.load())
        return false;
    Reset();
    return true;
#else
    std::unique_lock lk{mutex};
    // wait_for returns false when the predicate is still unsatisfied at timeout.
    if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
        return false;
    is_set = false;
    return true;
#endif
}
#endif
} // namespace Common