From f946118c057ed51dcc324e5b64edfe29644ed316 Mon Sep 17 00:00:00 2001 From: lizzie Date: Tue, 19 May 2026 07:46:59 +0000 Subject: [PATCH] [core/core_timing] better MWAITX and WAITPKG delays Signed-off-by: lizzie --- src/common/x64/cpu_wait.cpp | 81 +++++++++++++------------------------ src/common/x64/cpu_wait.h | 4 +- src/core/core_timing.cpp | 2 +- 3 files changed, 32 insertions(+), 55 deletions(-) diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp index 85d27161ba..78a2d2d921 100644 --- a/src/common/x64/cpu_wait.cpp +++ b/src/common/x64/cpu_wait.cpp @@ -5,6 +5,8 @@ #ifdef _MSC_VER #include +#else +#include #endif #include "common/x64/cpu_detect.h" @@ -13,60 +15,33 @@ namespace Common::X64 { -namespace { - -// 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. -// For reference: -// At 1 GHz, 100K cycles is 100us -// At 2 GHz, 100K cycles is 50us -// At 4 GHz, 100K cycles is 25us -constexpr auto PauseCycles = 100'000U; - -} // Anonymous namespace - -#if defined(_MSC_VER) && !defined(__clang__) -__forceinline static void TPAUSE() { - static constexpr auto RequestC02State = 0U; - _tpause(RequestC02State, FencedRDTSC() + PauseCycles); -} - -__forceinline static void MWAITX() { - static constexpr auto EnableWaitTimeFlag = 1U << 1; - static constexpr auto RequestC1State = 0U; - - // monitor_var should be aligned to a cache line. - alignas(64) u64 monitor_var{}; - _mm_monitorx(&monitor_var, 0, 0); - _mm_mwaitx(EnableWaitTimeFlag, RequestC1State, PauseCycles); -} -#else -static void TPAUSE() { - static constexpr auto RequestC02State = 0U; - const auto tsc = FencedRDTSC() + PauseCycles; - const auto eax = static_cast(tsc & 0xFFFFFFFF); - const auto edx = static_cast(tsc >> 32); - asm volatile("tpause %0" : : "r"(RequestC02State), "d"(edx), "a"(eax)); -} - -static void MWAITX() { - static constexpr auto EnableWaitTimeFlag = 1U << 1; - static constexpr auto RequestC1State = 0U; - - // monitor_var should be aligned to a cache line. - alignas(64) u64 monitor_var{}; - asm volatile("monitorx" : : "a"(&monitor_var), "c"(0), "d"(0)); - asm volatile("mwaitx" : : "a"(RequestC1State), "b"(PauseCycles), "c"(EnableWaitTimeFlag)); -} +#ifdef __clang__ +__attribute__((target("waitpkg"))) +__attribute__((target("mwaitx"))) +#elif defined(__GNUC__) +#pragma GCC target("waitpkg") +#pragma GCC target("mwaitx") #endif - -void MicroSleep() { - static const bool has_waitpkg = GetCPUCaps().waitpkg; - static const bool has_monitorx = GetCPUCaps().monitorx; - - if (has_waitpkg) { - TPAUSE(); - } else if (has_monitorx) { - MWAITX(); +void MicroSleep(u64 rem) { + // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. + // For reference: + // At 1 GHz, 100K cycles is 100us + // At 2 GHz, 100K cycles is 50us + // At 4 GHz, 100K cycles is 25us + auto& caps = GetCPUCaps(); + u32 cycles = caps.invariant_tsc + ? 1'000'000U + : rem * (caps.tsc_frequency / 1000000ULL); + if (caps.waitpkg) { + constexpr auto RequestC02State = 0U; + _tpause(RequestC02State, FencedRDTSC() + cycles); + } else if (caps.monitorx) { + constexpr auto EnableWaitTimeFlag = 1U << 1; + constexpr auto RequestC1State = 0U; + // monitor_var should be aligned to a cache line. + alignas(64) u64 monitor_var{}; + _mm_monitorx(&monitor_var, 0, 0); + _mm_mwaitx(EnableWaitTimeFlag, RequestC1State, cycles); } else { std::this_thread::yield(); } diff --git a/src/common/x64/cpu_wait.h b/src/common/x64/cpu_wait.h index 472ddca815..8bf47783ec 100644 --- a/src/common/x64/cpu_wait.h +++ b/src/common/x64/cpu_wait.h @@ -3,8 +3,10 @@ #pragma once +#include "common/common_types.h" + namespace Common::X64 { -void MicroSleep(); +void MicroSleep(u64 rem); } // namespace Common::X64 diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index 6738a0d73d..d91361f2e7 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -78,7 +78,7 @@ void CoreTiming::Initialize(std::function&& on_thread_init_) { Common::Windows::SleepForOneTick(); } else { #ifdef ARCHITECTURE_x86_64 - Common::X64::MicroSleep(); + Common::X64::MicroSleep(wait_time); #else std::this_thread::yield(); #endif