[core/core_timing] better MWAITX and WAITPKG delays (#3984)

This implements MWAITX and WAITPKG extensions (umonitor, mwait) for CPUs that support them.

Reduces wait times and bypasses the OS timing facilities, which are slow (notably on Windows). Generally it should answer within 0.2 to 0.5 microseconds (since most requests wait for that long).

Also includes a general rework of static constructors and related code.

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3984
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
This commit is contained in:
lizzie 2026-05-30 21:59:10 +02:00 committed by crueter
parent ff7bbaea7d
commit 7c32cf03a1
No known key found for this signature in database
GPG key ID: 425ACD2D4830EBC6
19 changed files with 477 additions and 533 deletions

View file

@ -1,250 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-FileCopyrightText: Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <array>
#include <cstring>
#include <fstream>
#include <iterator>
#include <optional>
#include <string_view>
#include <thread>
#include <vector>
#include "common/bit_util.h"
#include "common/common_types.h"
#include "common/logging.h"
#include "common/x64/cpu_detect.h"
#include "common/x64/rdtsc.h"
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>
// Reads the extended control register selected by `index` through the MSVC intrinsic.
static inline u64 xgetbv(u32 index) {
    const u64 xcr_value = _xgetbv(index);
    return xcr_value;
}
#else
#if defined(__DragonFly__) || defined(__FreeBSD__)
// clang-format off
#include <sys/types.h>
#include <machine/cpufunc.h>
// clang-format on
#endif
// Executes CPUID for leaf `function_id` / sub-leaf `subfunction_id` and stores the resulting
// EAX, EBX, ECX and EDX values in info[0..3], in that order.
static inline void __cpuidex(int info[4], u32 function_id, u32 subfunction_id) {
#if defined(__DragonFly__) || defined(__FreeBSD__)
    // Despite the name, this is just do_cpuid() with ECX as second input.
    cpuid_count(static_cast<u_int>(function_id), static_cast<u_int>(subfunction_id),
                reinterpret_cast<u_int*>(info));
#else
    // The inputs travel through the "a"/"c" input constraints, so info[] does not need to be
    // pre-loaded. volatile keeps the compiler from merging or dropping CPUID executions.
    __asm__ __volatile__("cpuid"
                         : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
                         : "a"(function_id), "c"(subfunction_id));
#endif
}
// Convenience wrapper: runs CPUID for `function_id` with sub-leaf 0.
static inline void __cpuid(int info[4], u32 function_id) {
    __cpuidex(info, function_id, 0);
}
// This branch does not include <intrin.h>, so the register index used by Detect() is
// defined here.
#define _XCR_XFEATURE_ENABLED_MASK 0
// Executes XGETBV for the extended control register `index` and returns EDX:EAX as one
// 64-bit value.
static inline u64 xgetbv(u32 index) {
    u32 low_half, high_half;
    __asm__ __volatile__("xgetbv" : "=a"(low_half), "=d"(high_half) : "c"(index));
    const u64 upper = static_cast<u64>(high_half) << 32;
    return upper | low_half;
}
#endif // _MSC_VER
namespace Common {
// Translates a CPUID vendor string into the matching Manufacturer value.
// Any string that is not one of the known vendors yields Manufacturer::Unknown.
CPUCaps::Manufacturer CPUCaps::ParseManufacturer(std::string_view brand_string) {
    if (brand_string == "GenuineIntel") {
        return Manufacturer::Intel;
    }
    if (brand_string == "AuthenticAMD") {
        return Manufacturer::AMD;
    }
    if (brand_string == "HygonGenuine") {
        return Manufacturer::Hygon;
    }
    return Manufacturer::Unknown;
}
// Detects the various CPU features.
// Queries CPUID (and XGETBV for the AVX state check) and returns a CPUCaps describing the
// host. Every field starts zero-initialised, so anything belonging to a CPUID leaf the host
// does not report stays 0/false.
static CPUCaps Detect() {
    CPUCaps caps = {};
    // Assumes the CPU supports the CPUID instruction. Those that don't would likely not support
    // yuzu at all anyway
    int cpu_id[4];

    // Detect CPU's CPUID capabilities and grab manufacturer string
    __cpuid(cpu_id, 0x00000000);
    const u32 max_std_fn = cpu_id[0]; // EAX
    // The vendor string is laid out in EBX, EDX, ECX order; the memset keeps it NUL-terminated.
    std::memset(caps.brand_string, 0, std::size(caps.brand_string));
    std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(u32));
    std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(u32));
    std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(u32));
    caps.manufacturer = CPUCaps::ParseManufacturer(caps.brand_string);

    // Set reasonable default cpu string even if brand string not available
    std::strncpy(caps.cpu_string, caps.brand_string, std::size(caps.brand_string));

    __cpuid(cpu_id, 0x80000000);
    const u32 max_ex_fn = cpu_id[0];

    // Detect family and other miscellaneous features
    if (max_std_fn >= 1) {
        __cpuid(cpu_id, 0x00000001);
        caps.sse3 = Common::Bit<0>(cpu_id[2]);
        caps.pclmulqdq = Common::Bit<1>(cpu_id[2]);
        caps.ssse3 = Common::Bit<9>(cpu_id[2]);
        caps.sse4_1 = Common::Bit<19>(cpu_id[2]);
        caps.sse4_2 = Common::Bit<20>(cpu_id[2]);
        caps.movbe = Common::Bit<22>(cpu_id[2]);
        caps.popcnt = Common::Bit<23>(cpu_id[2]);
        caps.aes = Common::Bit<25>(cpu_id[2]);
        caps.f16c = Common::Bit<29>(cpu_id[2]);

        // AVX support requires 3 separate checks:
        //  - Is the AVX bit (ECX bit 28) set in CPUID?
        //  - Is the OSXSAVE bit (ECX bit 27) set in CPUID?
        //  - XGETBV result has the XCR bits 1 and 2 (mask 0x6) set.
        if (Common::Bit<28>(cpu_id[2]) && Common::Bit<27>(cpu_id[2])) {
            if ((xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
                caps.avx = true;
                if (Common::Bit<12>(cpu_id[2]))
                    caps.fma = true;
            }
        }

        if (max_std_fn >= 7) {
            __cpuidex(cpu_id, 0x00000007, 0x00000000);
            // Can't enable AVX{2,512} unless the XSAVE/XGETBV checks above passed
            // NOTE(review): the AVX-512 flags are gated only on the 0x6 mask checked above; no
            // further XGETBV bits are tested here.
            if (caps.avx) {
                caps.avx2 = Common::Bit<5>(cpu_id[1]);
                caps.avx512f = Common::Bit<16>(cpu_id[1]);
                caps.avx512dq = Common::Bit<17>(cpu_id[1]);
                caps.avx512cd = Common::Bit<28>(cpu_id[1]);
                caps.avx512bw = Common::Bit<30>(cpu_id[1]);
                caps.avx512vl = Common::Bit<31>(cpu_id[1]);
                caps.avx512vbmi = Common::Bit<1>(cpu_id[2]);
                caps.avx512bitalg = Common::Bit<12>(cpu_id[2]);
            }
            caps.bmi1 = Common::Bit<3>(cpu_id[1]);
            caps.bmi2 = Common::Bit<8>(cpu_id[1]);
            caps.sha = Common::Bit<29>(cpu_id[1]);
            caps.waitpkg = Common::Bit<5>(cpu_id[2]);
            caps.gfni = Common::Bit<8>(cpu_id[2]);
        }
    }

    if (max_ex_fn >= 0x80000004) {
        // Extract CPU model string (three leaves, 16 bytes each)
        __cpuid(cpu_id, 0x80000002);
        std::memcpy(caps.cpu_string, cpu_id, sizeof(cpu_id));
        __cpuid(cpu_id, 0x80000003);
        std::memcpy(caps.cpu_string + 16, cpu_id, sizeof(cpu_id));
        __cpuid(cpu_id, 0x80000004);
        std::memcpy(caps.cpu_string + 32, cpu_id, sizeof(cpu_id));
    }

    if (max_ex_fn >= 0x80000001) {
        // Check for more features
        __cpuid(cpu_id, 0x80000001);
        caps.lzcnt = Common::Bit<5>(cpu_id[2]);
        caps.monitorx = Common::Bit<29>(cpu_id[2]);
    }

    if (max_ex_fn >= 0x80000007) {
        __cpuid(cpu_id, 0x80000007);
        caps.invariant_tsc = Common::Bit<8>(cpu_id[3]);
    }

    if (max_std_fn >= 0x15) {
        __cpuid(cpu_id, 0x15);
        caps.tsc_crystal_ratio_denominator = cpu_id[0];
        caps.tsc_crystal_ratio_numerator = cpu_id[1];
        caps.crystal_frequency = cpu_id[2];
        // Some CPU models might not return a crystal frequency.
        // The CPU model can be detected to use the values from turbostat
        // https://github.com/torvalds/linux/blob/master/tools/power/x86/turbostat/turbostat.c#L5569
        // but it's easier to just estimate the TSC tick rate for these cases.
        //
        // The formula is only meaningful when all three values are non-zero: a zero
        // denominator would divide by zero, and a zero numerator or crystal frequency would
        // produce a TSC frequency of 0 instead of a usable value.
        const bool ratio_reported =
            caps.tsc_crystal_ratio_denominator != 0 && caps.tsc_crystal_ratio_numerator != 0;
        if (ratio_reported && caps.crystal_frequency != 0) {
            caps.tsc_frequency = static_cast<u64>(caps.crystal_frequency) *
                                 caps.tsc_crystal_ratio_numerator /
                                 caps.tsc_crystal_ratio_denominator;
        } else {
            caps.tsc_frequency = X64::EstimateRDTSCFrequency();
        }
    }

    if (max_std_fn >= 0x16) {
        __cpuid(cpu_id, 0x16);
        caps.base_frequency = cpu_id[0];
        caps.max_frequency = cpu_id[1];
        caps.bus_frequency = cpu_id[2];
    }

    return caps;
}
// Returns the host CPU capabilities. Detect() runs only when the function-local static is
// initialised, i.e. on the first call; every call returns a reference to that same object.
const CPUCaps& GetCPUCaps() {
    static CPUCaps host_caps = Detect();
    const CPUCaps& result = host_caps;
    return result;
}
// Detects the CPU core count of the host.
// Returns std::nullopt when the count cannot be determined: the Windows query fails, the
// standard library cannot report a thread count, the SMT state is unrecognised, or the
// platform is unsupported. A returned value is always at least 1 on the unix path.
std::optional<int> GetProcessorCount() {
#if defined(_WIN32)
    // Get the buffer length.
    DWORD length = 0;
    GetLogicalProcessorInformation(nullptr, &length);
    if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        LOG_ERROR(Frontend, "Failed to query core count.");
        return std::nullopt;
    }
    std::vector<SYSTEM_LOGICAL_PROCESSOR_INFORMATION> buffer(
        length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
    // Now query the core count.
    if (!GetLogicalProcessorInformation(buffer.data(), &length)) {
        LOG_ERROR(Frontend, "Failed to query core count.");
        return std::nullopt;
    }
    return static_cast<int>(
        std::count_if(buffer.cbegin(), buffer.cend(), [](const auto& proc_info) {
            return proc_info.Relationship == RelationProcessorCore;
        }));
#elif defined(__unix__)
    // hardware_concurrency() returns 0 when the value is not computable; report that as
    // "unknown" rather than as a core count of zero.
    const unsigned int hw_threads = std::thread::hardware_concurrency();
    if (hw_threads == 0) {
        return std::nullopt;
    }
    const int thread_count = static_cast<int>(hw_threads);
    // A missing or unreadable sysfs node leaves the state at '0' (treated as SMT inactive).
    std::ifstream smt("/sys/devices/system/cpu/smt/active");
    char state = '0';
    if (smt) {
        smt.read(&state, sizeof(state));
    }
    switch (state) {
    case '0':
        return thread_count;
    case '1':
        // Halve the thread count when SMT is active, but never report fewer than one core
        // (e.g. a single visible logical CPU).
        return std::max(1, thread_count / 2);
    default:
        return std::nullopt;
    }
#else
    // Shame on you
    return std::nullopt;
#endif
}
} // namespace Common

View file

@ -1,82 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-FileCopyrightText: Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <optional>
#include <string_view>
#include "common/common_types.h"
namespace Common {
/// x86/x64 CPU capabilities that may be detected by this module.
/// Instances are filled in by the CPUID-based detection code and handed out through
/// GetCPUCaps().
struct CPUCaps {
// CPU vendors recognised by ParseManufacturer(); anything else is Unknown.
enum class Manufacturer : u8 {
Unknown = 0,
Intel = 1,
AMD = 2,
Hygon = 3,
};
// Maps a CPUID vendor string ("GenuineIntel", "AuthenticAMD", "HygonGenuine") to the
// corresponding Manufacturer, or Manufacturer::Unknown for any other string.
static Manufacturer ParseManufacturer(std::string_view brand_string);
// Vendor parsed from brand_string.
Manufacturer manufacturer;
// 12-byte vendor identification string from CPUID leaf 0, plus a NUL terminator.
char brand_string[13];
// Processor model string assembled from CPUID leaves 0x80000002-0x80000004 (3 x 16 bytes);
// holds a copy of brand_string when those leaves are not available.
char cpu_string[48];
// EAX, EBX and ECX of CPUID leaf 0x16; left at 0 when the leaf is not reported.
// NOTE(review): units are presumably MHz as defined for that leaf; confirm before relying on it.
u32 base_frequency;
u32 max_frequency;
u32 bus_frequency;
// EAX, EBX and ECX of CPUID leaf 0x15; left at 0 when the leaf is not reported.
u32 tsc_crystal_ratio_denominator;
u32 tsc_crystal_ratio_numerator;
u32 crystal_frequency;
u64 tsc_frequency; // Derived from the above three values, or estimated when they cannot be used
// Instruction-set / feature flags, one bit each. A flag is set when detection found the
// corresponding CPUID feature bit; avx (and the flags that depend on it) additionally require
// the XGETBV state check to pass.
bool sse3 : 1;
bool ssse3 : 1;
bool sse4_1 : 1;
bool sse4_2 : 1;
bool avx : 1;
bool avx2 : 1;
bool avx512f : 1;
bool avx512dq : 1;
bool avx512cd : 1;
bool avx512bw : 1;
bool avx512vl : 1;
bool avx512vbmi : 1;
bool avx512bitalg : 1;
bool aes : 1;
bool bmi1 : 1;
bool bmi2 : 1;
bool f16c : 1;
bool fma : 1;
bool gfni : 1;
bool invariant_tsc : 1;
bool lzcnt : 1;
bool monitorx : 1;
bool movbe : 1;
bool pclmulqdq : 1;
bool popcnt : 1;
bool sha : 1;
bool waitpkg : 1;
};
/**
* Gets the supported capabilities of the host CPU.
* Detection is performed once, when the function is first called; subsequent calls return a
* reference to the same object.
* @return Reference to a CPUCaps struct with the detected host CPU capabilities
*/
const CPUCaps& GetCPUCaps();
/// Detects CPU core count.
/// @return The detected core count, or std::nullopt when it could not be determined
///         (the platform query failed or the platform is not supported).
std::optional<int> GetProcessorCount();
} // namespace Common

View file

@ -1,75 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#include <thread>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include "common/x64/cpu_detect.h"
#include "common/x64/cpu_wait.h"
#include "common/x64/rdtsc.h"
namespace Common::X64 {
namespace {
// Length of one timed wait, in cycles. 100,000 cycles keeps the wait short while still
// saving CPU resources. For reference:
//   1 GHz -> 100us
//   2 GHz -> 50us
//   4 GHz -> 25us
constexpr unsigned int PauseCycles{100'000U};
} // Anonymous namespace
#if defined(_MSC_VER) && !defined(__clang__)
// Issues a TPAUSE through the MSVC intrinsic, passing control value 0 and a deadline of the
// current fenced TSC reading plus PauseCycles.
__forceinline static void TPAUSE() {
    static constexpr auto RequestC02State = 0U;
    const auto deadline = FencedRDTSC() + PauseCycles;
    _tpause(RequestC02State, deadline);
}
// Arms MONITORX on a local dummy variable and then issues a timed MWAITX with a limit of
// PauseCycles, using the MSVC intrinsics.
__forceinline static void MWAITX() {
    // The monitored variable should be aligned to a cache line.
    alignas(64) u64 monitored_line{};
    static constexpr auto RequestC1State = 0U;
    static constexpr auto EnableWaitTimeFlag = 1U << 1;
    _mm_monitorx(&monitored_line, 0, 0);
    _mm_mwaitx(EnableWaitTimeFlag, RequestC1State, PauseCycles);
}
#else
// Issues a TPAUSE via inline assembly. The deadline (current fenced TSC reading plus
// PauseCycles) is split into its 32-bit halves and passed in EDX:EAX; the control value 0
// goes in a general-purpose register.
static void TPAUSE() {
    static constexpr auto RequestC02State = 0U;
    const auto deadline = FencedRDTSC() + PauseCycles;
    const auto deadline_hi = static_cast<u32>(deadline >> 32);
    const auto deadline_lo = static_cast<u32>(deadline & 0xFFFFFFFF);
    asm volatile("tpause %0" : : "r"(RequestC02State), "d"(deadline_hi), "a"(deadline_lo));
}
static void MWAITX() {
static constexpr auto EnableWaitTimeFlag = 1U << 1;
static constexpr auto RequestC1State = 0U;
// monitor_var should be aligned to a cache line.
alignas(64) u64 monitor_var{};
asm volatile("monitorx" : : "a"(&monitor_var), "c"(0), "d"(0));
asm volatile("mwaitx" : : "a"(RequestC1State), "b"(PauseCycles), "c"(EnableWaitTimeFlag));
}
#endif
// Briefly pauses the calling thread using the best mechanism the host CPU reports:
// TPAUSE when WAITPKG is available, otherwise MWAITX when MONITORX is available,
// otherwise a plain thread yield. The capability flags are read once and cached.
void MicroSleep() {
    static const bool waitpkg_available = GetCPUCaps().waitpkg;
    static const bool monitorx_available = GetCPUCaps().monitorx;
    if (waitpkg_available) {
        TPAUSE();
        return;
    }
    if (monitorx_available) {
        MWAITX();
        return;
    }
    std::this_thread::yield();
}
} // namespace Common::X64

View file

@ -1,10 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
namespace Common::X64 {
/// Briefly pauses the calling thread: uses TPAUSE when the CPU reports WAITPKG, MWAITX when it
/// reports MONITORX, and otherwise yields the thread.
void MicroSleep();
} // namespace Common::X64