eden-miror/src/common/thread.cpp
lizzie 7c32cf03a1
[core/core_timing] better MWAITX and WAITPKG delays (#3984)
This implements MWAITX and WAITPKG extensions (umonitor, mwait) for CPUs that support them.

Reduces wait times and bypasses the timing stuff from the OS that is slow (windows notably). generally it should answer within 0.2 to 0.5 microsecs (since most requests wait for that long).

Also does a general rework of static ctors and stuff

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3984
Reviewed-by: MaranBr <maranbr@eden-emu.dev>
Reviewed-by: crueter <crueter@eden-emu.dev>
2026-05-30 21:59:10 +02:00

249 lines
9.1 KiB
C++

// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2013 Dolphin Emulator Project
// SPDX-FileCopyrightText: 2014 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <chrono>
#include <limits>
#include <string>
#include <thread>
#include "common/error.h"
#include "common/logging.h"
#include "common/assert.h"
#include "common/thread.h"
#ifdef __APPLE__
#include <mach/mach.h>
#elif defined(__HAIKU__)
#include <kernel/OS.h>
#elif defined(_WIN32)
#include <windows.h>
#include "common/string_util.h"
#include "common/windows/timer_resolution.h"
#else
#if defined(__FreeBSD__)
#include <sys/cpuset.h>
#include <sys/_cpuset.h>
#include <pthread_np.h>
// Compatibility with CPUset
#define cpu_set_t cpuset_t
#elif defined(__DragonFly__) || defined(__OpenBSD__) || defined(__Bitrig__)
#include <pthread_np.h>
#endif
#include <pthread.h>
#include <sched.h>
#endif
#ifndef _WIN32
#include <unistd.h>
#endif
#include "common/cpu_features.h"
#ifdef ARCHITECTURE_x86_64
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include "common/x64/rdtsc.h"
#endif
#include "core/core_timing.h"
namespace Common {
void SetCurrentThreadPriority(ThreadPriority new_priority) {
#ifdef _WIN32
int windows_priority = [&]() {
switch (new_priority) {
case ThreadPriority::Low: return THREAD_PRIORITY_BELOW_NORMAL;
case ThreadPriority::Normal: return THREAD_PRIORITY_NORMAL;
case ThreadPriority::High: return THREAD_PRIORITY_ABOVE_NORMAL;
case ThreadPriority::VeryHigh: return THREAD_PRIORITY_HIGHEST;
case ThreadPriority::Critical: return THREAD_PRIORITY_TIME_CRITICAL;
default: return THREAD_PRIORITY_NORMAL;
}
}();
SetThreadPriority(GetCurrentThread(), windows_priority);
#elif defined(__HAIKU__)
// TODO: We have priorities for 3D rendering applications - may help lavapipe?
int priority = [&]() {
switch (new_priority) {
case ThreadPriority::Low: return B_LOW_PRIORITY;
case ThreadPriority::Normal: return B_NORMAL_PRIORITY;
case ThreadPriority::High: return B_DISPLAY_PRIORITY;
case ThreadPriority::VeryHigh: return B_URGENT_DISPLAY_PRIORITY;
case ThreadPriority::Critical: return B_URGENT_PRIORITY;
default: return B_NORMAL_PRIORITY;
}
}();
set_thread_priority(find_thread(NULL), priority);
#else
pthread_t this_thread = pthread_self();
const auto scheduling_type = SCHED_OTHER;
s32 max_prio = sched_get_priority_max(scheduling_type);
s32 min_prio = sched_get_priority_min(scheduling_type);
u32 level = (std::max)(u32(new_priority) + 1, 4U);
struct sched_param params;
if (max_prio > min_prio) {
params.sched_priority = min_prio + ((max_prio - min_prio) * level) / 4;
} else {
params.sched_priority = min_prio - ((min_prio - max_prio) * level) / 4;
}
pthread_setschedparam(this_thread, scheduling_type, &params);
#endif
}
void SetCurrentThreadName(const char* name) {
#ifdef _MSC_VER
// Sets the debugger-visible name of the current thread.
if (auto pf = (decltype(&SetThreadDescription))(void*)GetProcAddress(GetModuleHandle(TEXT("KernelBase.dll")), "SetThreadDescription"); pf)
pf(GetCurrentThread(), UTF8ToUTF16W(name).data()); // Windows 10+
else
; // No-op
#elif defined(__APPLE__)
pthread_setname_np(name);
#elif defined(__HAIKU__)
rename_thread(find_thread(NULL), name);
#elif defined(__Bitrig__) || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
pthread_set_name_np(pthread_self(), name);
#elif defined(__NetBSD__)
pthread_setname_np(pthread_self(), "%s", (void*)name);
#elif defined(__linux__) || defined(__CYGWIN__) || defined(__sun__) || defined(__glibc__) || defined(__managarm__)
int ret = pthread_setname_np(pthread_self(), name);
if (ret == ERANGE) {
// Linux limits thread names to 15 characters and will outright reject any
// attempt to set a longer name with ERANGE.
char buf[16];
size_t const len = std::min<size_t>(std::strlen(name), sizeof(buf) - 1);
std::memcpy(buf, name, len);
buf[len] = '\0';
pthread_setname_np(pthread_self(), buf);
}
#elif defined(_WIN32)
// MinGW with the POSIX threading model does not support pthread_setname_np
// See for reference
// https://gitlab.freedesktop.org/mesa/mesa/-/blame/main/src/util/u_thread.c?ref_type=heads#L75
(void)name;
#else
pthread_setname_np(pthread_self(), name);
#endif
}
void PinCurrentThreadToPerformanceCore(size_t core_id) {
ASSERT(core_id < 4);
// If we set a flag for a CPU that doesn't exist, the thread may not be allowed to
// run in ANY processor!
auto const total_cores = std::thread::hardware_concurrency();
if (core_id < total_cores) {
#if defined(__ANDROID__)
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(core_id, &set);
sched_setaffinity(pthread_self(), sizeof(set), &set);
#elif defined(__linux__) || defined(__FreeBSD__)
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(core_id, &set);
pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
#elif defined(_WIN32)
DWORD set = 1UL << core_id;
SetThreadAffinityMask(GetCurrentThread(), set);
#else
// No pin functionality implemented
#endif
}
}
#ifdef ARCHITECTURE_x86_64
// On Linux and UNIX systems, a futex would nominally be used to cover the costs
// the idea is that it's intuitivelly cheaper to use a direct instruction as opposed to a full futex call
// the underlying libc++ implementation uses pthread_cond_timedwait which MAY invoke a futex
// Let's pretend the OS is too expensive to jump into, and avoid ANY context switches
// this should *IN THEORY* lower CPU usage while just waiting for stuff effectively
// For windows the minimal quanta resolution is about 500us, and normal CRT cond var is 1.5ms(?)
// so may as well avoid that too
// Let's just give ALL platforms the same mechanisms (almost) for when they have umonitor OR waitpkg
#ifdef __clang__
__attribute__((target("waitpkg,mwaitx")))
#elif defined(__GNUC__)
#pragma GCC target("waitpkg")
#pragma GCC target("mwaitx")
#endif
bool Event::WaitFor(const std::chrono::nanoseconds time) {
#ifdef _WIN32
auto const start = Common::X64::FencedRDTSC();
auto const& caps = Common::g_cpu_caps;
[[maybe_unused]] auto const end = start + Common::g_wall_clock.NsToTicks(time);
if (caps.monitorx) {
while (true) {
// Armed monitor, as per manual, MWAITX must be conditional if the condition isn't satisfied
// to prevent a race condition.
_mm_monitorx(reinterpret_cast<u64*>(std::addressof(is_set)), 0, 0);
if (!is_set.load()) {
// RDTSC may be fenced here due to atomic load
auto const now = _rdtsc();
if (end > now) {
u32 const cycles = std::min<u32>((std::numeric_limits<u32>::max)(), s64(end) - s64(now));
// See here: https://github.com/torvalds/linux/blob/948a64995aca6820abefd17f1a4258f5835c5ad9/arch/x86/lib/delay.c#L93
// MWAITX accepts a 32-bit input timer which determines the total number of cycles to wait for
// NOT THE TOTAL ABSOLUTE TSC VALUE, it's just a delta
// BIT[1] = use a timer
// Hint = 0: Use C1 state when sleepy (means slower wakeup but better savings)
_mm_mwaitx(1 << 1, 0u, cycles);
if (!is_set.load())
return false;
} else
return false; //timeout
}
bool expected = true;
if (is_set.compare_exchange_weak(expected, false, std::memory_order_release))
return true;
}
} else if (caps.waitpkg) {
// #UD If CPUID.7.0:ECX.WAITPKG[bit 5]=0.
while (true) {
_umonitor(std::addressof(is_set));
if (!is_set.load() && !_umwait(0, end)) //umwait is absolute time!!!
return false;
bool expected = true;
if (is_set.compare_exchange_weak(expected, false, std::memory_order_release))
return true;
}
} else {
while (!is_set.load() && end > _rdtsc())
Common::Windows::SleepForOneTick();
if (is_set.load())
Reset();
return true;
}
#else
std::unique_lock lk{mutex};
if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
return false;
is_set = false;
return true;
#endif
}
#else
bool Event::WaitFor(const std::chrono::nanoseconds time) {
#ifdef _WIN32
auto const end = Common::g_wall_clock.GetTimeNS() + time;
while (!is_set.load() && end > Common::g_wall_clock.GetTimeNS())
Common::Windows::SleepForOneTick();
if (is_set.load())
Reset();
return true;
#else
std::unique_lock lk{mutex};
if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
return false;
is_set = false;
return true;
#endif
}
#endif
} // namespace Common