mirror of
https://git.eden-emu.dev/eden-emu/eden
synced 2026-06-04 09:27:05 +02:00
[core/core_timing] better MWAITX and WAITPKG delays (#3984)
This implements the MWAITX and WAITPKG extensions (monitorx/mwaitx, umonitor/umwait) for CPUs that support them. It reduces wait times and bypasses the OS timing facilities, which are slow (notably on Windows). Generally it should answer within 0.2 to 0.5 microseconds (since most requests wait for that long). Also does a general rework of static ctors and related code. Signed-off-by: lizzie <lizzie@eden-emu.dev> Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3984 Reviewed-by: MaranBr <maranbr@eden-emu.dev> Reviewed-by: crueter <crueter@eden-emu.dev>
This commit is contained in:
parent
ff7bbaea7d
commit
7c32cf03a1
19 changed files with 477 additions and 533 deletions
|
|
@ -4,6 +4,8 @@
|
|||
// SPDX-FileCopyrightText: 2014 Citra Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <chrono>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
|
|
@ -18,24 +20,35 @@
|
|||
#elif defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#include "common/string_util.h"
|
||||
#include "common/windows/timer_resolution.h"
|
||||
#else
|
||||
#if defined(__FreeBSD__)
|
||||
#include <sys/cpuset.h>
|
||||
#include <sys/_cpuset.h>
|
||||
#include <pthread_np.h>
|
||||
// Compatibility with CPUset
|
||||
#define cpu_set_t cpuset_t
|
||||
#elif defined(__DragonFly__) || defined(__OpenBSD__) || defined(__Bitrig__)
|
||||
#include <pthread_np.h>
|
||||
#endif
|
||||
#include <pthread.h>
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#ifndef _WIN32
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#ifdef __FreeBSD__
|
||||
# define cpu_set_t cpuset_t
|
||||
#include "common/cpu_features.h"
|
||||
#ifdef ARCHITECTURE_x86_64
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#include "common/x64/rdtsc.h"
|
||||
#endif
|
||||
#include "core/core_timing.h"
|
||||
|
||||
namespace Common {
|
||||
|
||||
|
|
@ -144,4 +157,93 @@ void PinCurrentThreadToPerformanceCore(size_t core_id) {
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef ARCHITECTURE_x86_64
|
||||
// On Linux and UNIX systems, a futex would nominally be used to cover the costs
|
||||
// the idea is that it's intuitively cheaper to use a direct instruction as opposed to a full futex call
|
||||
// the underlying libc++ implementation uses pthread_cond_timedwait which MAY invoke a futex
|
||||
// Let's pretend the OS is too expensive to jump into, and avoid ANY context switches
|
||||
// this should *IN THEORY* lower CPU usage while just waiting for stuff effectively
|
||||
// On Windows the minimum scheduler quantum is about 500us, and the normal CRT cond var takes 1.5ms(?)
|
||||
// so may as well avoid that too
|
||||
// Let's just give ALL platforms the same mechanisms (almost) for when they have umonitor OR waitpkg
|
||||
#ifdef __clang__
|
||||
__attribute__((target("waitpkg,mwaitx")))
|
||||
#elif defined(__GNUC__)
|
||||
#pragma GCC target("waitpkg")
|
||||
#pragma GCC target("mwaitx")
|
||||
#endif
|
||||
bool Event::WaitFor(const std::chrono::nanoseconds time) {
|
||||
#ifdef _WIN32
|
||||
auto const start = Common::X64::FencedRDTSC();
|
||||
auto const& caps = Common::g_cpu_caps;
|
||||
[[maybe_unused]] auto const end = start + Common::g_wall_clock.NsToTicks(time);
|
||||
if (caps.monitorx) {
|
||||
while (true) {
|
||||
// Armed monitor, as per manual, MWAITX must be conditional if the condition isn't satisfied
|
||||
// to prevent a race condition.
|
||||
_mm_monitorx(reinterpret_cast<u64*>(std::addressof(is_set)), 0, 0);
|
||||
if (!is_set.load()) {
|
||||
// RDTSC may be fenced here due to atomic load
|
||||
auto const now = _rdtsc();
|
||||
if (end > now) {
|
||||
u32 const cycles = std::min<u32>((std::numeric_limits<u32>::max)(), s64(end) - s64(now));
|
||||
// See here: https://github.com/torvalds/linux/blob/948a64995aca6820abefd17f1a4258f5835c5ad9/arch/x86/lib/delay.c#L93
|
||||
// MWAITX accepts a 32-bit input timer which determines the total number of cycles to wait for
|
||||
// NOT THE TOTAL ABSOLUTE TSC VALUE, it's just a delta
|
||||
// BIT[1] = use a timer
|
||||
// Hint = 0: Use C1 state when sleepy (means slower wakeup but better savings)
|
||||
_mm_mwaitx(1 << 1, 0u, cycles);
|
||||
if (!is_set.load())
|
||||
return false;
|
||||
} else
|
||||
return false; //timeout
|
||||
}
|
||||
bool expected = true;
|
||||
if (is_set.compare_exchange_weak(expected, false, std::memory_order_release))
|
||||
return true;
|
||||
}
|
||||
} else if (caps.waitpkg) {
|
||||
// #UD If CPUID.7.0:ECX.WAITPKG[bit 5]=0.
|
||||
while (true) {
|
||||
_umonitor(std::addressof(is_set));
|
||||
if (!is_set.load() && !_umwait(0, end)) //umwait is absolute time!!!
|
||||
return false;
|
||||
bool expected = true;
|
||||
if (is_set.compare_exchange_weak(expected, false, std::memory_order_release))
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
while (!is_set.load() && end > _rdtsc())
|
||||
Common::Windows::SleepForOneTick();
|
||||
if (is_set.load())
|
||||
Reset();
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
std::unique_lock lk{mutex};
|
||||
if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
|
||||
return false;
|
||||
is_set = false;
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
bool Event::WaitFor(const std::chrono::nanoseconds time) {
|
||||
#ifdef _WIN32
|
||||
auto const end = Common::g_wall_clock.GetTimeNS() + time;
|
||||
while (!is_set.load() && end > Common::g_wall_clock.GetTimeNS())
|
||||
Common::Windows::SleepForOneTick();
|
||||
if (is_set.load())
|
||||
Reset();
|
||||
return true;
|
||||
#else
|
||||
std::unique_lock lk{mutex};
|
||||
if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
|
||||
return false;
|
||||
is_set = false;
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace Common
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue