coalesced Event::WaitFor with waitpkg/mwaitx/umonitor

This commit is contained in:
lizzie 2026-05-23 10:26:28 +00:00
parent bf175450d4
commit 60db19dca9
3 changed files with 70 additions and 26 deletions

View file

@ -11,6 +11,7 @@
#include "common/logging.h"
#include "common/assert.h"
#include "common/thread.h"
#include "common/x64/cpu_detect.h"
#ifdef __APPLE__
#include <mach/mach.h>
#elif defined(__HAIKU__)
@ -33,6 +34,12 @@
#include <unistd.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#ifdef __FreeBSD__
# define cpu_set_t cpuset_t
#endif
@ -144,4 +151,63 @@ void PinCurrentThreadToPerformanceCore(size_t core_id) {
}
}
// On Linux and other UNIX systems a timed wait would nominally end up in a futex.
// The idea is that it is intuitively cheaper to use a direct wait instruction than a full futex call;
// the underlying libc++ implementation uses pthread_cond_timedwait, which MAY invoke a futex.
// Let's pretend the OS is too expensive to jump into, and avoid ANY context switches;
// this should *IN THEORY* lower CPU usage while just waiting for stuff.
// On Windows the minimal quantum resolution is about 500us, and the normal CRT condition variable is about 1.5ms(?),
// so we may as well avoid that too.
// Give ALL platforms (almost) the same mechanism whenever the CPU has MONITORX/MWAITX or WAITPKG (UMONITOR/UMWAIT).
#ifdef __clang__
__attribute__((target("waitpkg,mwaitx")))
#elif defined(__GNUC__)
#pragma GCC target("waitpkg")
#pragma GCC target("mwaitx")
#endif
bool Event::WaitFor(const std::chrono::nanoseconds& time) {
auto const& caps = Common::GetCPUCaps();
auto const ns_ratio = std::max<u64>(1, caps.max_frequency / 1'000);
auto const target_tsc = Common::X64::FencedRDTSC() + time.count() * ns_ratio;
if (caps.monitorx) {
while (true) {
_mm_monitorx(reinterpret_cast<u64*>(std::addressof(is_set)), 0, 0);
if (!IsSet()) {
constexpr auto EnableWaitTimeFlag = 1U << 1;
constexpr auto RequestC1State = 0U;
_mm_mwaitx(EnableWaitTimeFlag, RequestC1State, target_tsc);
if (!is_set.load())
return false;
}
bool expected = true;
if (is_set.compare_exchange_weak(expected, false, std::memory_order_release))
return true;
}
} else if (caps.waitpkg) {
// #UD If CPUID.7.0:ECX.WAITPKG[bit 5]=0.
while (true) {
_umonitor(std::addressof(is_set));
if (!IsSet() && !_umwait(0, target_tsc))
return false;
bool expected = true;
if (is_set.compare_exchange_weak(expected, false, std::memory_order_release))
return true;
}
} else {
#ifdef _WIN32
while (!IsSet() && _rdtsc() < target_tsc)
Common::Windows::SleepForOneTick();
if (event.IsSet())
event.Reset();
return true;
#else
std::unique_lock lk{mutex};
if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
return false;
is_set = false;
return true;
#endif
}
}
} // namespace Common

View file

@ -15,6 +15,7 @@
#include <thread>
#include "common/common_types.h"
#include "common/polyfill_thread.h"
#include "common/x64/rdtsc.h"
namespace Common {
@ -34,15 +35,9 @@ public:
is_set = false;
}
/// Blocks on the condition variable until the event is set or `time` has elapsed.
/// Returns true (and clears the flag) when the event was set, false on timeout.
bool WaitFor(const std::chrono::nanoseconds& time) {
    std::unique_lock guard{mutex};
    const bool signalled = condvar.wait_for(guard, time, [this] { return is_set.load(); });
    if (signalled) {
        // Consume the signal so the next wait blocks again.
        is_set = false;
    }
    return signalled;
}
bool WaitFor(const std::chrono::nanoseconds& time);
template <class Clock, class Duration>
template<class Clock, class Duration>
bool WaitUntil(const std::chrono::time_point<Clock, Duration>& time) {
std::unique_lock lk{mutex};
if (!condvar.wait_until(lk, time, [this] { return is_set.load(); }))
@ -63,9 +58,9 @@ public:
}
private:
alignas(64) std::atomic<bool> is_set{false};
std::condition_variable condvar;
std::mutex mutex;
std::atomic_bool is_set{false};
};
class Barrier {

View file

@ -76,24 +76,7 @@ void CoreTiming::Initialize(std::function<void()>&& on_thread_init_) {
// There are more events left in the queue, wait until the next event.
auto const wait_time = *next_time - GetGlobalTimeNs().count();
if (wait_time > 0) {
#ifdef _WIN32
while (!paused && !event.IsSet() && wait_time > 0) {
wait_time = *next_time - GetGlobalTimeNs().count();
if (wait_time >= timer_resolution_ns) {
Common::Windows::SleepForOneTick();
} else {
#ifdef ARCHITECTURE_x86_64
Common::X64::MicroSleep(caps, wait_time * ns_scale);
#else
std::this_thread::yield();
#endif
}
}
if (event.IsSet())
event.Reset();
#else
event.WaitFor(std::chrono::nanoseconds(wait_time));
#endif
}
} else {
// Queue is empty, wait until another event is scheduled and signals us to