diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index f4bdb3f7c0..81685315f8 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -11,6 +11,7 @@
 #include "common/logging.h"
 #include "common/assert.h"
 #include "common/thread.h"
+#include "common/x64/cpu_detect.h"
 #ifdef __APPLE__
 #include
 #elif defined(__HAIKU__)
@@ -33,6 +34,12 @@
 #include
 #endif
 
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
 #ifdef __FreeBSD__
 # define cpu_set_t cpuset_t
 #endif
@@ -144,4 +151,81 @@ void PinCurrentThreadToPerformanceCore(size_t core_id) {
     }
 }
 
+// Waits until the event is signalled or `time` has elapsed, consuming the signal.
+//
+// Rationale: the condition-variable path may enter the kernel (libc++ uses
+// pthread_cond_timedwait, which MAY invoke a futex; on Windows the minimal quantum
+// is about 500us and the CRT condition variable is reportedly ~1.5ms). The idea is
+// that it is intuitively cheaper to park on the flag with a user-mode monitor/wait
+// instruction and avoid ANY context switch, which should *in theory* lower CPU usage
+// while waiting. All platforms get (almost) the same mechanism whenever the CPU
+// reports MONITORX or WAITPKG; otherwise the OS primitives are used.
+//
+// @param time Maximum duration to wait; non-positive values only poll the flag.
+// @return true if a signal was consumed (flag cleared), false on timeout.
+#if defined(__clang__) || defined(__GNUC__)
+// Enable the extensions for this function only instead of for the rest of the file.
+__attribute__((target("waitpkg,mwaitx")))
+#endif
+bool Event::WaitFor(const std::chrono::nanoseconds& time) {
+    auto const& caps = Common::GetCPUCaps();
+    // max_frequency / 1000 serves as TSC ticks per nanosecond, so the value is presumably
+    // in MHz (TODO confirm). Multiply before dividing so the fractional GHz part is not
+    // truncated; keep the floor of one tick per nanosecond for unknown/low frequencies.
+    auto const ticks_per_us = std::max<unsigned long long>(1'000, caps.max_frequency);
+    auto const wait_ns = static_cast<unsigned long long>(std::max<long long>(0, time.count()));
+    auto const target_tsc = Common::X64::FencedRDTSC() + wait_ns * ticks_per_us / 1'000;
+    // Consumes a pending signal. The plain load avoids writing to the monitored cache
+    // line while nothing is pending; acquire makes the signaller's prior writes visible.
+    auto const try_consume = [this] {
+        bool expected = true;
+        return is_set.load(std::memory_order_acquire) &&
+               is_set.compare_exchange_strong(expected, false, std::memory_order_acq_rel);
+    };
+    if (caps.monitorx) {
+        constexpr auto EnableWaitTimeFlag = 1U << 1;
+        constexpr auto RequestC1State = 0U;
+        while (true) {
+            // Arm the monitor before testing the flag so a store landing in between still
+            // ends the wait.
+            _mm_monitorx(std::addressof(is_set), 0, 0);
+            if (try_consume())
+                return true;
+            auto const now = Common::X64::FencedRDTSC();
+            if (now >= target_tsc)
+                return false;
+            // MWAITX takes a 32-bit *relative* tick count, not an absolute TSC value, and
+            // may wake early, so clamp the remainder and loop until the deadline passes.
+            auto const remaining = std::min<unsigned long long>(target_tsc - now, 0xFFFF'FFFFULL);
+            _mm_mwaitx(EnableWaitTimeFlag, RequestC1State, static_cast<unsigned>(remaining));
+        }
+    } else if (caps.waitpkg) {
+        // #UD If CPUID.7.0:ECX.WAITPKG[bit 5]=0.
+        while (true) {
+            _umonitor(std::addressof(is_set));
+            if (try_consume())
+                return true;
+            if (Common::X64::FencedRDTSC() >= target_tsc)
+                return false;
+            // UMWAIT takes the absolute TSC deadline. Its result only reports whether the
+            // OS-imposed time limit expired, so re-check the flag and the clock instead.
+            static_cast<void>(_umwait(0, target_tsc));
+        }
+    } else {
+#ifdef _WIN32
+        // Poll at timer-tick granularity; report a timeout instead of always returning true.
+        auto const deadline = std::chrono::steady_clock::now() + time;
+        while (!IsSet() && std::chrono::steady_clock::now() < deadline)
+            Common::Windows::SleepForOneTick();
+        return try_consume();
+#else
+        std::unique_lock lk{mutex};
+        if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
+            return false;
+        is_set = false;
+        return true;
+#endif
+    }
+}
+
 } // namespace Common
diff --git a/src/common/thread.h b/src/common/thread.h
index ea6f5d6b3b..93688bb455 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -15,6 +15,7 @@
 #include
 #include "common/common_types.h"
 #include "common/polyfill_thread.h"
+#include "common/x64/rdtsc.h"
 
 namespace Common {
 
@@ -34,15 +35,9 @@ public:
         is_set = false;
     }
 
-    bool WaitFor(const std::chrono::nanoseconds& time) {
-        std::unique_lock lk{mutex};
-        if (!condvar.wait_for(lk, time, [this] { return is_set.load(); }))
-            return false;
-        is_set = false;
-        return true;
-    }
+    bool WaitFor(const std::chrono::nanoseconds& time);
 
-    template
+    template
     bool WaitUntil(const std::chrono::time_point& time) {
         std::unique_lock lk{mutex};
         if (!condvar.wait_until(lk, time, [this] { return is_set.load(); }))
@@ -63,9 +58,9 @@ public:
     }
 
 private:
+    alignas(64) std::atomic is_set{false};
     std::condition_variable condvar;
     std::mutex mutex;
-    std::atomic_bool is_set{false};
 };
 
 class Barrier {
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index c8f845fd04..5c2237ff6f 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -76,24 +76,7 @@ void CoreTiming::Initialize(std::function&& on_thread_init_) {
                 // There are more events left in the queue, wait until the next event.
auto const wait_time = *next_time - GetGlobalTimeNs().count(); if (wait_time > 0) { -#ifdef _WIN32 - while (!paused && !event.IsSet() && wait_time > 0) { - wait_time = *next_time - GetGlobalTimeNs().count(); - if (wait_time >= timer_resolution_ns) { - Common::Windows::SleepForOneTick(); - } else { -#ifdef ARCHITECTURE_x86_64 - Common::X64::MicroSleep(caps, wait_time * ns_scale); -#else - std::this_thread::yield(); -#endif - } - } - if (event.IsSet()) - event.Reset(); -#else event.WaitFor(std::chrono::nanoseconds(wait_time)); -#endif } } else { // Queue is empty, wait until another event is scheduled and signals us to