[dynarmic, macroHLE] Use faster ankerl for xbyak maps (#3716)

The stock std::unordered_map<> does not perform well enough to warrant its continued use in xbyak's internal structures; switching to ankerl should remove much of the indirection and stdc++-specific overhead of the usually poorly performing std::unordered_map.

Both dynarmic and macroHLE should benefit greatly from the less-stupid ankerl unordered_dense containers.

This should reduce both CPU and shader compilation latency (NOT BY A GREAT MARGIN), just enough to make loading zones in ToTK less horrific.

Signed-off-by: lizzie <lizzie@eden-emu.dev>
Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3716
Reviewed-by: crueter <crueter@eden-emu.dev>
This commit is contained in:
lizzie 2026-05-15 22:07:45 +02:00 committed by crueter
parent 413c7543ba
commit 2f0f8a979c
No known key found for this signature in database
GPG key ID: 425ACD2D4830EBC6
11 changed files with 93 additions and 95 deletions

View file

@ -224,7 +224,7 @@ void A64EmitX64::GenTerminalHandlers() {
terminal_handler_fast_dispatch_hint = code.getCurr<const void*>();
calculate_location_descriptor();
code.L(rsb_cache_miss);
code.mov(r8, reinterpret_cast<u64>(fast_dispatch_table.data()));
code.mov(r8, u64(fast_dispatch_table.data()));
//code.mov(r12, qword[code.ABI_JIT_PTR + offsetof(A64JitState, pc)]);
code.mov(r12, rbx);
if (code.HasHostFeature(HostFeature::SSE42)) {
@ -244,7 +244,7 @@ void A64EmitX64::GenTerminalHandlers() {
code.align();
fast_dispatch_table_lookup = code.getCurr<FastDispatchEntry& (*)(u64)>();
code.mov(code.ABI_PARAM2, reinterpret_cast<u64>(fast_dispatch_table.data()));
code.mov(code.ABI_PARAM2, u64(fast_dispatch_table.data()));
if (code.HasHostFeature(HostFeature::SSE42)) {
code.crc32(code.ABI_PARAM1, code.ABI_PARAM2);
}

View file

@ -26,7 +26,7 @@ struct FrameInfo {
};
static_assert(ABI_SHADOW_SPACE <= 32);
static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms, size_t frame_size) {
static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms, size_t frame_size) noexcept {
// We are initially 8 byte aligned because the return value is pushed onto an aligned stack after a call.
const size_t rsp_alignment = (num_gprs % 2 == 0) ? 8 : 0;
const size_t total_xmm_size = num_xmms * XMM_SIZE;
@ -40,7 +40,7 @@ static FrameInfo CalculateFrameInfo(const size_t num_gprs, const size_t num_xmms
};
}
void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, std::bitset<32> const& regs) {
static void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, std::bitset<32> regs) noexcept {
using namespace Xbyak::util;
const size_t num_gprs = (ABI_ALL_GPRS & regs).count();
@ -65,7 +65,7 @@ void ABI_PushRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size,
}
}
void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, std::bitset<32> const& regs) {
static void ABI_PopRegistersAndAdjustStack(BlockOfCode& code, const size_t frame_size, std::bitset<32> regs) noexcept {
using namespace Xbyak::util;
const size_t num_gprs = (ABI_ALL_GPRS & regs).count();
@ -107,13 +107,13 @@ void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, const std::size
// Windows ABI registers are not in the same allocation algorithm as unix's
void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
std::bitset<32> regs = ABI_ALL_CALLER_SAVE;
auto regs = ABI_ALL_CALLER_SAVE;
regs.reset(size_t(exception));
ABI_PushRegistersAndAdjustStack(code, 0, regs);
}
void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, const HostLoc exception) {
std::bitset<32> regs = ABI_ALL_CALLER_SAVE;
auto regs = ABI_ALL_CALLER_SAVE;
regs.reset(size_t(exception));
ABI_PopRegistersAndAdjustStack(code, 0, regs);
}

View file

@ -6,23 +6,23 @@
* SPDX-License-Identifier: 0BSD
*/
#include "dynarmic/backend/x64/constant_pool.h"
#include <cstring>
#include "common/assert.h"
#include "dynarmic/backend/x64/block_of_code.h"
#include "dynarmic/backend/x64/constant_pool.h"
namespace Dynarmic::Backend::X64 {
ConstantPool::ConstantPool(BlockOfCode& code, size_t size)
: code(code), insertion_point(0) {
: code(code)
, insertion_point(0)
{
code.EnsureMemoryCommitted(align_size + size);
code.int3();
code.align(align_size);
pool = std::span<ConstantT>(
reinterpret_cast<ConstantT*>(code.AllocateFromCodeSpace(size)), size / align_size);
pool = std::span<ConstantT>(reinterpret_cast<ConstantT*>(code.AllocateFromCodeSpace(size)), size / align_size);
}
Xbyak::Address ConstantPool::GetConstant(const Xbyak::AddressFrame& frame, u64 lower, u64 upper) {

View file

@ -8,8 +8,6 @@
#pragma once
#include <bitset>
#include <xbyak/xbyak.h>
#include "common/assert.h"
#include "common/common_types.h"
#include "dynarmic/backend/x64/xbyak.h"

View file

@ -3,13 +3,11 @@
#pragma once
#include <unordered_map>
#include <unordered_set>
// TODO: Defining this crashes e v e r y t h i n g
// #define XBYAK_STD_UNORDERED_SET ankerl::unordered_dense::set
// #define XBYAK_STD_UNORDERED_MAP ankerl::unordered_dense::map
// #define XBYAK_STD_UNORDERED_MULTIMAP boost::unordered_multimap
// You must ensure this matches with src/common/x64/xbyak.h on root dir
#include <ankerl/unordered_dense.h>
#include <boost/unordered_map.hpp>
#define XBYAK_STD_UNORDERED_SET ankerl::unordered_dense::set
#define XBYAK_STD_UNORDERED_MAP ankerl::unordered_dense::map
#define XBYAK_STD_UNORDERED_MULTIMAP boost::unordered_multimap
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>

View file

@ -78,7 +78,7 @@ u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, Rou
}
// Detect Overflow
const int min_exponent_for_overflow = static_cast<int>(ibits) - static_cast<int>(mcl::bit::highest_set_bit(value.mantissa + (round_up ? Safe::LogicalShiftRight<u64>(1, exponent) : 0))) - (unsigned_ ? 0 : 1);
const int min_exponent_for_overflow = int(ibits) - int(mcl::bit::highest_set_bit(value.mantissa + (round_up ? Safe::LogicalShiftRight<u64>(1, exponent) : 0))) - (unsigned_ ? 0 : 1);
if (exponent >= min_exponent_for_overflow) {
// Positive overflow
if (unsigned_ || !sign) {
@ -87,10 +87,10 @@ u64 FPToFixed(size_t ibits, FPT op, size_t fbits, bool unsigned_, FPCR fpcr, Rou
}
// Negative overflow
const u64 min_value = Safe::Negate<u64>(static_cast<u64>(1) << (ibits - 1));
const u64 min_value = Safe::Negate<u64>(u64(1) << (ibits - 1));
if (!(exponent == min_exponent_for_overflow && int_result == min_value)) {
FPProcessException(FPExc::InvalidOp, fpcr, fpsr);
return static_cast<u64>(1) << (ibits - 1);
return u64(1) << (ibits - 1);
}
}

View file

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
* Copyright (c) 2020 MerryMage
* SPDX-License-Identifier: 0BSD
@ -11,7 +14,7 @@
#include <utility>
#include <catch2/catch_test_macros.hpp>
#include <xbyak/xbyak_util.h>
#include "dynarmic/backend/x64/xbyak.h"
TEST_CASE("Host CPU supports", "[a64]") {
using Cpu = Xbyak::util::Cpu;