[dynarmic] VEX encode movd/movq

Signed-off-by: lizzie <lizzie@eden-emu.dev>
2026-05-26 19:37:01 +02:00 · 2026-05-24 18:47:11 +00:00 · 2026-05-24 18:47:11 +00:00 · 21bb1b8210
commit 21bb1b8210
parent 0d736d49d6
5 changed files with 95 additions and 54 deletions
--- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
@ -348,7 +348,11 @@ void A32EmitX64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) {
        code.mov(MJitStateReg(reg), args[1].GetImmediateU32());
    } else if (args[1].IsInXmm(ctx.reg_alloc)) {
        const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]);
-        code.movd(MJitStateReg(reg), to_store);
+        if (code.HasHostFeature(HostFeature::AVX)) {
+            code.vmovd(MJitStateReg(reg), to_store);
+        } else {
+            code.movd(MJitStateReg(reg), to_store);
+        }
    } else {
        const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(code, args[1]).cvt32();
        code.mov(MJitStateReg(reg), to_store);
@ -641,7 +645,11 @@ void A32EmitX64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) {

 void A32EmitX64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {
    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-    code.movd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]);
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        code.vmovd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]);
+    } else {
+        code.movd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]);
+    }
    ctx.reg_alloc.DefineValue(code, inst, result);
 }

@ -651,7 +659,11 @@ void A32EmitX64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) {

    if (args[0].IsInXmm(ctx.reg_alloc)) {
        const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[0]);
-        code.movd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
+        if (code.HasHostFeature(HostFeature::AVX)) {
+            code.vmovd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
+        } else {
+            code.movd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
+        }
    } else {
        const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(code, args[0]).cvt32();
        code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store);
--- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
@ -339,7 +339,11 @@ void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) {
    const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-    code.movd(result, addr);
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        code.vmovd(result, addr);
+    } else {
+        code.movd(result, addr);
+    }
    ctx.reg_alloc.DefineValue(code, inst, result);
 }

@ -348,7 +352,11 @@ void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) {
    const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-    code.movq(result, addr);
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        code.vmovq(result, addr);
+    } else {
+        code.movq(result, addr);
+    }
    ctx.reg_alloc.DefineValue(code, inst, result);
 }

@ -357,7 +365,11 @@ void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) {
    const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast<size_t>(vec)];

    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-    code.movaps(result, addr);
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        code.vmovaps(result, addr);
+    } else {
+        code.movaps(result, addr);
+    }
    ctx.reg_alloc.DefineValue(code, inst, result);
 }

--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@ -226,13 +226,22 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X
    // op1 == Inf  && op2 == QNaN
    // op1 == QNaN && op2 == SNaN <<< The problematic case
    // op1 == QNaN && op2 == Inf
-
-    if constexpr (fsize == 32) {
-        code.movd(tmp.cvt32(), op2);
-        code.shl(tmp.cvt32(), 32 - mantissa_msb_bit);
+    if (code.HasHostFeature(HostFeature::AVX)) {  // VEX-encoded moves are only emitted when the host reports AVX
+        if constexpr (fsize == 32) {
+            code.vmovd(tmp.cvt32(), op2);
+            code.shl(tmp.cvt32(), 32 - mantissa_msb_bit);
+        } else {
+            code.vmovq(tmp, op2);
+            code.shl(tmp, 64 - mantissa_msb_bit);
+        }
    } else {
-        code.movq(tmp, op2);
-        code.shl(tmp, 64 - mantissa_msb_bit);
+        if constexpr (fsize == 32) {  // legacy SSE encodings for hosts without AVX
+            code.movd(tmp.cvt32(), op2);
+            code.shl(tmp.cvt32(), 32 - mantissa_msb_bit);
+        } else {
+            code.movq(tmp, op2);
+            code.shl(tmp, 64 - mantissa_msb_bit);
+        }
    }
    // If op2 is a SNaN, CF = 0 and ZF = 0.
    code.jna(end, code.T_NEAR);
@ -477,10 +486,18 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::
        tmp.setBit(fsize);

        const auto move_to_tmp = [=, &code](const Xbyak::Xmm& xmm) {
-            if constexpr (fsize == 32) {
-                code.movd(tmp.cvt32(), xmm);
+            if (code.HasHostFeature(HostFeature::AVX)) {
+                if constexpr (fsize == 32) {
+                    code.vmovd(tmp.cvt32(), xmm);
+                } else {
+                    code.vmovq(tmp.cvt64(), xmm);
+                }
            } else {
-                code.movq(tmp.cvt64(), xmm);
+                if constexpr (fsize == 32) {
+                    code.movd(tmp.cvt32(), xmm);
+                } else {
+                    code.movq(tmp.cvt64(), xmm);
+                }
            }
        };

@ -1156,7 +1173,11 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i

            code.L(*bad_values);
            if constexpr (fsize == 32) {
-                code.movd(tmp, operand);
+                if (code.HasHostFeature(HostFeature::AVX)) {
+                    code.vmovd(tmp, operand);
+                } else {
+                    code.movd(tmp, operand);
+                }

                if (!ctx.FPCR().FZ()) {
                    if (ctx.FPCR().DN()) {
@ -1186,7 +1207,12 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
                }

                code.L(default_nan);
-                code.movd(result, code.Const(xword, 0x7FC00000));
+
+                if (code.HasHostFeature(HostFeature::AVX)) {
+                    code.vmovd(result, code.Const(xword, 0x7FC00000));
+                } else {
+                    code.movd(result, code.Const(xword, 0x7FC00000));
+                }
                code.jmp(*end, code.T_NEAR);
            } else {
                Xbyak::Label nan, zero;
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@ -227,7 +227,11 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) {
    } else {
        auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
        code.pshufd(source, source, index);
-        code.movd(dest, source);
+        if (code.HasHostFeature(HostFeature::AVX)) {
+            code.vmovd(dest, source);
+        } else {
+            code.movd(dest, source);
+        }
    }

    ctx.reg_alloc.DefineValue(code, inst, dest);
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@ -11,6 +11,7 @@
 #include <tuple>
 #include <type_traits>
 #include <utility>
+#include <smmintrin.h>

 #include "common/assert.h"
 #include "dynarmic/mcl/function_info.hpp"
@ -1652,25 +1653,23 @@ static void EmitFPVectorRoundIntThunk(VectorArray<FPT>& output, const VectorArra

 template<size_t fsize>
 void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
+    //auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const auto rounding = FP::RoundingMode(inst->GetArg(1).GetU8());
    const bool exact = inst->GetArg(2).GetU1();
-
    if constexpr (fsize != 16) {
        if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
-            const u8 round_imm = [&]() -> u8 {
+            const u8 round_imm = [rounding]() -> u8 {
                switch (rounding) {
-                case FP::RoundingMode::ToNearest_TieEven: return 0b00;
-                case FP::RoundingMode::TowardsPlusInfinity: return 0b10;
-                case FP::RoundingMode::TowardsMinusInfinity: return 0b01;
-                case FP::RoundingMode::TowardsZero: return 0b11;
+                case FP::RoundingMode::ToNearest_TieEven: return _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+                case FP::RoundingMode::TowardsPlusInfinity: return _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+                case FP::RoundingMode::TowardsMinusInfinity: return _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+                case FP::RoundingMode::TowardsZero: return _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
                default: UNREACHABLE();
                }
            }();
-
-            EmitTwoOpVectorOperation<fsize, DefaultIndexer, 3>(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_a) {
+            EmitTwoOpVectorOperation<fsize, DefaultIndexer, 3>(code, ctx, inst, [&code, round_imm](const Xbyak::Xmm result, const Xbyak::Xmm xmm_a) {
                FCODE(roundp)(result, xmm_a, round_imm);
            });
-
            return;
        }
    }
@ -1678,33 +1677,21 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    // Do not make a LUT out of this, let the compiler do it's thing
    using FPT = mcl::unsigned_integer_of_size<fsize>;
    switch (rounding) {
-    case FP::RoundingMode::ToNearest_TieEven:
-        exact
-            ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieEven, true>)
-            : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieEven, false>);
-        break;
-    case FP::RoundingMode::TowardsPlusInfinity:
-        exact
-            ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsPlusInfinity, true>)
-            : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsPlusInfinity, false>);
-        break;
-    case FP::RoundingMode::TowardsMinusInfinity:
-        exact
-            ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsMinusInfinity, true>)
-            : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsMinusInfinity, false>);
-        break;
-    case FP::RoundingMode::TowardsZero:
-        exact
-            ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsZero, true>)
-            : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsZero, false>);
-        break;
-    case FP::RoundingMode::ToNearest_TieAwayFromZero:
-        exact
-            ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieAwayFromZero, true>)
-            : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieAwayFromZero, false>);
-        break;
-    default:
-        UNREACHABLE();
+#define ROUND_LIST \
+    ROUND_ELEM(ToNearest_TieEven) \
+    ROUND_ELEM(TowardsPlusInfinity) \
+    ROUND_ELEM(TowardsMinusInfinity) \
+    ROUND_ELEM(TowardsZero) \
+    ROUND_ELEM(ToNearest_TieAwayFromZero)
+#define ROUND_ELEM(name) \
+    case FP::RoundingMode::name: \
+        return exact \
+            ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::name, true>) \
+            : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::name, false>);
+ROUND_LIST
+#undef ROUND_ELEM
+#undef ROUND_LIST
+    default: UNREACHABLE();
    }
 }