diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp index 1c21886c60..316dbc3490 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -348,7 +348,11 @@ void A32EmitX64::EmitA32SetRegister(A32EmitContext& ctx, IR::Inst* inst) { code.mov(MJitStateReg(reg), args[1].GetImmediateU32()); } else if (args[1].IsInXmm(ctx.reg_alloc)) { const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[1]); - code.movd(MJitStateReg(reg), to_store); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovd(MJitStateReg(reg), to_store); + } else { + code.movd(MJitStateReg(reg), to_store); + } } else { const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(code, args[1]).cvt32(); code.mov(MJitStateReg(reg), to_store); @@ -641,7 +645,11 @@ void A32EmitX64::EmitA32OrQFlag(A32EmitContext& ctx, IR::Inst* inst) { void A32EmitX64::EmitA32GetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - code.movd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]); + } else { + code.movd(result, dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)]); + } ctx.reg_alloc.DefineValue(code, inst, result); } @@ -651,7 +659,11 @@ void A32EmitX64::EmitA32SetGEFlags(A32EmitContext& ctx, IR::Inst* inst) { if (args[0].IsInXmm(ctx.reg_alloc)) { const Xbyak::Xmm to_store = ctx.reg_alloc.UseXmm(code, args[0]); - code.movd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store); + } else { + code.movd(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store); + } } else { const Xbyak::Reg32 to_store = ctx.reg_alloc.UseGpr(code, 
args[0]).cvt32(); code.mov(dword[code.ABI_JIT_PTR + offsetof(A32JitState, cpsr_ge)], to_store); diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp index 53381aa1df..776d7c9fb1 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -339,7 +339,11 @@ void A64EmitX64::EmitA64GetS(A64EmitContext& ctx, IR::Inst* inst) { const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - code.movd(result, addr); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovd(result, addr); + } else { + code.movd(result, addr); + } ctx.reg_alloc.DefineValue(code, inst, result); } @@ -348,7 +352,11 @@ void A64EmitX64::EmitA64GetD(A64EmitContext& ctx, IR::Inst* inst) { const auto addr = qword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - code.movq(result, addr); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovq(result, addr); + } else { + code.movq(result, addr); + } ctx.reg_alloc.DefineValue(code, inst, result); } @@ -357,7 +365,11 @@ void A64EmitX64::EmitA64GetQ(A64EmitContext& ctx, IR::Inst* inst) { const auto addr = xword[code.ABI_JIT_PTR + offsetof(A64JitState, vec) + sizeof(u64) * 2 * static_cast(vec)]; const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - code.movaps(result, addr); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovaps(result, addr); + } else { + code.movaps(result, addr); + } ctx.reg_alloc.DefineValue(code, inst, result); } diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp index 827600c7c2..4c516e49cb 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp +++ 
b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -226,13 +226,22 @@ void EmitPostProcessNaNs(BlockOfCode& code, Xbyak::Xmm result, Xbyak::Xmm op1, X // op1 == Inf && op2 == QNaN // op1 == QNaN && op2 == SNaN <<< The problematic case // op1 == QNaN && op2 == Inf - - if constexpr (fsize == 32) { - code.movd(tmp.cvt32(), op2); - code.shl(tmp.cvt32(), 32 - mantissa_msb_bit); + if (code.HasHostFeature(HostFeature::AVX)) { // VEX-encoded vmovd/vmovq may only be emitted when the host reports AVX; the else branch keeps the legacy SSE2 encodings + if constexpr (fsize == 32) { + code.vmovd(tmp.cvt32(), op2); + code.shl(tmp.cvt32(), 32 - mantissa_msb_bit); + } else { + code.vmovq(tmp, op2); + code.shl(tmp, 64 - mantissa_msb_bit); + } } else { - code.movq(tmp, op2); - code.shl(tmp, 64 - mantissa_msb_bit); + if constexpr (fsize == 32) { + code.movd(tmp.cvt32(), op2); + code.shl(tmp.cvt32(), 32 - mantissa_msb_bit); + } else { + code.movq(tmp, op2); + code.shl(tmp, 64 - mantissa_msb_bit); + } } // If op2 is a SNaN, CF = 0 and ZF = 0. code.jna(end, code.T_NEAR); @@ -477,10 +486,18 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR:: tmp.setBit(fsize); const auto move_to_tmp = [=, &code](const Xbyak::Xmm& xmm) { - if constexpr (fsize == 32) { - code.movd(tmp.cvt32(), xmm); + if (code.HasHostFeature(HostFeature::AVX)) { + if constexpr (fsize == 32) { + code.vmovd(tmp.cvt32(), xmm); + } else { + code.vmovq(tmp.cvt64(), xmm); + } } else { - code.movq(tmp.cvt64(), xmm); + if constexpr (fsize == 32) { + code.movd(tmp.cvt32(), xmm); + } else { + code.movq(tmp.cvt64(), xmm); + } } }; @@ -1156,7 +1173,11 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i code.L(*bad_values); if constexpr (fsize == 32) { - code.movd(tmp, operand); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovd(tmp, operand); + } else { + code.movd(tmp, operand); + } if (!ctx.FPCR().FZ()) { if (ctx.FPCR().DN()) { @@ -1186,7 +1207,12 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i } code.L(default_nan); -
code.movd(result, code.Const(xword, 0x7FC00000)); + + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovd(result, code.Const(xword, 0x7FC00000)); + } else { + code.movd(result, code.Const(xword, 0x7FC00000)); + } code.jmp(*end, code.T_NEAR); } else { Xbyak::Label nan, zero; diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp index b5ec6ec7cf..4ae74a164e 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -227,7 +227,11 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) { } else { auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pshufd(source, source, index); - code.movd(dest, source); + if (code.HasHostFeature(HostFeature::AVX)) { + code.vmovd(dest, source); + } else { + code.movd(dest, source); + } } ctx.reg_alloc.DefineValue(code, inst, dest); diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index 926653a920..c9742f132e 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "common/assert.h" #include "dynarmic/mcl/function_info.hpp" @@ -1652,25 +1653,23 @@ static void EmitFPVectorRoundIntThunk(VectorArray& output, const VectorArra template void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { + //auto args = ctx.reg_alloc.GetArgumentInfo(inst); const auto rounding = FP::RoundingMode(inst->GetArg(1).GetU8()); const bool exact = inst->GetArg(2).GetU1(); - if constexpr (fsize != 16) { if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) { - const u8 round_imm = [&]() -> u8 { + const u8 
round_imm = [rounding]() -> u8 { switch (rounding) { - case FP::RoundingMode::ToNearest_TieEven: return 0b00; - case FP::RoundingMode::TowardsPlusInfinity: return 0b10; - case FP::RoundingMode::TowardsMinusInfinity: return 0b01; - case FP::RoundingMode::TowardsZero: return 0b11; + case FP::RoundingMode::ToNearest_TieEven: return _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + case FP::RoundingMode::TowardsPlusInfinity: return _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC; + case FP::RoundingMode::TowardsMinusInfinity: return _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC; + case FP::RoundingMode::TowardsZero: return _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC; default: UNREACHABLE(); } }(); - - EmitTwoOpVectorOperation(code, ctx, inst, [&](const Xbyak::Xmm& result, const Xbyak::Xmm& xmm_a) { + EmitTwoOpVectorOperation(code, ctx, inst, [&code, round_imm](const Xbyak::Xmm result, const Xbyak::Xmm xmm_a) { FCODE(roundp)(result, xmm_a, round_imm); }); - return; } } @@ -1678,33 +1677,21 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { // Do not make a LUT out of this, let the compiler do it's thing using FPT = mcl::unsigned_integer_of_size; switch (rounding) { - case FP::RoundingMode::ToNearest_TieEven: - exact - ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) - : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); - break; - case FP::RoundingMode::TowardsPlusInfinity: - exact - ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) - : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); - break; - case FP::RoundingMode::TowardsMinusInfinity: - exact - ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) - : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); - break; - case FP::RoundingMode::TowardsZero: - exact - ? 
EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) - : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); - break; - case FP::RoundingMode::ToNearest_TieAwayFromZero: - exact - ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) - : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); - break; - default: - UNREACHABLE(); +#define ROUND_LIST \ + ROUND_ELEM(ToNearest_TieEven) \ + ROUND_ELEM(TowardsPlusInfinity) \ + ROUND_ELEM(TowardsMinusInfinity) \ + ROUND_ELEM(TowardsZero) \ + ROUND_ELEM(ToNearest_TieAwayFromZero) +#define ROUND_ELEM(name) \ + case FP::RoundingMode::name: \ + return exact \ + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) \ + : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); +ROUND_LIST +#undef ROUND_ELEM +#undef ROUND_LIST + default: UNREACHABLE(); } }