diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp index fe7fb5983f..7e4bf456d2 100644 --- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp @@ -226,7 +226,7 @@ void ArmDynarmic64::MakeJit(Common::PageTable* page_table, std::size_t address_s config.only_detect_misalignment_via_page_table_on_page_boundary = true; config.fastmem_pointer = page_table->fastmem_arena ? - std::optional{reinterpret_cast(page_table->fastmem_arena)} : + std::optional{uintptr_t(page_table->fastmem_arena)} : std::nullopt; config.fastmem_address_space_bits = std::uint32_t(address_space_bits); config.silently_mirror_fastmem = false; diff --git a/src/core/hle/service/jit/jit_context.cpp b/src/core/hle/service/jit/jit_context.cpp index 522d849e6f..447d36a0d0 100644 --- a/src/core/hle/service/jit/jit_context.cpp +++ b/src/core/hle/service/jit/jit_context.cpp @@ -48,6 +48,8 @@ public: mapped_ranges{mapped_ranges_}, parent{parent_} {} std::optional MemoryReadCode(VAddr vaddr) override { + if (!memory.IsValidVirtualAddressRange(vaddr, sizeof(u32))) + return std::nullopt; static_assert(Core::Memory::YUZU_PAGESIZE == Dynarmic::CODE_PAGE_SIZE); auto const aligned_vaddr = vaddr & ~Core::Memory::YUZU_PAGEMASK; if (last_code_addr != aligned_vaddr) { diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp index 80f0f9cc2f..dd9e9e4a66 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -59,8 +59,10 @@ static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) { UNREACHABLE(); } -A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block) - : EmitContext(reg_alloc, block), conf(conf) {} +A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels) + : EmitContext(reg_alloc, block, shared_labels) + , conf(conf) +{} A32::LocationDescriptor A32EmitContext::Location() const { return A32::LocationDescriptor{block.Location()}; @@ -109,35 +111,59 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { gprs.reset(size_t(HostLoc::R14)); return gprs; }(), any_xmm); - A32EmitContext ctx{conf, reg_alloc, block}; + + A32EmitContext ctx{conf, reg_alloc, block, shared_labels}; // Start emitting. code.align(); const u8* const entrypoint = code.getCurr(); + code.mov(code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)], rbp); + code.lea(rbp, code.ptr[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer) - 8]); EmitCondPrelude(ctx); - - for (auto iter = block.instructions.begin(); iter != block.instructions.end(); ++iter) [[likely]] { - auto* inst = &*iter; - // Call the relevant Emit* member function. - switch (inst->GetOpcode()) { -#define OPCODE(name, type, ...) \ - case IR::Opcode::name: \ - A32EmitX64::Emit##name(ctx, inst); \ - break; -#define A32OPC(name, type, ...) \ - case IR::Opcode::A32##name: \ - A32EmitX64::EmitA32##name(ctx, inst);\ - break; + typedef void (EmitX64::*EmitHandlerFn)(EmitContext& context, IR::Inst* inst); + constexpr EmitHandlerFn opcode_handlers[] = { +#define OPCODE(name, type, ...) &EmitX64::Emit##name, +#define A32OPC(name, type, ...) +#define A64OPC(name, type, ...) +#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + }; + typedef void (A32EmitX64::*A32EmitHandlerFn)(A32EmitContext& context, IR::Inst* inst); + constexpr A32EmitHandlerFn a32_handlers[] = { +#define OPCODE(...) +#define A32OPC(name, type, ...) &A32EmitX64::EmitA32##name, #define A64OPC(...) #include "dynarmic/ir/opcodes.inc" #undef OPCODE #undef A32OPC +#undef A64OPC + }; + + for (auto& inst : block.instructions) { + auto const opcode = inst.GetOpcode(); + // Call the relevant Emit* member function. + switch (opcode) { +#define OPCODE(name, type, ...) case IR::Opcode::name: goto opcode_branch; +#define A32OPC(name, type, ...) case IR::Opcode::A32##name: goto a32_branch; +#define A64OPC(name, type, ...) +#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC #undef A64OPC default: UNREACHABLE(); } - reg_alloc.EndOfAllocScope(); +opcode_branch: + (this->*opcode_handlers[size_t(opcode)])(ctx, &inst); + goto finish_this_inst; +a32_branch: + // Update with FIRST A32 instruction + (this->*a32_handlers[size_t(opcode) - size_t(IR::Opcode::A32SetCheckBit)])(ctx, &inst); +finish_this_inst: + ctx.reg_alloc.EndOfAllocScope(); #ifndef NDEBUG if (conf.very_verbose_debugging_output) EmitVerboseDebuggingOutput(reg_alloc); @@ -146,15 +172,14 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { reg_alloc.AssertNoMoreUses(); - if (conf.enable_cycle_counting) { + if (conf.enable_cycle_counting) EmitAddCycles(block.CycleCount()); - } + code.mov(rbp, code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)]); EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); code.int3(); - for (auto& deferred_emit : ctx.deferred_emits) { + for (auto& deferred_emit : ctx.deferred_emits) deferred_emit(); - } code.int3(); const size_t size = size_t(code.getCurr() - entrypoint); @@ -167,6 +192,7 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) { auto const bdesc = RegisterBlock(descriptor, entrypoint, size); code.DisableWriting(); + shared_labels.clear(); return bdesc; } diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h index 5ec78ff50e..8e97dc7737 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h +++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later /* This file is part of the dynarmic project. @@ -29,7 +29,7 @@ namespace Dynarmic::Backend::X64 { class RegAlloc; struct A32EmitContext final : public EmitContext { - A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block); + A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels); A32::LocationDescriptor Location() const; A32::LocationDescriptor EndLocation() const; @@ -130,6 +130,7 @@ public: ankerl::unordered_dense::map, void (*)()> write_fallbacks; ankerl::unordered_dense::map, void (*)()> exclusive_write_fallbacks; ankerl::unordered_dense::set do_not_fastmem; + boost::container::stable_vector shared_labels; void (*memory_read_128)() = nullptr; // Dummy void (*memory_write_128)() = nullptr; // Dummy const void* terminal_handler_pop_rsb_hint; diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp index 832cfdcce2..8edeb29aed 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -37,8 +37,10 @@ namespace Dynarmic::Backend::X64 { using namespace Xbyak::util; -A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block) - : EmitContext(reg_alloc, block), conf(conf) {} +A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels) + : EmitContext(reg_alloc, block, shared_labels) + , conf(conf) +{} A64::LocationDescriptor A64EmitContext::Location() const { return A64::LocationDescriptor{block.Location()}; @@ -83,11 +85,14 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept { gprs.reset(size_t(HostLoc::R14)); return gprs; }(), any_xmm}; - A64EmitContext ctx{conf, reg_alloc, block}; + + A64EmitContext ctx{conf, reg_alloc, block, shared_labels}; // Start emitting. code.align(); const auto* const entrypoint = code.getCurr(); + code.mov(code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)], rbp); + code.lea(rbp, code.ptr[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer) - 8]); DEBUG_ASSERT(block.GetCondition() == IR::Cond::AL); typedef void (EmitX64::*EmitHandlerFn)(EmitContext& context, IR::Inst* inst); @@ -139,16 +144,13 @@ finish_this_inst: } reg_alloc.AssertNoMoreUses(); - - if (conf.enable_cycle_counting) { + if (conf.enable_cycle_counting) EmitAddCycles(block.CycleCount()); - } + code.mov(rbp, code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)]); EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep()); code.int3(); - - for (auto& deferred_emit : ctx.deferred_emits) { + for (auto& deferred_emit : ctx.deferred_emits) deferred_emit(); - } code.int3(); const size_t size = size_t(code.getCurr() - entrypoint); @@ -161,6 +163,7 @@ finish_this_inst: auto bdesc = RegisterBlock(descriptor, entrypoint, size); code.DisableWriting(); + shared_labels.clear(); return bdesc; } diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h index dd556e36ce..d57b1d81b9 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h +++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later /* This file is part of the dynarmic project. @@ -27,7 +27,7 @@ namespace Dynarmic::Backend::X64 { struct A64EmitContext final : public EmitContext { - A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block); + A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels); A64::LocationDescriptor Location() const; bool IsSingleStep() const; @@ -126,6 +126,7 @@ public: ankerl::unordered_dense::map, void (*)()> write_fallbacks; ankerl::unordered_dense::map, void (*)()> exclusive_write_fallbacks; ankerl::unordered_dense::set do_not_fastmem; + boost::container::stable_vector shared_labels; const void* terminal_handler_pop_rsb_hint = nullptr; const void* terminal_handler_fast_dispatch_hint = nullptr; FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr; diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp index 4e515fef2f..4ed198e09f 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp @@ -32,8 +32,11 @@ namespace Dynarmic::Backend::X64 { using namespace Xbyak::util; -EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block) - : reg_alloc(reg_alloc), block(block) {} +EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels) + : reg_alloc(reg_alloc) + , block(block) + , shared_labels(shared_labels) +{} EmitContext::~EmitContext() = default; diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h index 301f4ffc89..619945e19a 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h @@ -16,11 +16,12 @@ #include #include -#include "dynarmic/mcl/bit.hpp" #include -#include "dynarmic/backend/x64/xbyak.h" +#include #include +#include "dynarmic/backend/x64/xbyak.h" +#include "dynarmic/mcl/bit.hpp" #include "dynarmic/backend/exception_handler.h" #include "dynarmic/backend/x64/reg_alloc.h" #include "dynarmic/common/fp/fpcr.h" @@ -52,24 +53,23 @@ using VectorArray = std::array> template using HalfVectorArray = std::array / 2>; +using SharedLabel = Xbyak::Label*; struct EmitContext { - EmitContext(RegAlloc& reg_alloc, IR::Block& block); + EmitContext(RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels); virtual ~EmitContext(); virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0; virtual bool HasOptimization(OptimizationFlag flag) const = 0; - RegAlloc& reg_alloc; - IR::Block& block; + [[nodiscard]] inline Xbyak::Label* GenSharedLabel() noexcept { + return &shared_labels.emplace_back(); + } std::vector> deferred_emits; + RegAlloc& reg_alloc; + IR::Block& block; + boost::container::stable_vector& shared_labels; }; -using SharedLabel = std::shared_ptr; - -inline SharedLabel GenSharedLabel() { - return std::make_shared(); -} - class EmitX64 { public: struct BlockDescriptor { diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp index d073991fbe..6a3ab005f3 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -136,7 +136,7 @@ void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) { template SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) { - SharedLabel nan = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel nan = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); FCODE(ucomis)(a, a); code.jp(*nan, code.T_NEAR); @@ -251,7 +251,7 @@ template void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - SharedLabel end = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(); Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); @@ -304,7 +304,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code); - SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel(); code.movaps(result, op1); if constexpr (std::is_member_function_pointer_v) { @@ -413,7 +413,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo DenormalsAreZero(code, ctx, {result, operand}); - SharedLabel equal = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel equal = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); FCODE(ucomis)(result, operand); code.jz(*equal, code.T_NEAR); @@ -484,7 +484,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR:: } }; - SharedLabel end = GenSharedLabel(), z = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), z = ctx.GenSharedLabel(); FCODE(ucomis)(op1, op2); code.jz(*z, code.T_NEAR); @@ -632,7 +632,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo } if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { - SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel fallback = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); @@ -843,7 +843,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Reg64 tmp = do_default_nan ? INVALID_REG : ctx.reg_alloc.ScratchGpr(code); - SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel(); if (code.HasHostFeature(HostFeature::AVX)) { FCODE(vmuls)(result, op1, op2); @@ -981,7 +981,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (code.HasHostFeature(HostFeature::FMA)) { - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); @@ -1129,7 +1129,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code); [[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32(); - SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); code.movaps(value, operand); @@ -1296,7 +1296,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* } if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) { - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); @@ -1641,7 +1641,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(code); if (!unsigned_) { - SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel saturate_max = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); ZeroIfNaN<64>(code, src, scratch); diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc index 54fc595214..4fa14d504b 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc @@ -86,7 +86,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; - SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); if (fastmem_marker) { // Use fastmem @@ -108,7 +108,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { conf.recompile_on_fastmem_failure, }); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } else { @@ -120,7 +120,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) { ctx.deferred_emits.emplace_back([=, this, &ctx] { code.L(*abort); code.call(wrapped_fn); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } @@ -173,7 +173,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)]; - SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); if (fastmem_marker) { // Use fastmem @@ -195,7 +195,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { conf.recompile_on_fastmem_failure, }); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } else { @@ -207,7 +207,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) { ctx.deferred_emits.emplace_back([=, this, &ctx] { code.L(*abort); code.call(wrapped_fn); - EmitCheckMemoryAbort(ctx, inst, end.get()); + EmitCheckMemoryAbort(ctx, inst, end); code.jmp(*end, code.T_NEAR); }); } @@ -352,7 +352,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in const auto fastmem_marker = ShouldFastmem(ctx, inst); if (fastmem_marker) { - SharedLabel abort = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); bool require_abort_handling = false; const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling); @@ -427,7 +427,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); - SharedLabel end = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(); code.mov(status, u32(1)); code.movzx(tmp.cvt32(), code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)]); @@ -460,7 +460,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i const auto fastmem_marker = ShouldFastmem(ctx, inst); if (fastmem_marker) { - SharedLabel abort = GenSharedLabel(); + SharedLabel abort = ctx.GenSharedLabel(); bool require_abort_handling = false; const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp); diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h index b354efcb51..3ac078f1d7 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h @@ -54,7 +54,7 @@ void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsi if (ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { const u32 page_align_mask = static_cast(page_table_const_size - 1) & ~align_mask; - SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel(); + SharedLabel detect_boundary = ctx.GenSharedLabel(), resume = ctx.GenSharedLabel(); code.jnz(*detect_boundary, code.T_NEAR); code.L(*resume); diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp index a0fd944041..6f53580997 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -38,33 +38,21 @@ template static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); (code.*fn)(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(code, inst, xmm_a); } -template -static void EmitAVXVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - - (code.*fn)(xmm_a, xmm_a, xmm_b); - - ctx.reg_alloc.DefineValue(code, inst, xmm_a); -} - template static void EmitOneArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { const auto fn = static_cast*>(lambda); constexpr u32 stack_space = 2 * 16; auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(code, nullptr); @@ -86,8 +74,8 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext const auto fn = static_cast*>(lambda); constexpr u32 stack_space = 2 * 16; auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(code, nullptr); @@ -111,9 +99,9 @@ static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext const auto fn = static_cast*>(lambda); constexpr u32 stack_space = 3 * 16; auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]); + auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(code, nullptr); @@ -139,9 +127,9 @@ static void EmitTwoArgumentFallbackWithSaturationAndImmediate(BlockOfCode& code, const auto fn = static_cast*>(lambda); constexpr u32 stack_space = 2 * 16; auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]); + auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]); const u8 arg2 = args[1].GetImmediateU8(); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(code, nullptr); @@ -166,9 +154,9 @@ static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Ins const auto fn = static_cast*>(lambda); constexpr u32 stack_space = 3 * 16; auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]); + auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(code, nullptr); @@ -194,7 +182,7 @@ void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) { // TODO: DefineValue directly on Argument for index == 0 - const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]); + auto const source = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32(); if (code.HasHostFeature(HostFeature::SSE41)) { @@ -218,7 +206,7 @@ void EmitX64::EmitVectorGetElement16(EmitContext& ctx, IR::Inst* inst) { // TODO: DefineValue directly on Argument for index == 0 - const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]); + auto const source = ctx.reg_alloc.UseXmm(code, args[0]); const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32(); code.pextrw(dest, source, index); ctx.reg_alloc.DefineValue(code, inst, dest); @@ -234,10 +222,10 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32(); if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]); + auto const source = ctx.reg_alloc.UseXmm(code, args[0]); code.pextrd(dest, source, index); } else { - const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pshufd(source, source, index); code.movd(dest, source); } @@ -253,7 +241,7 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) { if (index == 0) { // TODO: DefineValue directly on Argument for index == 0 const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64(); - const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]); + auto const source = ctx.reg_alloc.UseXmm(code, args[0]); code.movq(dest, source); ctx.reg_alloc.DefineValue(code, inst, dest); return; @@ -262,10 +250,10 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) { const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64(); if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]); + auto const source = ctx.reg_alloc.UseXmm(code, args[0]); code.pextrq(dest, source, 1); } else { - const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.punpckhqdq(source, source); code.movq(dest, source); } @@ -277,7 +265,7 @@ void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); - const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSE41)) { const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt8(); @@ -310,7 +298,7 @@ void EmitX64::EmitVectorSetElement16(EmitContext& ctx, IR::Inst* inst) { ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); - const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); const Xbyak::Reg16 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt16(); code.pinsrw(source_vector, source_elem.cvt32(), index); @@ -322,7 +310,7 @@ void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); - const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSE41)) { const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt32(); @@ -345,7 +333,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); - const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSE41)) { const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]); @@ -355,7 +343,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, source_vector); } else { const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movq(tmp, source_elem); @@ -369,72 +357,53 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) { } } -static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { - if (code.HasHostFeature(HostFeature::SSSE3)) { - code.pabsb(data, data); - } else { - const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code); - code.pxor(temp, temp); - code.psubb(temp, data); - code.pminub(data, temp); - } -} - -static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { - if (code.HasHostFeature(HostFeature::SSSE3)) { - code.pabsw(data, data); - } else { - const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code); - code.pxor(temp, temp); - code.psubw(temp, data); - code.pmaxsw(data, temp); - } -} - -static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { - if (code.HasHostFeature(HostFeature::SSSE3)) { - code.pabsd(data, data); - } else { - const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code); - code.movdqa(temp, data); - code.psrad(temp, 31); - code.pxor(data, temp); - code.psubd(data, temp); - } -} - -static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) { - if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - code.vpabsq(data, data); - } else { - const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code); - code.pshufd(temp, data, 0b11110101); - code.psrad(temp, 31); - code.pxor(data, temp); - code.psubq(data, temp); - } -} - static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); switch (esize) { case 8: - VectorAbs8(code, ctx, data); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsb(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.pxor(temp, temp); + code.psubb(temp, data); + code.pminub(data, temp); + } break; case 16: - VectorAbs16(code, ctx, data); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsw(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.pxor(temp, temp); + code.psubw(temp, data); + code.pmaxsw(data, temp); + } break; case 32: - VectorAbs32(code, ctx, data); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsd(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(temp, data); + code.psrad(temp, 31); + code.pxor(data, temp); + code.psubd(data, temp); + } break; case 64: - VectorAbs64(code, ctx, data); + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + code.vpabsq(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.pshufd(temp, data, 0b11110101); + code.psrad(temp, 31); + code.pxor(data, temp); + code.psubq(data, temp); + } break; } - ctx.reg_alloc.DefineValue(code, inst, data); } @@ -477,15 +446,15 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorAndNot(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.pandn(xmm_b, xmm_a); ctx.reg_alloc.DefineValue(code, inst, xmm_b); } -static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) { +static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, auto const& result, u8 shift_amount) { if (code.HasHostFeature(HostFeature::GFNI)) { const u64 shift_matrix = shift_amount < 8 ? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8)) @@ -494,7 +463,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const return; } - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.punpckhbw(tmp, result); code.punpcklbw(result, result); @@ -506,7 +475,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); ArithmeticShiftRightByte(ctx, code, result, shift_amount); @@ -517,7 +486,7 @@ void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.psraw(result, shift_amount); @@ -528,7 +497,7 @@ void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.psrad(result, shift_amount); @@ -538,14 +507,14 @@ void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = (std::min)(args[1].GetImmediateU8(), u8(63)); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { code.vpsraq(result, result, shift_amount); } else { - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); const u64 sign_bit = 0x80000000'00000000u >> shift_amount; @@ -660,12 +629,12 @@ void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastb(a, a); code.vmovq(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.pxor(tmp, tmp); code.pshufb(a, tmp); code.movq(a, a); @@ -678,7 +647,7 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pshuflw(a, a, 0); @@ -687,7 +656,7 @@ void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pshuflw(a, a, 0b01000100); @@ -696,11 +665,11 @@ void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastb(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.pxor(tmp, tmp); code.pshufb(a, tmp); } else { @@ -713,7 +682,7 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastw(a, a); } else { @@ -725,7 +694,7 @@ void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastd(a, a); } else { @@ -736,7 +705,7 @@ void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastq(a, a); } else { @@ -747,7 +716,7 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 16); @@ -758,7 +727,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) code.vpbroadcastb(a, a); code.vmovq(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.pxor(tmp, tmp); code.pshufb(a, tmp); code.movq(a, a); @@ -771,7 +740,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 8); @@ -784,7 +753,7 @@ void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 4); @@ -800,7 +769,7 @@ void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 16); @@ -810,7 +779,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX2)) { code.vpbroadcastb(a, a); } else if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.pxor(tmp, tmp); code.pshufb(a, tmp); @@ -824,7 +793,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 8); @@ -844,7 +813,7 @@ void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 4); @@ -856,7 +825,7 @@ void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorBroadcastElement64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); ASSERT(args[1].IsImmediate()); const u8 index = args[1].GetImmediateU8(); ASSERT(index < 2); @@ -1043,9 +1012,9 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); code.pand(lhs, tmp); @@ -1057,11 +1026,11 @@ void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code); + auto const zero = ctx.reg_alloc.ScratchXmm(code); code.pxor(zero, zero); code.pblendw(lhs, zero, 0b10101010); @@ -1082,8 +1051,8 @@ void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.shufps(lhs, rhs, 0b10001000); @@ -1092,8 +1061,8 @@ void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.shufpd(lhs, rhs, 0b00); @@ -1102,16 +1071,16 @@ void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.punpcklbw(lhs, rhs); code.pshufb(lhs, code.Const(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080)); } else { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); code.pand(lhs, tmp); @@ -1126,15 +1095,15 @@ void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.punpcklwd(lhs, rhs); code.pshufb(lhs, code.Const(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080)); } else { - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.pslld(lhs, 16); code.psrad(lhs, 16); @@ -1152,8 +1121,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); if (code.HasHostFeature(HostFeature::SSE41)) { // copy bytes 0:3 of rhs to lhs, zero out upper 8 bytes @@ -1168,8 +1137,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.psraw(lhs, 8); code.psraw(rhs, 8); @@ -1180,8 +1149,8 @@ void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.psrad(lhs, 16); code.psrad(rhs, 16); @@ -1192,8 +1161,8 @@ void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.shufps(lhs, rhs, 0b11011101); @@ -1202,8 +1171,8 @@ void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.shufpd(lhs, rhs, 0b11); @@ -1212,15 +1181,15 @@ void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.punpcklbw(lhs, rhs); code.pshufb(lhs, code.Const(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080)); } else { - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.psraw(lhs, 8); code.psraw(rhs, 8); @@ -1234,15 +1203,15 @@ void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); code.punpcklwd(lhs, rhs); code.pshufb(lhs, code.Const(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080)); } else { - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.psrad(lhs, 16); code.psrad(rhs, 16); @@ -1258,17 +1227,17 @@ void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst) auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const lhs = ctx.reg_alloc.UseXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]); // copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes code.insertps(rhs, lhs, 0b01001100); ctx.reg_alloc.DefineValue(code, inst, rhs); } else { - const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code); + auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]); + auto const zero = ctx.reg_alloc.ScratchXmm(code); code.xorps(zero, zero); code.unpcklps(lhs, rhs); @@ -1302,9 +1271,9 @@ void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.pcmpeqd(xmm_a, xmm_b); code.pshufd(tmp, xmm_a, 0b10110001); @@ -1317,9 +1286,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.pcmpeqq(xmm_a, xmm_b); code.pshufd(tmp, xmm_a, 0b01001110); @@ -1327,9 +1296,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, xmm_a); } else { - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.pcmpeqd(xmm_a, xmm_b); code.pshufd(tmp, xmm_a, 0b10110001); @@ -1353,16 +1322,16 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) { } if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.palignr(xmm_b, xmm_a, position / 8); ctx.reg_alloc.DefineValue(code, inst, xmm_b); return; } - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.psrldq(xmm_a, position / 8); code.pslldq(xmm_b, (128 - position) / 8); @@ -1374,13 +1343,13 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorExtractLower(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 position = args[2].GetImmediateU8(); ASSERT(position % 8 == 0); if (position != 0) { - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); code.punpcklqdq(xmm_a, xmm_b); code.psrldq(xmm_a, position / 8); @@ -1405,22 +1374,33 @@ void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE42)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq); - return; + } else { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, code.Const(xword, 0x80000000, 0x80000000)); + code.pxor(tmp0, tmp2); + code.pxor(tmp1, tmp2); + code.movdqa(tmp2, tmp0); + code.pcmpeqd(tmp0, tmp1); + code.pcmpgtd(tmp2, tmp1); + code.pshufd(tmp1, tmp0, 245); + code.pshufd(tmp3, tmp2, 160); + code.pshufd(tmp0, tmp2, 245); + code.pand(tmp1, tmp3); + code.por(tmp0, tmp1); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - for (size_t i = 0; i < result.size(); ++i) { - result[i] = (a[i] > b[i]) ? ~u64(0) : 0; - } - }); } static void EmitVectorHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, b); code.pand(tmp, a); @@ -1459,9 +1439,9 @@ void EmitX64::EmitVectorHalvingAddS32(EmitContext& ctx, IR::Inst* inst) { static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, b); @@ -1504,12 +1484,12 @@ void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) { static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); switch (esize) { case 8: { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, code.Const(xword, 0x8080808080808080, 0x8080808080808080)); code.pxor(a, tmp); code.pxor(b, tmp); @@ -1518,7 +1498,7 @@ static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* break; } case 16: { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000)); code.pxor(a, tmp); code.pxor(b, tmp); @@ -1552,8 +1532,8 @@ void EmitX64::EmitVectorHalvingSubS32(EmitContext& ctx, IR::Inst* inst) { static void EmitVectorHalvingSubUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); switch (esize) { case 8: @@ -1590,8 +1570,8 @@ void EmitX64::EmitVectorHalvingSubU32(EmitContext& ctx, IR::Inst* inst) { static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); switch (size) { case 8: @@ -1630,8 +1610,8 @@ void EmitX64::EmitVectorInterleaveLower64(EmitContext& ctx, IR::Inst* inst) { static void EmitVectorInterleaveUpper(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); switch (size) { case 8: @@ -1670,7 +1650,7 @@ void EmitX64::EmitVectorInterleaveUpper64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); if (shift_amount == 0) { @@ -1696,7 +1676,7 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.psllw(result, shift_amount); @@ -1707,7 +1687,7 @@ void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.pslld(result, shift_amount); @@ -1718,7 +1698,7 @@ void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.psllq(result, shift_amount); @@ -1729,7 +1709,7 @@ void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); if (shift_amount == 0) { @@ -1753,7 +1733,7 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.psrlw(result, shift_amount); @@ -1764,7 +1744,7 @@ void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.psrld(result, shift_amount); @@ -1775,7 +1755,7 @@ void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); const u8 shift_amount = args[1].GetImmediateU8(); code.psrlq(result, shift_amount); @@ -1783,41 +1763,12 @@ void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, result); } -template -static void EmitVectorLogicalVShiftAVX2(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { - static_assert(esize == 32 || esize == 64); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - - // store sign bit of lowest byte of each element of b to select left/right shift later - ICODE(vpsll)(xmm0, b, u8(esize - 8)); - - // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b) - code.vpabsb(b, b); - code.vpand(b, b, code.BConst(xword, 0xFF)); - - // calculate shifts - ICODE(vpsllv)(result, a, b); - ICODE(vpsrlv)(a, a, b); - - // implicit argument: xmm0 (sign of lowest byte of b) - if (esize == 32) { - code.blendvps(result, a); - } else { - code.blendvpd(result, a); - } - ctx.reg_alloc.DefineValue(code, inst, result); -} - void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::GFNI)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Opmask negative_mask = k1; code.pxor(tmp, tmp); @@ -1862,10 +1813,10 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const right_shift = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); code.vpxord(right_shift, right_shift, right_shift); @@ -1886,18 +1837,87 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) { } void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX2)) { - EmitVectorLogicalVShiftAVX2<32>(code, ctx, inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const mask = ctx.reg_alloc.ScratchXmm(code); + // store sign bit of lowest byte of each element of b to select left/right shift later + code.vpslld(mask, b, u8(32 - 8)); + // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b) + code.vpabsb(b, b); + code.vpand(b, b, code.BConst<32>(xword, 0xFF)); + // calculate shifts + code.vpsllvd(result, a, b); + code.vpsrlvd(a, a, b); + code.vblendvps(result, result, a, mask); + ctx.reg_alloc.DefineValue(code, inst, result); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); - }); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp5 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp6 = ctx.reg_alloc.ScratchXmm(code); + code.pxor(tmp3, tmp3); + code.movdqa(tmp2, tmp0); + code.psubb(tmp3, tmp1); + code.movdqa(tmp4, tmp2); + code.movdqa(tmp6, tmp2); + code.pminub(tmp3, tmp1); + code.pslld(tmp1, 24); + code.pand(tmp3, code.Const(xword, 0x000000ff'000000ff, 0x000000ff'000000ff)); + code.psrad(tmp1, 31); + code.pshuflw(tmp0, tmp3, 254); + code.pshuflw(tmp5, tmp3, 84); + code.psrld(tmp4, tmp0); + code.movdqa(tmp0, tmp2); + code.psrld(tmp0, tmp5); + code.punpcklqdq(tmp0, tmp4); + code.pshufd(tmp4, tmp3, 238); + code.pslld(tmp3, 23); + code.paddd(tmp3, code.Const(xword, 0x3F80'00003F80'0000, 0x3F80'00003F80'0000)); + code.pshuflw(tmp5, tmp4, 254); + code.pshuflw(tmp4, tmp4, 84); + code.psrld(tmp6, tmp5); + code.movdqa(tmp5, tmp2); + code.psrld(tmp5, tmp4); + code.pshufd(tmp4, tmp2, 245); + code.punpckhqdq(tmp5, tmp6); + code.cvttps2dq(tmp3, tmp3); + code.shufps(tmp0, tmp5, 204); + code.pmuludq(tmp2, tmp3); + code.pshufd(tmp3, tmp3, 245); + code.andps(tmp0, tmp1); + code.pmuludq(tmp3, tmp4); + code.pshufd(tmp2, tmp2, 232); + code.pshufd(tmp3, tmp3, 232); + code.punpckldq(tmp2, tmp3); + code.pandn(tmp1, tmp2); + code.orps(tmp0, tmp1); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } } void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX2)) { - EmitVectorLogicalVShiftAVX2<64>(code, ctx, inst); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const mask = ctx.reg_alloc.ScratchXmm(code); + // store sign bit of lowest byte of each element of b to select left/right shift later + code.vpsllq(mask, b, u8(64 - 8)); + // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b) + code.vpabsb(b, b); + code.vpand(b, b, code.BConst<64>(xword, 0xFF)); + // calculate shifts + code.vpsllvq(result, a, b); + code.vpsrlvq(a, a, b); + code.vblendvpd(result, result, a, mask); + ctx.reg_alloc.DefineValue(code, inst, result); } else { EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift); @@ -1912,28 +1932,11 @@ enum class MinMaxOperation { Max, }; -// Compute the minimum/maximum of two vectors of signed 8-bit integers, using only SSE2 instructons. -// The result of the operation is placed in operand a, while b is unmodified. -void FallbackMinMaxS8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) { - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); - if(op == MinMaxOperation::Min) { - code.movdqa(c, b); - code.pcmpgtb(c, a); - } else { - code.movdqa(c, a); - code.pcmpgtb(c, b); - } - - code.pand(a, c); - code.pandn(c, b); - code.por(a, c); -} - // Compute the minimum/maximum of two vectors of unsigned 16-bit integers, using only SSE2 instructons. // The result of the operation is placed in operand a, while b is unmodified. -void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) { +void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) { if(op == MinMaxOperation::Min) { - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.psubusw(c, b); code.psubw(a, c); @@ -1945,8 +1948,8 @@ void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, // Compute the minimum/maximum of two vectors of signed 32-bit integers, using only SSE2 instructons. // The result of the operation is placed in operand a, while b is unmodified. -void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) { - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); +void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) { + auto const c = ctx.reg_alloc.ScratchXmm(code); if(op == MinMaxOperation::Min) { code.movdqa(c, b); code.pcmpgtd(c, a); @@ -1962,12 +1965,12 @@ void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, // Compute the minimum/maximum of two vectors of unsigned 32-bit integers, using only SSE2 instructons. // The result of the operation is placed in operand a, while b is unmodified. -void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) { - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); +void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) { + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, code.BConst<32>(xword, 0x80000000)); // bias a and b by XORing their sign bits, then use the signed comparison function - const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code); + auto const d = ctx.reg_alloc.ScratchXmm(code); if(op == MinMaxOperation::Min) { code.movdqa(d, a); code.pxor(d, c); @@ -1989,11 +1992,16 @@ void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); - FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Max); - ctx.reg_alloc.DefineValue(code, inst, a); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const c = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(c, a); + code.pcmpgtb(c, b); + code.pand(a, c); + code.pandn(c, b); + code.por(a, c); + ctx.reg_alloc.DefineValue(code, inst, a); } } @@ -2005,31 +2013,55 @@ void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); - FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Max); - ctx.reg_alloc.DefineValue(code, inst, a); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, tmp0); + code.pcmpgtd(tmp2, tmp1); + code.pand(tmp0, tmp2); + code.pandn(tmp2, tmp1); + code.por(tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } } void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + code.vpmaxsq(xmm_a, xmm_a, xmm_b); + ctx.reg_alloc.DefineValue(code, inst, xmm_a); } else if (code.HasHostFeature(HostFeature::AVX)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); code.vpcmpgtq(xmm0, y, x); code.pblendvb(x, y); - ctx.reg_alloc.DefineValue(code, inst, x); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); }); - }); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp5 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000)); + code.movdqa(tmp3, tmp1); + code.pxor(tmp3, tmp2); + code.pxor(tmp2, tmp0); + code.movdqa(tmp4, tmp2); + code.pcmpeqd(tmp2, tmp3); + code.pcmpgtd(tmp4, tmp3); + code.pshufd(tmp2, tmp2, 245); + code.pshufd(tmp5, tmp4, 160); + code.pshufd(tmp3, tmp4, 245); + code.pand(tmp2, tmp5); + code.por(tmp3, tmp2); + code.pand(tmp0, tmp3); + code.pandn(tmp3, tmp1); + code.por(tmp0, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } } @@ -2041,11 +2073,11 @@ void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Max); - ctx.reg_alloc.DefineValue(code, inst, a); + ctx.reg_alloc.DefineValue(code, inst, a); } } @@ -2053,35 +2085,54 @@ void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Max); - ctx.reg_alloc.DefineValue(code, inst, a); + ctx.reg_alloc.DefineValue(code, inst, a); } } void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + code.vpmaxuq(xmm_a, xmm_a, xmm_b); + ctx.reg_alloc.DefineValue(code, inst, xmm_a); } else if (code.HasHostFeature(HostFeature::AVX)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); code.vpsubq(tmp, y, xmm0); code.vpsubq(xmm0, x, xmm0); code.vpcmpgtq(xmm0, tmp, xmm0); code.pblendvb(x, y); - ctx.reg_alloc.DefineValue(code, inst, x); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); }); - }); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp5 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + code.movdqa(tmp3, tmp1); + code.pxor(tmp3, tmp2); + code.pxor(tmp2, tmp0); + code.movdqa(tmp4, tmp2); + code.pcmpeqd(tmp2, tmp3); + code.pcmpgtd(tmp4, tmp3); + code.pshufd(tmp2, tmp2, 245); + code.pshufd(tmp5, tmp4, 160); + code.pshufd(tmp3, tmp4, 245); + code.pand(tmp2, tmp5); + code.por(tmp3, tmp2); + code.pand(tmp0, tmp3); + code.pandn(tmp3, tmp1); + code.por(tmp0, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } } @@ -2089,11 +2140,16 @@ void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); - FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Min); - ctx.reg_alloc.DefineValue(code, inst, a); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const c = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(c, b); + code.pcmpgtb(c, a); + code.pand(a, c); + code.pandn(c, b); + code.por(a, c); + ctx.reg_alloc.DefineValue(code, inst, a); } } @@ -2105,31 +2161,51 @@ void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Min); - ctx.reg_alloc.DefineValue(code, inst, a); + ctx.reg_alloc.DefineValue(code, inst, a); } } void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + code.vpminsq(xmm_a, xmm_a, xmm_b); + ctx.reg_alloc.DefineValue(code, inst, xmm_a); } else if (code.HasHostFeature(HostFeature::AVX)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.vpcmpgtq(xmm0, y, x); code.pblendvb(y, x); - ctx.reg_alloc.DefineValue(code, inst, y); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); }); - }); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp5 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000)); + code.movdqa(tmp3, tmp1); + code.pxor(tmp3, tmp2); + code.pxor(tmp2, tmp0); + code.movdqa(tmp4, tmp2); + code.pcmpeqd(tmp2, tmp3); + code.pcmpgtd(tmp4, tmp3); + code.pshufd(tmp3, tmp2, 245); + code.pshufd(tmp5, tmp4, 160); + code.pshufd(tmp2, tmp4, 245); + code.pand(tmp3, tmp5); + code.por(tmp2, tmp3); + code.pand(tmp1, tmp2); + code.pandn(tmp2, tmp0); + code.por(tmp2, tmp1); + //code.movdqa(tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp2); } } @@ -2141,11 +2217,11 @@ void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Min); - ctx.reg_alloc.DefineValue(code, inst, a); + ctx.reg_alloc.DefineValue(code, inst, a); } } @@ -2153,57 +2229,93 @@ void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Min); - ctx.reg_alloc.DefineValue(code, inst, a); + ctx.reg_alloc.DefineValue(code, inst, a); } } void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + code.vpminuq(xmm_a, xmm_a, xmm_b); + ctx.reg_alloc.DefineValue(code, inst, xmm_a); } else if (code.HasHostFeature(HostFeature::AVX)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000)); code.vpsubq(tmp, y, xmm0); code.vpsubq(xmm0, x, xmm0); code.vpcmpgtq(xmm0, tmp, xmm0); code.pblendvb(y, x); - ctx.reg_alloc.DefineValue(code, inst, y); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); }); - }); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp5 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); + code.movdqa(tmp3, tmp1); + code.pxor(tmp3, tmp2); + code.pxor(tmp2, tmp0); + code.movdqa(tmp4, tmp2); + code.pcmpeqd(tmp2, tmp3); + code.pcmpgtd(tmp4, tmp3); + code.pshufd(tmp3, tmp2, 245); + code.pshufd(tmp5, tmp4, 160); + code.pshufd(tmp2, tmp4, 245); + code.pand(tmp3, tmp5); + code.por(tmp2, tmp3); + code.pand(tmp1, tmp2); + code.pandn(tmp2, tmp0); + code.por(tmp2, tmp1); + //code.movdqa(tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp2); } } void EmitX64::EmitVectorMultiply8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(code); - - // TODO: Optimize - code.movdqa(tmp_a, a); - code.movdqa(tmp_b, b); - code.pmullw(a, b); - code.psrlw(tmp_a, 8); - code.psrlw(tmp_b, 8); - code.pmullw(tmp_a, tmp_b); - code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); - code.psllw(tmp_a, 8); - code.por(a, tmp_a); - - ctx.reg_alloc.DefineValue(code, inst, a); + if (code.HasHostFeature(HostFeature::AVX)) { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + code.vbroadcastss(tmp3, code.Const(dword, 0x00ff'00ff)); + code.vpmullw(tmp2, tmp1, tmp0); + code.vpandn(tmp0, tmp3, tmp0); + code.vpand(tmp2, tmp2, tmp3); + code.vpmaddubsw(tmp0, tmp1, tmp0); + code.vpsllw(tmp0, tmp0, 8); + code.vpor(tmp0, tmp2, tmp0); + ctx.reg_alloc.DefineValue(code, inst, tmp0); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, tmp0); + code.movdqa(tmp3, tmp1); + code.movdqa(tmp4, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); + code.punpckhbw(tmp2, tmp2); + code.punpckhbw(tmp3, tmp3); + code.punpcklbw(tmp0, tmp0); + code.punpcklbw(tmp1, tmp1); + code.pmullw(tmp3, tmp2); + code.pmullw(tmp0, tmp1); + code.pand(tmp3, tmp4); + code.pand(tmp0, tmp4); + code.packuswb(tmp0, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp0); + } } void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) { @@ -2214,31 +2326,32 @@ void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - - code.movdqa(tmp, a); - code.psrlq(a, 32); - code.pmuludq(tmp, b); - code.psrlq(b, 32); - code.pmuludq(a, b); - code.pshufd(tmp, tmp, 0b00001000); - code.pshufd(b, a, 0b00001000); - code.punpckldq(tmp, b); - - ctx.reg_alloc.DefineValue(code, inst, tmp); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp, a); + code.psrlq(a, 32); + code.pmuludq(tmp, b); + code.psrlq(b, 32); + code.pmuludq(a, b); + code.pshufd(tmp, tmp, 0b00001000); + code.pshufd(b, a, 0b00001000); + code.punpckldq(tmp, b); + ctx.reg_alloc.DefineValue(code, inst, tmp); } } void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) { - if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) { - EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq); - } else if (code.HasHostFeature(HostFeature::SSE41)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) { + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + code.vpmullq(xmm_a, xmm_a, xmm_b); + ctx.reg_alloc.DefineValue(code, inst, xmm_a); + } else if (code.HasHostFeature(HostFeature::SSE41)) { + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr(code); const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(code); @@ -2253,29 +2366,28 @@ void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, a); } else { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); - code.movdqa(tmp1, a); - code.movdqa(tmp2, a); - code.movdqa(tmp3, b); + code.movdqa(tmp1, a); + code.movdqa(tmp2, a); + code.movdqa(tmp3, b); - code.psrlq(tmp1, 32); - code.psrlq(tmp3, 32); + code.psrlq(tmp1, 32); + code.psrlq(tmp3, 32); - code.pmuludq(tmp2, b); - code.pmuludq(tmp3, a); - code.pmuludq(b, tmp1); + code.pmuludq(tmp2, b); + code.pmuludq(tmp3, a); + code.pmuludq(b, tmp1); - code.paddq(b, tmp3); - code.psllq(b, 32); - code.paddq(tmp2, b); + code.paddq(b, tmp3); + code.psllq(b, 32); + code.paddq(tmp2, b); - ctx.reg_alloc.DefineValue(code, inst, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp2); } } @@ -2307,15 +2419,15 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { - const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpmovwb(result, a); ctx.reg_alloc.DefineValue(code, inst, result); } else { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); code.pxor(zeros, zeros); code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF)); @@ -2328,13 +2440,13 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpmovdw(result, a); ctx.reg_alloc.DefineValue(code, inst, result); } else { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); code.pxor(zeros, zeros); if (code.HasHostFeature(HostFeature::SSE41)) { code.pblendw(a, zeros, 0b10101010); @@ -2352,15 +2464,15 @@ void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpmovqd(result, a); ctx.reg_alloc.DefineValue(code, inst, result); } else { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); code.pxor(zeros, zeros); code.shufps(a, zeros, 0b00001000); @@ -2373,13 +2485,13 @@ void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const operand = ctx.reg_alloc.UseXmm(code, args[0]); code.vpternlogq(result, operand, operand, u8(~Tern::c)); ctx.reg_alloc.DefineValue(code, inst, result); } else { - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.ScratchXmm(code); code.pcmpeqw(xmm_b, xmm_b); code.pxor(xmm_a, xmm_b); ctx.reg_alloc.DefineValue(code, inst, xmm_a); @@ -2393,9 +2505,9 @@ void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.punpcklqdq(xmm_a, xmm_b); code.movdqa(tmp, xmm_a); @@ -2411,9 +2523,9 @@ void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.punpcklqdq(xmm_a, xmm_b); if (code.HasHostFeature(HostFeature::SSSE3)) { @@ -2434,9 +2546,9 @@ void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.punpcklqdq(xmm_a, xmm_b); if (code.HasHostFeature(HostFeature::SSSE3)) { @@ -2456,10 +2568,10 @@ void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const c = ctx.reg_alloc.ScratchXmm(code); + auto const d = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.movdqa(d, b); @@ -2478,17 +2590,17 @@ void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); code.phaddw(a, b); ctx.reg_alloc.DefineValue(code, inst, a); } else { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const c = ctx.reg_alloc.ScratchXmm(code); + auto const d = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.movdqa(d, b); @@ -2508,17 +2620,17 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSSE3)) { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); code.phaddd(a, b); ctx.reg_alloc.DefineValue(code, inst, a); } else { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const c = ctx.reg_alloc.ScratchXmm(code); + auto const d = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.movdqa(d, b); @@ -2535,9 +2647,9 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.punpcklqdq(a, b); @@ -2550,8 +2662,8 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.psllw(a, 8); @@ -2565,8 +2677,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.pslld(a, 16); @@ -2580,18 +2692,18 @@ void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.vpsraq(c, a, 32); code.vpsllq(a, a, 32); code.vpsraq(a, a, 32); code.vpaddq(a, a, c); } else { - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.psllq(a, 32); @@ -2613,8 +2725,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.psllw(a, 8); @@ -2628,8 +2740,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.pslld(a, 16); @@ -2643,8 +2755,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* ins void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const c = ctx.reg_alloc.ScratchXmm(code); code.movdqa(c, a); code.psllq(a, 32); @@ -2658,14 +2770,10 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* ins template static void PairedOperation(VectorArray& result, const VectorArray& x, const VectorArray& y, Function fn) { const size_t range = x.size() / 2; - - for (size_t i = 0; i < range; i++) { + for (size_t i = 0; i < range; i++) result[i] = fn(x[2 * i], x[2 * i + 1]); - } - - for (size_t i = 0; i < range; i++) { + for (size_t i = 0; i < range; i++) result[range + i] = fn(y[2 * i], y[2 * i + 1]); - } } template @@ -2686,11 +2794,6 @@ static void PairedMax(VectorArray& result, const VectorArray& x, const Vec PairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); }); } -template -static void PairedMin(VectorArray& result, const VectorArray& x, const VectorArray& y) { - PairedOperation(result, x, y, [](auto a, auto b) { return (std::min)(a, b); }); -} - template static void LowerPairedMax(VectorArray& result, const VectorArray& x, const VectorArray& y) { LowerPairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); }); @@ -2705,19 +2808,16 @@ template static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01)); code.pshufb(x, tmp); code.pshufb(y, tmp); - code.movaps(tmp, x); code.shufps(tmp, y, 0b01'00'01'00); - code.shufps(x, y, 0b11'10'11'10); - if constexpr (std::is_member_function_pointer_v) { (code.*fn)(x, tmp); } else { @@ -2730,21 +2830,17 @@ static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Ins template static void EmitVectorPairedMinMaxLower8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.punpcklqdq(x, y); code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01)); code.movhlps(y, x); code.movq(x, x); - if constexpr (std::is_member_function_pointer_v) { (code.*fn)(x, y); } else { fn(x, y); } - ctx.reg_alloc.DefineValue(code, inst, x); } @@ -2752,9 +2848,9 @@ template static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); // swap idxs 1 and 2 within 64-bit lanes so that both registers contain [even, odd, even, odd]-indexed pairs of elements code.pshuflw(x, x, 0b11'01'10'00); @@ -2780,63 +2876,31 @@ static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::In ctx.reg_alloc.DefineValue(code, inst, x); } -template -static void EmitVectorPairedMinMaxLower16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - - // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements - code.pshuflw(x, x, 0b11'01'10'00); - code.pshuflw(y, y, 0b11'01'10'00); - - // move pairs of even/odd-indexed elements into one register each - - // tmp = x[0, 2], y[0, 2], 0s... - code.movaps(tmp, y); - code.insertps(tmp, x, 0b01001100); - // x = x[1, 3], y[1, 3], 0s... - code.insertps(x, y, 0b00011100); - - (code.*fn)(x, tmp); - - ctx.reg_alloc.DefineValue(code, inst, x); -} - -static void EmitVectorPairedMinMaxLower32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - - // tmp = x[1], y[1], 0, 0 - code.movaps(tmp, y); - code.insertps(tmp, x, 0b01001100); - // x = x[0], y[0], 0, 0 - code.insertps(x, y, 0b00011100); - - (code.*fn)(x, tmp); - - ctx.reg_alloc.DefineValue(code, inst, x); -} void EmitX64::EmitVectorPairedMaxS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01)); + code.pshufb(x, tmp); + code.pshufb(y, tmp); + code.movaps(tmp, x); + code.shufps(tmp, y, 0b01'00'01'00); + code.shufps(x, y, 0b11'10'11'10); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb); - return; - } else if (code.HasHostFeature(HostFeature::SSSE3)) { - EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) { - FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max); - }); - return; + code.pmaxsb(x, tmp); + } else { + auto const a = x; + auto const b = tmp; + auto const c = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(c, a); + code.pcmpgtb(c, b); + code.pand(a, c); + code.pandn(c, b); + code.por(a, c); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - PairedMax(result, a, b); - }); + ctx.reg_alloc.DefineValue(code, inst, x); } void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) { @@ -2846,9 +2910,9 @@ void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, x); code.shufps(tmp, y, 0b10001000); @@ -2866,12 +2930,24 @@ void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedMaxU8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSSE3)) { EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub); - return; + } else { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); + code.movdqa(tmp2, constant_00ff); + code.movdqa(tmp3, tmp1); + code.pand(tmp3, tmp2); + code.pand(tmp2, tmp0); + code.packuswb(tmp2, tmp3); + code.psrlw(tmp1, 8); + code.psrlw(tmp0, 8); + code.packuswb(tmp0, tmp1); + code.pmaxub(tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - PairedMax(result, a, b); - }); } void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) { @@ -2887,9 +2963,9 @@ void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, x); code.shufps(tmp, y, 0b10001000); @@ -2907,14 +2983,15 @@ void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedMinS8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb); - } else if (code.HasHostFeature(HostFeature::SSSE3)) { - EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) { - FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min); - }); } else { - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - PairedMin(result, a, b); - }); + EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& a, const auto& b) { + auto const c = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(c, b); + code.pcmpgtb(c, a); + code.pand(a, c); + code.pandn(c, b); + code.por(a, c); + }); } } @@ -2925,9 +3002,9 @@ void EmitX64::EmitVectorPairedMinS16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, x); code.shufps(tmp, y, 0b10001000); @@ -2943,12 +3020,25 @@ void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedMinU8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSSE3)) { EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminub); - return; + } else { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); + code.movdqa(tmp2, tmp1); + code.psrlw(tmp2, 8); + code.movdqa(tmp3, tmp0); + code.psrlw(tmp3, 8); + code.packuswb(tmp3, tmp2); + code.movdqa(tmp2, constant_00ff); + code.pand(tmp1, tmp2); + code.pand(tmp0, tmp2); + code.packuswb(tmp0, tmp1); + code.pminub(tmp0, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - PairedMin(result, a, b); - }); } void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) { @@ -2964,9 +3054,9 @@ void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, x); code.shufps(tmp, y, 0b10001000); @@ -2982,41 +3072,88 @@ void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) { } void EmitX64::EmitVectorPairedMaxLowerS8(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb); - return; - } else if (code.HasHostFeature(HostFeature::SSSE3)) { - EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) { - FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max); - }); - return; + code.punpcklqdq(x, y); + code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01)); + code.movhlps(y, x); + code.movq(x, x); + code.pmaxsb(x, y); + } else { + auto const c = ctx.reg_alloc.ScratchXmm(code); + code.punpcklqdq(x, y); + code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01)); + code.movhlps(y, x); + code.movq(x, x); + code.movdqa(c, x); + code.pcmpgtb(c, y); + code.pand(x, c); + code.pandn(c, y); + code.por(x, c); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMax(result, a, b); - }); + ctx.reg_alloc.DefineValue(code, inst, x); } void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements + code.pshuflw(x, x, 0b11'01'10'00); + code.pshuflw(y, y, 0b11'01'10'00); + // move pairs of even/odd-indexed elements into one register each + // tmp = x[0, 2], y[0, 2], 0s... + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[1, 3], y[1, 3], 0s... + code.insertps(x, y, 0b00011100); + code.pmaxsw(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + code.punpcklwd(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 232); + code.pshuflw(tmp1, tmp1, 216); + code.pshufd(tmp0, tmp0, 231); + code.pshuflw(tmp0, tmp0, 114); + code.pmaxsw(tmp0, tmp1); + code.movq(tmp0, tmp0); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMax(result, a, b); - }); } void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // tmp = x[1], y[1], 0, 0 + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[0], y[0], 0, 0 + code.insertps(x, y, 0b00011100); + code.pmaxsd(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.punpckldq(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 238); + code.movdqa(tmp2, tmp0); + code.pcmpgtd(tmp2, tmp1); + code.pand(tmp0, tmp2); + code.pandn(tmp2, tmp1); + code.por(tmp2, tmp0); + code.movq(tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMax(result, a, b); - }); } void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) { @@ -3031,63 +3168,143 @@ void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) { } void EmitX64::EmitVectorPairedMaxLowerU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements + code.pshuflw(x, x, 0b11'01'10'00); + code.pshuflw(y, y, 0b11'01'10'00); + // move pairs of even/odd-indexed elements into one register each + // tmp = x[0, 2], y[0, 2], 0s... + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[1, 3], y[1, 3], 0s... + code.insertps(x, y, 0b00011100); + code.pmaxuw(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + code.punpcklwd(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 232); + code.pshuflw(tmp1, tmp1, 216); + code.pshufd(tmp0, tmp0, 231); + code.pshuflw(tmp0, tmp0, 114); + code.psubusw(tmp0, tmp1); + code.paddw(tmp0, tmp1); + code.movq(tmp0, tmp0); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMax(result, a, b); - }); } void EmitX64::EmitVectorPairedMaxLowerU32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // tmp = x[1], y[1], 0, 0 + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[0], y[0], 0, 0 + code.insertps(x, y, 0b00011100); + code.pmaxud(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + code.punpckldq(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 238); + code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000)); + code.movdqa(tmp3, tmp0); + code.pxor(tmp3, tmp2); + code.pxor(tmp2, tmp1); + code.pcmpgtd(tmp3, tmp2); + code.pand(tmp0, tmp3); + code.pandn(tmp3, tmp1); + code.por(tmp3, tmp0); + code.movq(tmp0, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMax(result, a, b); - }); } void EmitX64::EmitVectorPairedMinLowerS8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb); - return; - } else if (code.HasHostFeature(HostFeature::SSSE3)) { - EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) { - FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min); + } else { + EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& a, const auto& b) { + auto const c = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(c, b); + code.pcmpgtb(c, a); + code.pand(a, c); + code.pandn(c, b); + code.por(a, c); }); - return; } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMin(result, a, b); - }); } void EmitX64::EmitVectorPairedMinLowerS16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements + code.pshuflw(x, x, 0b11'01'10'00); + code.pshuflw(y, y, 0b11'01'10'00); + // move pairs of even/odd-indexed elements into one register each + // tmp = x[0, 2], y[0, 2], 0s... + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[1, 3], y[1, 3], 0s... + code.insertps(x, y, 0b00011100); + code.pminsw(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + code.punpcklwd(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 231); + code.pshuflw(tmp1, tmp1, 114); + code.pshufd(tmp0, tmp0, 232); + code.pshuflw(tmp0, tmp0, 216); + code.pminsw(tmp0, tmp1); + code.movq(tmp0, tmp0); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMin(result, a, b); - }); } void EmitX64::EmitVectorPairedMinLowerS32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminsd); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // tmp = x[1], y[1], 0, 0 + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[0], y[0], 0, 0 + code.insertps(x, y, 0b00011100); + code.pminsd(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.punpckldq(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 238); + code.movdqa(tmp2, tmp0); + code.pcmpgtd(tmp2, tmp1); + code.pand(tmp1, tmp2); + code.pandn(tmp2, tmp0); + code.por(tmp2, tmp1); + code.movq(tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMin(result, a, b); - }); } void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) { @@ -3102,50 +3319,91 @@ void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) { } void EmitX64::EmitVectorPairedMinLowerU16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements + code.pshuflw(x, x, 0b11'01'10'00); + code.pshuflw(y, y, 0b11'01'10'00); + // move pairs of even/odd-indexed elements into one register each + // tmp = x[0, 2], y[0, 2], 0s... + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[1, 3], y[1, 3], 0s... + code.insertps(x, y, 0b00011100); + code.pminuw(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.punpcklwd(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 231); + code.pshuflw(tmp1, tmp1, 114); + code.pshufd(tmp0, tmp0, 232); + code.pshuflw(tmp0, tmp0, 216); + code.movdqa(tmp2, tmp1); + code.psubusw(tmp2, tmp0); + code.psubw(tmp1, tmp2); + code.movq(tmp0, tmp1); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMin(result, a, b); - }); } void EmitX64::EmitVectorPairedMinLowerU32(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminud); - return; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + // tmp = x[1], y[1], 0, 0 + code.movaps(tmp, y); + code.insertps(tmp, x, 0b01001100); + // x = x[0], y[0], 0, 0 + code.insertps(x, y, 0b00011100); + code.pminud(x, tmp); + ctx.reg_alloc.DefineValue(code, inst, x); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + code.punpckldq(tmp0, tmp1); + code.pshufd(tmp1, tmp0, 238); + code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000)); + code.movdqa(tmp3, tmp0); + code.pxor(tmp3, tmp2); + code.pxor(tmp2, tmp1); + code.pcmpgtd(tmp3, tmp2); + code.pand(tmp1, tmp3); + code.pandn(tmp3, tmp0); + code.por(tmp3, tmp1); + code.movq(tmp0, tmp3); + ctx.reg_alloc.DefineValue(code, inst, tmp0); } - - EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) { - LowerPairedMin(result, a, b); - }); } template static D PolynomialMultiply(T lhs, T rhs) { constexpr size_t bit_size = mcl::bitsizeof; const std::bitset operand(lhs); - D res = 0; - for (size_t i = 0; i < bit_size; i++) { - if (operand[i]) { + for (size_t i = 0; i < bit_size; i++) + if (operand[i]) res ^= rhs << i; - } - } - return res; } void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const alternate = ctx.reg_alloc.ScratchXmm(code); + auto const mask = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32(); Xbyak::Label loop; @@ -3183,11 +3441,11 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const alternate = ctx.reg_alloc.ScratchXmm(code); + auto const mask = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32(); Xbyak::Label loop; @@ -3229,8 +3487,8 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::PCLMULQDQ)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]); code.pclmulqdq(xmm_a, xmm_b, 0x00); @@ -3260,7 +3518,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* ins void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::AVX512BITALG)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vpopcntb(data, data); @@ -3271,10 +3529,10 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSSE3)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm high_a = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const high_a = ctx.reg_alloc.ScratchXmm(code); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); code.movdqa(high_a, low_a); code.psrlw(high_a, 4); @@ -3303,12 +3561,12 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::GFNI)) { code.gf2p8affineqb(data, code.Const(xword, 0x8040201008040201, 0x8040201008040201), 0); } else { - const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm(code); + auto const high_nibble_reg = ctx.reg_alloc.ScratchXmm(code); code.movdqa(high_nibble_reg, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0)); code.pand(high_nibble_reg, data); code.pxor(data, high_nibble_reg); @@ -3316,7 +3574,7 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSSE3)) { // High lookup - const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm(code); + auto const high_reversed_reg = ctx.reg_alloc.ScratchXmm(code); code.movdqa(high_reversed_reg, code.Const(xword, 0xE060A020C0408000, 0xF070B030D0509010)); code.pshufb(high_reversed_reg, data); @@ -3350,8 +3608,8 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, data); code.psllw(tmp, 8); @@ -3363,13 +3621,13 @@ void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst* void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX)) { code.vpshufb(data, data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b)); } else if (code.HasHostFeature(HostFeature::SSSE3)) { code.pshufb(data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b)); } else { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, data); code.psllw(tmp, 8); code.psrlw(data, 8); @@ -3382,7 +3640,7 @@ void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst* void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pshuflw(data, data, 0b10110001); code.pshufhw(data, data, 0b10110001); ctx.reg_alloc.DefineValue(code, inst, data); @@ -3390,13 +3648,13 @@ void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::AVX)) { code.vpshufb(data, data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f)); } else if (code.HasHostFeature(HostFeature::SSSE3)) { code.pshufb(data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f)); } else { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp, data); code.psllw(tmp, 8); code.psrlw(data, 8); @@ -3410,7 +3668,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst* void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pshuflw(data, data, 0b00011011); code.pshufhw(data, data, 0b00011011); @@ -3421,7 +3679,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pshuflw(data, data, 0b01001110); code.pshufhw(data, data, 0b01001110); @@ -3432,8 +3690,8 @@ void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm temp = xmm0; + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const temp = xmm0; // Add upper elements to lower elements code.pshufd(temp, data, 0b01'00'11'10); @@ -3453,8 +3711,8 @@ void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm temp = xmm0; + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const temp = xmm0; if (code.HasHostFeature(HostFeature::SSSE3)) { code.pxor(temp, temp); @@ -3484,8 +3742,8 @@ void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm temp = xmm0; + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const temp = xmm0; // Add upper elements to lower elements(reversed) code.pshufd(temp, data, 0b00'01'10'11); @@ -3508,8 +3766,8 @@ void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm temp = xmm0; + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const temp = xmm0; // Add upper elements to lower elements code.pshufd(temp, data, 0b01'00'11'10); @@ -3524,8 +3782,8 @@ void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const operand = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); const u8 shift_amount = args[1].GetImmediateU8(); ASSERT(shift_amount % 32 == 0); const u8 shuffle_imm = std::rotr(0b11100100, shift_amount / 32 * 2); @@ -3538,12 +3796,12 @@ void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); switch (esize) { case 8: { - const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm(code); + auto const vec_128 = ctx.reg_alloc.ScratchXmm(code); code.movdqa(vec_128, code.Const(xword, 0x8080808080808080, 0x8080808080808080)); code.paddb(a, vec_128); @@ -3553,7 +3811,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I break; } case 16: { - const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm(code); + auto const vec_32768 = ctx.reg_alloc.ScratchXmm(code); code.movdqa(vec_32768, code.Const(xword, 0x8000800080008000, 0x8000800080008000)); code.paddw(a, vec_32768); @@ -3563,7 +3821,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I break; } case 32: { - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp1, a); code.por(a, b); @@ -3603,9 +3861,9 @@ static void EmitVectorRoundingHalvingAddUnsigned(size_t esize, EmitContext& ctx, case 32: { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); code.movdqa(tmp1, a); @@ -3669,18 +3927,18 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I static_assert(esize == 32 || esize == 64); auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]); + auto const a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const b = ctx.reg_alloc.UseXmm(code, args[1]); // positive values of b are left shifts, while negative values are (positive) rounding right shifts // only the lowest byte of each element is read as the shift amount // conveniently, the behavior of bit shifts greater than element width is the same in NEON and SSE/AVX - filled with zeros - const Xbyak::Xmm shift_amount = ctx.reg_alloc.ScratchXmm(code); + auto const shift_amount = ctx.reg_alloc.ScratchXmm(code); code.vpabsb(shift_amount, b); code.vpand(shift_amount, shift_amount, code.BConst(xword, 0xFF)); // if b is positive, do a normal left shift - const Xbyak::Xmm left_shift = ctx.reg_alloc.ScratchXmm(code); + auto const left_shift = ctx.reg_alloc.ScratchXmm(code); ICODE(vpsllv)(left_shift, a, shift_amount); // if b is negative, compute the rounding right shift @@ -3691,7 +3949,7 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I // tmp = (a >> (b - 1)) & 1 // res = (a >> b) + tmp // to add the value of the last bit to be shifted off to the result of the right shift - const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code); + auto const right_shift = ctx.reg_alloc.ScratchXmm(code); code.vmovdqa(xmm0, code.BConst(xword, 1)); // find value of last bit to be shifted off @@ -3775,12 +4033,12 @@ void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pmovsxbw(a, a); ctx.reg_alloc.DefineValue(code, inst, a); } else { - const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.pxor(result, result); code.punpcklbw(result, a); code.psraw(result, 8); @@ -3791,12 +4049,12 @@ void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.pmovsxwd(a, a); ctx.reg_alloc.DefineValue(code, inst, a); } else { - const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.pxor(result, result); code.punpcklwd(result, a); code.psrad(result, 16); @@ -3806,12 +4064,12 @@ void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSE41)) { code.pmovsxdq(a, a); } else { - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); code.movaps(tmp, a); code.psrad(tmp, 31); @@ -3824,7 +4082,7 @@ void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); const Xbyak::Reg64 gpr_tmp = ctx.reg_alloc.ScratchGpr(code); code.movq(gpr_tmp, data); @@ -3833,7 +4091,7 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) { if (code.HasHostFeature(HostFeature::SSE41)) { code.pinsrq(data, gpr_tmp, 1); } else { - const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(code); + auto const xmm_tmp = ctx.reg_alloc.ScratchXmm(code); code.movq(xmm_tmp, gpr_tmp); code.punpcklqdq(data, xmm_tmp); @@ -3844,9 +4102,9 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) { static void EmitVectorSignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); // only signed 16-bit min/max are available below SSE4.1 if (code.HasHostFeature(HostFeature::SSE41) || esize == 16) { @@ -3912,11 +4170,11 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) { const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); if (upper_inst) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); if (code.HasHostFeature(HostFeature::AVX)) { code.vpmulhw(result, x, y); } else { @@ -3928,7 +4186,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) { } if (lower_inst) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); if (code.HasHostFeature(HostFeature::AVX)) { code.vpmullw(result, x, y); } else { @@ -3946,9 +4204,9 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) { - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpmulld(result, x, y); @@ -3957,16 +4215,16 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) { } if (code.HasHostFeature(HostFeature::AVX)) { - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); if (lower_inst) { - const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code); + auto const lower_result = ctx.reg_alloc.ScratchXmm(code); code.vpmulld(lower_result, x, y); ctx.reg_alloc.DefineValue(code, lower_inst, lower_result); } - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpmuldq(result, x, y); code.vpsrlq(x, x, 32); @@ -3978,12 +4236,12 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) { return; } - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + auto const sign_correction = ctx.reg_alloc.ScratchXmm(code); + auto const upper_result = ctx.reg_alloc.ScratchXmm(code); + auto const lower_result = ctx.reg_alloc.ScratchXmm(code); // calculate sign correction code.movdqa(tmp, x); @@ -4026,7 +4284,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) { static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32(); // SSE absolute value functions return an unsigned result @@ -4038,21 +4296,34 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo // or shift in sign bits to create a mask of (msb == 1 ? -1 : 0), then add to the result vector switch (esize) { case 8: { - VectorAbs8(code, ctx, data); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsb(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.pxor(temp, temp); + code.psubb(temp, data); + code.pminub(data, temp); + } code.pmovmskb(bit, data); - code.pminub(data, code.BConst<8>(xword, 0x7F)); break; } case 16: { - VectorAbs16(code, ctx, data); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsw(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.pxor(temp, temp); + code.psubw(temp, data); + code.pmaxsw(data, temp); + } code.pmovmskb(bit, data); code.and_(bit, 0xAAAA); // toggle mask bits that aren't the msb of an int16 to 0 if (code.HasHostFeature(HostFeature::SSE41)) { code.pminuw(data, code.BConst<16>(xword, 0x7FFF)); } else { - const Xbyak::Xmm tmp = xmm0; + auto const tmp = xmm0; code.movdqa(tmp, data); code.psraw(data, 15); code.paddw(data, tmp); @@ -4060,13 +4331,21 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo break; } case 32: { - VectorAbs32(code, ctx, data); + if (code.HasHostFeature(HostFeature::SSSE3)) { + code.pabsd(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(temp, data); + code.psrad(temp, 31); + code.pxor(data, temp); + code.psubd(data, temp); + } code.movmskps(bit, data); if (code.HasHostFeature(HostFeature::SSE41)) { code.pminud(data, code.BConst<32>(xword, 0x7FFFFFFF)); } else { - const Xbyak::Xmm tmp = xmm0; + auto const tmp = xmm0; code.movdqa(tmp, data); code.psrad(data, 31); code.paddd(data, tmp); @@ -4074,10 +4353,18 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo break; } case 64: { - VectorAbs64(code, ctx, data); + if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { + code.vpabsq(data, data); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + code.pshufd(temp, data, 0b11110101); + code.psrad(temp, 31); + code.pxor(data, temp); + code.psubq(data, temp); + } code.movmskpd(bit, data); - const Xbyak::Xmm tmp = xmm0; + auto const tmp = xmm0; if (code.HasHostFeature(HostFeature::SSE42)) { // create a -1 mask if msb is set code.pxor(tmp, tmp); @@ -4119,13 +4406,13 @@ template static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); code.movdqa(xmm0, y); ctx.reg_alloc.Release(y); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); switch (bit_width) { case 8: @@ -4182,7 +4469,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC switch (bit_width) { case 8: if (code.HasHostFeature(HostFeature::AVX)) { - const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); code.pcmpeqb(tmp2, tmp2); code.pxor(tmp, tmp); code.vpblendvb(xmm0, tmp, tmp2, xmm0); @@ -4262,10 +4549,10 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR template static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const upper_tmp = ctx.reg_alloc.ScratchXmm(code); + auto const lower_tmp = ctx.reg_alloc.ScratchXmm(code); if (code.HasHostFeature(HostFeature::AVX)) { code.vpmulhw(upper_tmp, x, y); @@ -4284,7 +4571,7 @@ static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitC ctx.reg_alloc.Release(x); ctx.reg_alloc.Release(y); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); if (code.HasHostFeature(HostFeature::AVX)) { if constexpr (is_rounding) { @@ -4334,10 +4621,10 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext& auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (code.HasHostFeature(HostFeature::AVX)) { - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const odds = ctx.reg_alloc.ScratchXmm(code); + auto const even = ctx.reg_alloc.ScratchXmm(code); code.vpmuldq(odds, x, y); code.vpsrlq(x, x, 32); @@ -4350,7 +4637,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext& code.vpaddq(odds, odds, odds); code.vpaddq(even, even, even); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); if constexpr (is_rounding) { code.vmovdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000)); @@ -4361,7 +4648,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext& code.vpsrlq(result, odds, 32); code.vblendps(result, result, even, 0b1010); - const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code); + auto const mask = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32(); code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); @@ -4376,11 +4663,11 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext& return; } - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + auto const sign_correction = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); // calculate sign correction code.movdqa(tmp, x); @@ -4439,8 +4726,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding32(EmitContex void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.punpcklwd(x, x); code.punpcklwd(y, y); @@ -4465,8 +4752,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); if (code.HasHostFeature(HostFeature::AVX)) { code.vpmovsxdq(x, x); @@ -4517,10 +4804,10 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm(code); + auto const src = ctx.reg_alloc.UseXmm(code, args[0]); + auto const dest = ctx.reg_alloc.ScratchXmm(code); + auto const reconstructed = ctx.reg_alloc.ScratchXmm(code); + auto const sign = ctx.reg_alloc.ScratchXmm(code); code.movdqa(dest, src); code.pxor(xmm0, xmm0); @@ -4577,9 +4864,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::In static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code); + auto const src = ctx.reg_alloc.UseXmm(code, args[0]); + auto const dest = ctx.reg_alloc.ScratchXmm(code); + auto const reconstructed = ctx.reg_alloc.ScratchXmm(code); code.movdqa(dest, src); code.pxor(xmm0, xmm0); @@ -4647,9 +4934,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR:: static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + auto const data = ctx.reg_alloc.UseXmm(code, args[0]); + auto const zero = ctx.reg_alloc.ScratchXmm(code); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Address mask = [esize, &code] { switch (esize) { case 8: @@ -4665,7 +4952,7 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo } }(); - const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) { + const auto vector_equality = [esize, &code](auto const& x, const auto& y) { switch (esize) { case 8: code.pcmpeqb(x, y); @@ -4810,33 +5097,23 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* i EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft); } -template> +template static bool VectorSignedSaturatedShiftLeftUnsigned(VectorArray& dst, const VectorArray& data, u8 shift_amount) { + using U = std::make_unsigned_t; static_assert(std::is_signed_v, "T must be signed."); - bool qc_flag = false; for (size_t i = 0; i < dst.size(); i++) { - const T element = data[i]; - const T shift = static_cast(shift_amount); - - if (element == 0) { - dst[i] = 0; - } else if (element < 0) { - dst[i] = 0; - qc_flag = true; - } else { - const U shifted = static_cast(element) << static_cast(shift); - const U shifted_test = shifted >> static_cast(shift); - - if (shifted_test != static_cast(element)) { - dst[i] = static_cast((std::numeric_limits::max)()); - qc_flag = true; - } else { - dst[i] = shifted; - } - } + auto const element = data[i]; + auto const shifted = U(element) << U(T(shift_amount)); + auto const shifted_test = shifted >> U(T(shift_amount)); + auto result = 0; + if (element > 0 && shifted_test != U(element)) + result = T((std::numeric_limits::max)()); + if (element > 0 && shifted_test == U(element)) + result = shifted; + qc_flag |= element < 0 || (element > 0 && shifted_test != U(element)); + dst[i] = result; } - return qc_flag; } @@ -4849,7 +5126,97 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned16(EmitContext& ctx, IR: } void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned32(EmitContext& ctx, IR::Inst* inst) { - EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + auto const imm8 = args[1].GetImmediateU8(); + if (code.HasHostFeature(HostFeature::AVX2)) { + auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + if (imm8 == 0) { + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.vpshufd(tmp1, tmp0, 85); + code.vpshufd(tmp2, tmp0, 238); + code.vpor(tmp1, tmp1, tmp2); + code.vpshufd(tmp2, tmp0, 255); + code.vpor(tmp2, tmp2, tmp0); + code.vpor(tmp1, tmp1, tmp2); + code.vmovd(tmp_flag.cvt32(), tmp1); + code.shr(tmp_flag.cvt32(), 31); + code.vpxor(tmp1, tmp1, tmp1); + code.vpmaxsd(tmp0, tmp0, tmp1); + } else { + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp4 = ctx.reg_alloc.ScratchXmm(code); + auto const cmp_value = u32(1ULL << 31) >> (imm8 - 1); + code.vpshufd(tmp1, tmp0, 238); + code.vpor(tmp1, tmp1, tmp0); + code.vpshufd(tmp2, tmp1, 85); + code.vpor(tmp1, tmp1, tmp2); + code.vmovd(tmp_flag.cvt32(), tmp1); + code.cmp(tmp_flag.cvt32(), cmp_value); + code.vpslld(tmp1, tmp0, imm8); + code.vpbroadcastd(tmp2, code.Const(dword, cmp_value - 2)); + code.vpbroadcastd(tmp3, code.Const(dword, cmp_value - 1)); + code.vpcmpgtd(tmp3, tmp0, tmp3); + code.vpcmpeqd(tmp4, tmp4, tmp4); + code.vpaddd(tmp0, tmp0, tmp4); + code.vpminud(tmp2, tmp0, tmp2); + code.vpcmpeqd(tmp0, tmp0, tmp2); + code.vblendvps(tmp0, tmp3, tmp1, tmp0); + code.setae(tmp_flag.cvt8()); + } + code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8()); + ctx.reg_alloc.DefineValue(code, inst, tmp0); + } else { + auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code); + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + if (imm8 == 0) { + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.pshufd(tmp1, tmp0, 85); + code.pshufd(tmp2, tmp0, 238); + code.por(tmp2, tmp1); + code.pshufd(tmp1, tmp0, 255); + code.por(tmp1, tmp0); + code.por(tmp1, tmp2); + code.movd(tmp_flag.cvt32(), tmp1); + code.shr(tmp_flag.cvt32(), 31); + code.pxor(tmp1, tmp1); + code.movdqa(tmp2, tmp0); + code.pcmpgtd(tmp2, tmp1); + code.pand(tmp0, tmp2); + } else { + auto const tmp1 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + u64 const cmp_value = u64(1ULL << 31) >> (imm8 - 1); + u64 const cmp_one = cmp_value - 1; + u64 const cmp_add = (cmp_value - 2) + 0x80000000; + code.pshufd(tmp1, tmp0, 238); + code.por(tmp1, tmp0); + code.pshufd(tmp2, tmp1, 85); + code.por(tmp2, tmp1); + code.movd(tmp_flag.cvt32(), tmp2); + code.cmp(tmp_flag.cvt32(), cmp_value); + code.movdqa(tmp1, tmp0); + code.pslld(tmp1, imm8); + code.movdqa(tmp2, tmp0); + code.pcmpgtd(tmp2, code.Const(xword, cmp_one | (cmp_one << 32), cmp_one | (cmp_one << 32))); + code.pcmpeqd(tmp3, tmp3); + code.paddd(tmp0, tmp3); + code.pxor(tmp0, code.Const(xword, 0x80000000'80000000, 0x80000000'80000000)); + code.pcmpgtd(tmp0, code.Const(xword, cmp_add | (cmp_add << 32), cmp_add | (cmp_add << 32))); + code.pand(tmp2, tmp0); + code.pandn(tmp0, tmp1); + code.por(tmp0, tmp2); + code.setae(tmp_flag.cvt8()); + } + code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8()); + ctx.reg_alloc.DefineValue(code, inst, tmp0); +// EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned); + } } void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned64(EmitContext& ctx, IR::Inst* inst) { @@ -4887,7 +5254,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { const bool is_defaults_zero = inst->GetArg(0).IsZero(); if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI)) { - const Xbyak::Xmm indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]); const u64 index_count = mcl::bit::replicate_element(static_cast(table_size * 8)); @@ -4895,43 +5262,43 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { switch (table_size) { case 1: { - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); + auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); if (is_defaults_zero) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpermb(result | k1 | T_z, indicies, xmm_table0); ctx.reg_alloc.DefineValue(code, inst, result); } else { - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vpermb(result | k1, indicies, xmm_table0); ctx.reg_alloc.DefineValue(code, inst, result); } break; } case 2: { - const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]); - const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]); + auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); if (is_defaults_zero) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpermb(result | k1 | T_z, indicies, xmm0); ctx.reg_alloc.DefineValue(code, inst, result); } else { - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vpermb(result | k1, indicies, xmm0); ctx.reg_alloc.DefineValue(code, inst, result); } break; } case 3: { - const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]); - const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]); + auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]); + auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]); code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); if (is_defaults_zero) { code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1); ctx.reg_alloc.DefineValue(code, inst, indicies); } else { - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vpermi2b(indicies, xmm0, xmm_table1); code.vmovdqu8(result | k1, indicies); ctx.reg_alloc.DefineValue(code, inst, result); @@ -4939,17 +5306,17 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { break; } case 4: { - const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]); - const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]); - const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]); + auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]); + auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]); + auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]); code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper); code.vpunpcklqdq(xmm_table1, xmm_table1, xmm_table1_upper); if (is_defaults_zero) { code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1); ctx.reg_alloc.DefineValue(code, inst, indicies); } else { - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vpermi2b(indicies, xmm0, xmm_table1); code.vmovdqu8(result | k1, indicies); ctx.reg_alloc.DefineValue(code, inst, result); @@ -4972,9 +5339,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { }; if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.xorps(result, result); code.movsd(result, xmm_table0); @@ -4986,9 +5353,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { } if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 2) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); - const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); + auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); code.punpcklqdq(xmm_table0, xmm_table0_upper); code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF)); @@ -4999,12 +5366,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { } if (code.HasHostFeature(HostFeature::SSE41) && table_size <= 2) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); + auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]); + auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]); + auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); if (table_size == 2) { - const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); code.punpcklqdq(xmm_table0, xmm_table0_upper); ctx.reg_alloc.Release(xmm_table0_upper); } @@ -5023,12 +5390,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { } if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); + auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]); { - const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); code.punpcklqdq(xmm_table0, xmm_table0_upper); ctx.reg_alloc.Release(xmm_table0_upper); } @@ -5037,7 +5404,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { code.punpcklqdq(xmm_table1, xmm0); } else { ASSERT(table_size == 4); - const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]); + auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]); code.punpcklqdq(xmm_table1, xmm_table1_upper); ctx.reg_alloc.Release(xmm_table1_upper); } @@ -5058,18 +5425,18 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { } if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); - const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]); + auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); + auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]); { - const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]); code.punpcklqdq(xmm_table0, xmm_table0_upper); ctx.reg_alloc.Release(xmm_table0_upper); } if (table_size == 4) { - const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]); + auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]); code.punpcklqdq(xmm_table1, xmm_table1_upper); ctx.reg_alloc.Release(xmm_table1_upper); } @@ -5098,37 +5465,31 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) { const u32 stack_space = static_cast(6 * 8); ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE); for (size_t i = 0; i < table_size; ++i) { - const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]); + auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]); code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value); ctx.reg_alloc.Release(table_value); } - const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]); + auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]); + auto const result = ctx.reg_alloc.ScratchXmm(code); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(code, nullptr); - code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]); code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]); code.mov(code.ABI_PARAM4.cvt32(), table_size); code.movq(qword[code.ABI_PARAM2], defaults); code.movq(qword[code.ABI_PARAM3], indicies); - - code.CallLambda( - [](const HalfVectorArray* table, HalfVectorArray& result, const HalfVectorArray& indicies, size_t table_size) { - for (size_t i = 0; i < result.size(); ++i) { - const size_t index = indicies[i] / table[0].size(); - const size_t elem = indicies[i] % table[0].size(); - if (index < table_size) { - result[i] = table[index][elem]; - } - } - }); - + code.CallLambda([](const HalfVectorArray* table, HalfVectorArray& result, const HalfVectorArray& indicies, size_t table_size) { + for (size_t i = 0; i < result.size(); ++i) { + const size_t index = indicies[i] / table[0].size(); + const size_t elem = indicies[i] % table[0].size(); + if (index < table_size) + result[i] = table[index][elem]; + } + }); code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]); ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE); - ctx.reg_alloc.DefineValue(code, inst, result); } @@ -5142,14 +5503,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector; if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan); code.vpcmpub(k2, indicies, code.BConst<8>(xword, 4 * 16), CmpInt::LessThan); // Handle vector-table 0,1 - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); + auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]); code.vpermi2b(indicies | k1, xmm_table0, xmm_table1); @@ -5157,8 +5518,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.Release(xmm_table1); // Handle vector-table 2,3 - const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]); - const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]); + auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]); + auto const xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]); code.kandnw(k1, k1, k2); code.vpermi2b(indicies | k1, xmm_table2, xmm_table3); @@ -5167,19 +5528,19 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.vmovdqu8(indicies | k2 | T_z, indicies); ctx.reg_alloc.DefineValue(code, inst, indicies); } else { - const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vmovdqu8(defaults | k2, indicies); ctx.reg_alloc.DefineValue(code, inst, defaults); } } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan); code.vpcmpub(k2, indicies, code.BConst<8>(xword, 3 * 16), CmpInt::LessThan); // Handle vector-table 0,1 - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]); + auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); + auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]); code.vpermi2b(indicies | k1, xmm_table0, xmm_table1); @@ -5187,7 +5548,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.Release(xmm_table1); // Handle vector-table 2 - const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]); + auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]); code.kandnw(k1, k1, k2); code.vpermb(indicies | k1, indicies, xmm_table2); @@ -5196,14 +5557,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.vmovdqu8(indicies | k2 | T_z, indicies); ctx.reg_alloc.DefineValue(code, inst, indicies); } else { - const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vmovdqu8(defaults | k2, indicies); ctx.reg_alloc.DefineValue(code, inst, defaults); } } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); + auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]); code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan); @@ -5211,36 +5572,36 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { code.vpermi2b(indicies | k1 | T_z, xmm_table0, xmm_table1); ctx.reg_alloc.DefineValue(code, inst, indicies); } else { - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vpermi2b(indicies, xmm_table0, xmm_table1); code.vmovdqu8(result | k1, indicies); ctx.reg_alloc.DefineValue(code, inst, result); } } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); + auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]); + auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]); code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan); if (is_defaults_zero) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpermb(result | k1 | T_z, indicies, xmm_table0); ctx.reg_alloc.DefineValue(code, inst, result); } else { - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.vpermb(result | k1, indicies, xmm_table0); ctx.reg_alloc.DefineValue(code, inst, result); } } else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); code.pshufb(xmm_table0, indicies); ctx.reg_alloc.DefineValue(code, inst, xmm_table0); } else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); + auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]); + auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]); + auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); if (code.HasHostFeature(HostFeature::AVX)) { code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); @@ -5253,9 +5614,9 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, xmm_table0); } else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); - const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); - const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]); + auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]); + auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]); + auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]); if (code.HasHostFeature(HostFeature::AVX)) { code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070)); @@ -5271,14 +5632,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, xmm_table0); return; } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code); + auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const masked = ctx.reg_alloc.ScratchXmm(code); code.vpandd(masked, indicies, code.Const(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0)); for (size_t i = 0; i < table_size; ++i) { - const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]); + auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]); const Xbyak::Opmask table_mask = k1; const u64 table_index = mcl::bit::replicate_element(i * 16); @@ -5295,15 +5656,15 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, result); } else if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code); + auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]); + auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const masked = ctx.reg_alloc.ScratchXmm(code); code.movaps(masked, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0)); code.pand(masked, indicies); for (size_t i = 0; i < table_size; ++i) { - const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]); + auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]); const u64 table_index = mcl::bit::replicate_element(i * 16); @@ -5327,13 +5688,13 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { const u32 stack_space = static_cast((table_size + 2) * 16); ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE); for (size_t i = 0; i < table_size; ++i) { - const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]); + auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]); code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value); ctx.reg_alloc.Release(table_value); } - const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]); + auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]); + auto const result = ctx.reg_alloc.ScratchXmm(code); ctx.reg_alloc.EndOfAllocScope(); ctx.reg_alloc.HostCall(code, nullptr); code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]); @@ -5360,8 +5721,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]); const bool part = args[2].GetImmediateU1(); if (!part) { @@ -5379,8 +5740,8 @@ void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]); const bool part = args[2].GetImmediateU1(); if (!part) { @@ -5398,8 +5759,8 @@ void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]); + auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const upper = ctx.reg_alloc.UseXmm(code, args[1]); const bool part = args[2].GetImmediateU1(); code.shufps(lower, upper, !part ? 0b10001000 : 0b11011101); @@ -5411,8 +5772,8 @@ void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]); + auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const upper = ctx.reg_alloc.UseXmm(code, args[1]); const bool part = args[2].GetImmediateU1(); code.shufpd(lower, upper, !part ? 0b00 : 0b11); @@ -5420,89 +5781,87 @@ void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, inst, lower); } -static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) { + +void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code); - - switch (esize) { - case 8: { - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - + if (code.HasHostFeature(HostFeature::AVX)) { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.vpminub(tmp2, tmp0, tmp1); + code.vpmaxub(tmp0, tmp0, tmp1); + code.vpsubb(tmp0, tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp0); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.movdqa(temp, x); code.psubusb(temp, y); code.psubusb(y, x); code.por(temp, y); - break; + ctx.reg_alloc.DefineValue(code, inst, temp); } - case 16: { - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); +} +void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (code.HasHostFeature(HostFeature::AVX)) { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.vpminuw(tmp2, tmp0, tmp1); + code.vpmaxuw(tmp0, tmp0, tmp1); + code.vpsubw(tmp0, tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp0); + } else { + auto const temp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); code.movdqa(temp, x); code.psubusw(temp, y); code.psubusw(y, x); code.por(temp, y); - break; + ctx.reg_alloc.DefineValue(code, inst, temp); } - case 32: - // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267 - if (code.HasHostFeature(HostFeature::SSE41)) { - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - - code.movdqa(temp, x); - code.pminud(x, y); - code.pmaxud(temp, y); - code.psubd(temp, x); - } else { - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - if (ctx.HasOptimization(OptimizationFlag::CodeSpeed)) { - // About 45 bytes - const Xbyak::Xmm temp_x = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm temp_y = ctx.reg_alloc.ScratchXmm(code); - code.pcmpeqd(temp, temp); - code.pslld(temp, 31); - code.movdqa(temp_x, x); - code.movdqa(temp_y, y); - code.paddd(temp_x, x); - code.paddd(temp_y, y); - code.pcmpgtd(temp_y, temp_x); - code.psubd(x, y); - code.pandn(temp, temp_y); - code.pxor(x, y); - code.psubd(x, y); - } else { - // Smaller code size - about 36 bytes - code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); - code.pxor(x, temp); - code.pxor(y, temp); - code.movdqa(temp, x); - code.psubd(temp, y); - code.pcmpgtd(y, x); - code.psrld(y, 1); - code.pxor(temp, y); - code.psubd(temp, y); - } - } - break; - } - - ctx.reg_alloc.DefineValue(code, inst, temp); -} - -void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) { - EmitVectorUnsignedAbsoluteDifference(8, ctx, inst, code); -} - -void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) { - EmitVectorUnsignedAbsoluteDifference(16, ctx, inst, code); } void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) { - EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + if (code.HasHostFeature(HostFeature::AVX)) { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + code.vpminud(tmp2, tmp0, tmp1); + code.vpmaxud(tmp0, tmp0, tmp1); + code.vpsubd(tmp0, tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp0); + } else if (code.HasHostFeature(HostFeature::SSE41)) { + // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267 + auto const temp = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + code.movdqa(temp, x); + code.pminud(x, y); + code.pmaxud(temp, y); + code.psubd(temp, x); + ctx.reg_alloc.DefineValue(code, inst, temp); + } else { + auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp2 = ctx.reg_alloc.ScratchXmm(code); + auto const tmp3 = ctx.reg_alloc.ScratchXmm(code); + code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000)); + code.movdqa(tmp3, tmp1); + code.pxor(tmp3, tmp2); + code.pxor(tmp2, tmp0); + code.pcmpgtd(tmp2, tmp3); + code.psubd(tmp0, tmp1); + code.pxor(tmp0, tmp2); + code.psubd(tmp2, tmp0); + //code.movdqa(tmp0, tmp2); + ctx.reg_alloc.DefineValue(code, inst, tmp2); + } } void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) { @@ -5510,11 +5869,11 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) { const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp); auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); if (upper_inst) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); if (code.HasHostFeature(HostFeature::AVX)) { code.vpmulhuw(result, x, y); } else { @@ -5526,7 +5885,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) { } if (lower_inst) { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); if (code.HasHostFeature(HostFeature::AVX)) { code.vpmullw(result, x, y); } else { @@ -5544,24 +5903,24 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) { - const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]); - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const x = ctx.reg_alloc.UseXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseXmm(code, args[1]); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpmulld(result, x, y); ctx.reg_alloc.DefineValue(code, lower_inst, result); } else if (code.HasHostFeature(HostFeature::AVX)) { - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); if (lower_inst) { - const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code); + auto const lower_result = ctx.reg_alloc.ScratchXmm(code); code.vpmulld(lower_result, x, y); ctx.reg_alloc.DefineValue(code, lower_inst, lower_result); } - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); + auto const result = ctx.reg_alloc.ScratchXmm(code); code.vpmuludq(result, x, y); code.vpsrlq(x, x, 32); @@ -5571,11 +5930,11 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(code, upper_inst, result); } else { - const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]); - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - const Xbyak::Xmm upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1}; - const Xbyak::Xmm lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1}; + auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); + auto const tmp = ctx.reg_alloc.ScratchXmm(code); + auto const upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1}; + auto const lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1}; // calculate unsigned multiply code.movdqa(tmp, x); @@ -5792,11 +6151,11 @@ void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSE41)) { code.pmovzxbw(a, a); } else { - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); code.pxor(zeros, zeros); code.punpcklbw(a, zeros); } @@ -5805,11 +6164,11 @@ void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSE41)) { code.pmovzxwd(a, a); } else { - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); code.pxor(zeros, zeros); code.punpcklwd(a, zeros); } @@ -5818,11 +6177,11 @@ void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); if (code.HasHostFeature(HostFeature::SSE41)) { code.pmovzxdq(a, a); } else { - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); code.pxor(zeros, zeros); code.punpckldq(a, zeros); } @@ -5831,8 +6190,8 @@ void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); - const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const zeros = ctx.reg_alloc.ScratchXmm(code); code.pxor(zeros, zeros); code.punpcklqdq(a, zeros); ctx.reg_alloc.DefineValue(code, inst, a); @@ -5840,7 +6199,7 @@ void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) { void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]); + auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]); code.movq(a, a); // TODO: !IsLastUse @@ -5848,7 +6207,7 @@ void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) { } void EmitX64::EmitZeroVector(EmitContext& ctx, IR::Inst* inst) { - const Xbyak::Xmm a = ctx.reg_alloc.ScratchXmm(code); + auto const a = ctx.reg_alloc.ScratchXmm(code); code.pxor(a, a); ctx.reg_alloc.DefineValue(code, inst, a); } diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index 70edfbd0bc..046ecc78d6 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -24,6 +24,7 @@ #include "dynarmic/common/fp/fpcr.h" #include "dynarmic/common/fp/info.h" #include "dynarmic/common/fp/op.h" +#include "dynarmic/common/fp/rounding_mode.h" #include "dynarmic/common/fp/util.h" #include "dynarmic/interface/optimization_flags.h" #include "dynarmic/ir/basic_block.h" @@ -93,7 +94,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std:: code.cmp(bitmask, 0); } - SharedLabel end = GenSharedLabel(), nan = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel(); code.jnz(*nan, code.T_NEAR); code.L(*end); @@ -188,23 +189,6 @@ void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) { } } -template -void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) { - const Xbyak::Xmm nan_mask = xmm0; - if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { - constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, - FpFixup::PosZero); - FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0)); - } else if (code.HasHostFeature(HostFeature::AVX)) { - FCODE(vcmpordp)(nan_mask, result, result); - FCODE(vandp)(result, result, nan_mask); - } else { - code.movaps(nan_mask, result); - FCODE(cmpordp)(nan_mask, nan_mask); - code.andps(result, nan_mask); - } -} - template void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list to_daz, Xbyak::Xmm tmp) { if (fpcr.FZ()) { @@ -1330,7 +1314,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { code.movaps(result, xmm_a); @@ -1603,7 +1587,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { code.movaps(result, GetVectorOf(code)); @@ -1776,7 +1760,7 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code); - SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel(); + SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel(); code.movaps(value, operand); @@ -1867,7 +1851,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code); - SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel(); + SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel(); MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { code.vmovaps(result, GetVectorOf(code)); @@ -2004,120 +1988,123 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) { template void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const size_t fbits = inst->GetArg(1).GetU8(); - const auto rounding = static_cast(inst->GetArg(2).GetU8()); + const auto rounding = FP::RoundingMode(inst->GetArg(2).GetU8()); [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1(); - if constexpr (fsize != 16) { - if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]); - - MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { - const int round_imm = [&] { - switch (rounding) { - case FP::RoundingMode::ToNearest_TieEven: - default: - return 0b00; - case FP::RoundingMode::TowardsPlusInfinity: - return 0b10; - case FP::RoundingMode::TowardsMinusInfinity: - return 0b01; - case FP::RoundingMode::TowardsZero: - return 0b11; - } - }(); - - const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) { - // MSVC doesn't allow us to use a [&] capture, so we have to do this instead. - (void)ctx; - - if constexpr (fsize == 32) { - code.cvttps2dq(src, src); - } else { - if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { - code.vcvttpd2qq(src, src); - } else { - const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code); - const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code); - - code.cvttsd2si(lo, src); - code.punpckhqdq(src, src); - code.cvttsd2si(hi, src); - code.movq(src, lo); - code.pinsrq(src, hi, 1); - - ctx.reg_alloc.Release(hi); - ctx.reg_alloc.Release(lo); - } - } - }; - - if (fbits != 0) { - const u64 scale_factor = fsize == 32 - ? static_cast(fbits + 127) << 23 - : static_cast(fbits + 1023) << 52; - FCODE(mulp)(src, GetVectorOf(code, scale_factor)); + if (code.HasHostFeature(HostFeature::SSE41) && fsize != 16 && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]); + MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { + const int round_imm = [&] { + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + default: + return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: + return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: + return 0b01; + case FP::RoundingMode::TowardsZero: + return 0b11; } + }(); + const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) { + // MSVC doesn't allow us to use a [&] capture, so we have to do this instead. + (void)ctx; - FCODE(roundp)(src, src, static_cast(round_imm)); - ZeroIfNaN(code, src); - - constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000; - [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000; - - if constexpr (unsigned_) { + if constexpr (fsize == 32) { + code.cvttps2dq(src, src); + } else { if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { - // Mask positive values - code.xorps(xmm0, xmm0); - FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ); - - // Convert positive values to unsigned integers, write 0 anywhere else - // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...) - if constexpr (fsize == 32) { - code.vcvttps2udq(src | k1 | T_z, src); - } else { - code.vcvttpd2uqq(src | k1 | T_z, src); - } + code.vcvttpd2qq(src, src); } else { - // Zero is minimum - code.xorps(xmm0, xmm0); - FCODE(cmplep)(xmm0, src); - FCODE(andp)(src, xmm0); + const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code); + const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code); - // Will we exceed unsigned range? - const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code); - code.movaps(exceed_unsigned, GetVectorOf(code)); - FCODE(cmplep)(exceed_unsigned, src); + code.cvttsd2si(lo, src); + code.punpckhqdq(src, src); + code.cvttsd2si(hi, src); + code.movq(src, lo); + code.pinsrq(src, hi, 1); - // Will be exceed signed range? - const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); - code.movaps(tmp, GetVectorOf(code)); - code.movaps(xmm0, tmp); - FCODE(cmplep)(xmm0, src); - FCODE(andp)(tmp, xmm0); - FCODE(subp)(src, tmp); - perform_conversion(src); - ICODE(psll)(xmm0, u8(fsize - 1)); - FCODE(orp)(src, xmm0); + ctx.reg_alloc.Release(hi); + ctx.reg_alloc.Release(lo); + } + } + }; + if (fbits != 0) { + const u64 scale_factor = fsize == 32 + ? u64(fbits + 127) << 23 + : u64(fbits + 1023) << 52; + FCODE(mulp)(src, GetVectorOf(code, scale_factor)); + } - // Saturate to max - FCODE(orp)(src, exceed_unsigned); + FCODE(roundp)(src, src, u8(round_imm)); + const Xbyak::Xmm nan_mask = xmm0; + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + static constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, FpFixup::PosZero); + FCODE(vfixupimmp)(src, src, code.BConst<32>(ptr_b, nan_to_zero), u8(0)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + FCODE(vcmpordp)(nan_mask, src, src); + FCODE(vandp)(src, src, nan_mask); + } else { + code.movaps(nan_mask, src); + FCODE(cmpordp)(nan_mask, nan_mask); + code.andps(src, nan_mask); + } + + constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000; + [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000; + + if constexpr (unsigned_) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + // Mask positive values + code.xorps(xmm0, xmm0); + FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ); + + // Convert positive values to unsigned integers, write 0 anywhere else + // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...) + if (fsize == 32) { + code.vcvttps2udq(src | k1 | T_z, src); + } else { + code.vcvttpd2uqq(src | k1 | T_z, src); } } else { - using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC - constexpr u64 integer_max = FPT((std::numeric_limits>>::max)()); - - code.movaps(xmm0, GetVectorOf(code)); + // Zero is minimum + code.xorps(xmm0, xmm0); FCODE(cmplep)(xmm0, src); - perform_conversion(src); - FCODE(blendvp)(src, GetVectorOf(code)); - } - }); + FCODE(andp)(src, xmm0); - ctx.reg_alloc.DefineValue(code, inst, src); - return; - } + // Will we exceed unsigned range? + const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code); + code.movaps(exceed_unsigned, GetVectorOf(code)); + FCODE(cmplep)(exceed_unsigned, src); + + // Will be exceed signed range? + const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code); + code.movaps(tmp, GetVectorOf(code)); + code.movaps(xmm0, tmp); + FCODE(cmplep)(xmm0, src); + FCODE(andp)(tmp, xmm0); + FCODE(subp)(src, tmp); + perform_conversion(src); + ICODE(psll)(xmm0, u8(fsize - 1)); + FCODE(orp)(src, xmm0); + + // Saturate to max + FCODE(orp)(src, exceed_unsigned); + } + } else { + using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC + constexpr u64 integer_max = FPT((std::numeric_limits>>::max)()); + code.movaps(xmm0, GetVectorOf(code)); + FCODE(cmplep)(xmm0, src); + perform_conversion(src); + FCODE(blendvp)(src, GetVectorOf(code)); + } + }); + ctx.reg_alloc.DefineValue(code, inst, src); + return; } using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC diff --git a/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp b/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp index 3ae553bccd..bae397ff2b 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later /* This file is part of the dynarmic project. @@ -176,7 +176,7 @@ struct ExceptionHandler::Impl final { code.align(16); const u8* exception_handler_without_cb = code.getCurr(); - code.mov(code.eax, static_cast(ExceptionContinueSearch)); + code.mov(code.eax, u32(ExceptionContinueSearch)); code.ret(); code.align(16); @@ -192,20 +192,18 @@ struct ExceptionHandler::Impl final { code.lea(code.rsp, code.ptr[code.rsp - 8]); code.mov(code.ABI_PARAM1, std::bit_cast(&cb)); code.mov(code.ABI_PARAM2, code.ABI_PARAM3); - code.CallLambda( - [](const std::function& cb_, PCONTEXT ctx) { - FakeCall fc = cb_(ctx->Rip); - - ctx->Rsp -= sizeof(u64); - *std::bit_cast(ctx->Rsp) = fc.ret_rip; - ctx->Rip = fc.call_rip; - }); + code.CallLambda([](const std::function& cb_, PCONTEXT ctx) { + FakeCall fc = cb_(ctx->Rip); + ctx->Rsp -= sizeof(u64); + *std::bit_cast(ctx->Rsp) = fc.ret_rip; + ctx->Rip = fc.call_rip; + }); code.add(code.rsp, 8); - code.mov(code.eax, static_cast(ExceptionContinueExecution)); + code.mov(code.eax, u32(ExceptionContinueExecution)); code.ret(); - exception_handler_without_cb_offset = static_cast(exception_handler_without_cb - code.getCode()); - exception_handler_with_cb_offset = static_cast(exception_handler_with_cb - code.getCode()); + exception_handler_without_cb_offset = ULONG(exception_handler_without_cb - code.getCode()); + exception_handler_with_cb_offset = ULONG(exception_handler_with_cb - code.getCode()); code.align(16); UNWIND_INFO* unwind_info = static_cast(code.AllocateFromCodeSpace(sizeof(UNWIND_INFO))); diff --git a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp index 5c5ed25131..2cfa14ae18 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp @@ -417,7 +417,8 @@ HostLoc RegAlloc::SelectARegister(std::bitset<32> desired_locations) const noexc // While R13 and R14 are technically available, we avoid allocating for them // at all costs, because theoretically skipping them is better than spilling // all over the place - i also fixes bugs with high reg pressure - } else if (i >= HostLoc::R13 && i <= HostLoc::R15) { + // %rbp must not be trashed, so skip it as well + } else if (i == HostLoc::RBP || (i >= HostLoc::R13 && i <= HostLoc::R15)) { // skip, do not touch // Intel recommends to reuse registers as soon as they're overwritable (DO NOT SPILL) } else if (loc_info.IsEmpty()) { diff --git a/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h b/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h index 50737f12eb..43a3fc7ab2 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h +++ b/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later /* This file is part of the dynarmic project. @@ -22,14 +22,13 @@ constexpr size_t SpillCount = 64; #endif struct alignas(16) StackLayout { + // Needs alignment for VMOV and XMM spills + alignas(16) std::array, SpillCount> spill; s64 cycles_remaining; s64 cycles_to_run; - - std::array, SpillCount> spill; - u32 save_host_MXCSR; - bool check_bit; + u64 abi_base_pointer; }; #ifdef _MSC_VER diff --git a/src/dynarmic/src/dynarmic/ir/opcodes.inc b/src/dynarmic/src/dynarmic/ir/opcodes.inc index 6f57f278a3..b1ba5b2993 100644 --- a/src/dynarmic/src/dynarmic/ir/opcodes.inc +++ b/src/dynarmic/src/dynarmic/ir/opcodes.inc @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + // First we list common shared opcodes // Since we give priority to A64 performance, we include them first, this is so we // can discard all A32 opcodes instead of having a "hole" in our checks @@ -710,6 +713,8 @@ A64OPC(ExclusiveWriteMemory32, U32, U64, A64OPC(ExclusiveWriteMemory64, U32, U64, U64, U64, AccType ) A64OPC(ExclusiveWriteMemory128, U32, U64, U64, U128, AccType ) +// Remember to update: +// - a32_emit_x64.cpp // A32 Context getters/setters A32OPC(SetCheckBit, Void, U1 ) diff --git a/src/dynarmic/tests/A64/a64.cpp b/src/dynarmic/tests/A64/a64.cpp index 4d4484e53e..e85986ea5a 100644 --- a/src/dynarmic/tests/A64/a64.cpp +++ b/src/dynarmic/tests/A64/a64.cpp @@ -415,6 +415,105 @@ TEST_CASE("A64: URSHL", "[a64]") { CHECK(jit.GetVector(9) == Vector{0x0000000000000002, 0x12db8b8280e0ba}); } +TEST_CASE("A64: SQSHLU", "[a64]") { + A64TestEnv env; + A64::UserConfig jit_user_config{}; + jit_user_config.callbacks = &env; + A64::Jit jit{jit_user_config}; + + oaknut::VectorCodeGenerator code{env.code_mem, nullptr}; + code.SQSHLU(V8.B16(), V0.B16(), 1); + code.SQSHLU(V9.H8(), V1.H8(), 2); + code.SQSHLU(V10.S4(), V2.S4(), 28); + code.SQSHLU(V11.D2(), V3.D2(), 4); + code.SQSHLU(V12.S4(), V0.S4(), 1); + code.SQSHLU(V13.S4(), V1.S4(), 3); + code.SQSHLU(V14.S4(), V2.S4(), 0); + code.SQSHLU(V15.S4(), V3.S4(), 0); + + jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f}); + jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff}); + jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f}); + jit.SetVector(3, Vector{0xffffffffffffffff, 0x96dc5c140705cd04}); + + env.ticks_left = env.code_mem.size(); + CheckedRun([&]() { jit.Run(); }); + + CHECK(jit.GetVector(8) == Vector{0x3000d4d4, 0xfe0000000076009e}); + CHECK(jit.GetVector(9) == Vector{0x2c0000003c, 0}); + CHECK(jit.GetVector(10) == Vector{0x10000000'ffffffff, 0xffffffff'ffffffff}); + CHECK(jit.GetVector(11) == Vector{0, 0}); + CHECK(jit.GetVector(12) == Vector{0x3174d4d4, 0xfffffffe00000000}); + CHECK(jit.GetVector(13) == Vector{0x5800000078, 0}); + CHECK(jit.GetVector(14) == Vector{0x1000000ff, 0x100000007f}); + CHECK(jit.GetVector(15) == Vector{0, 0x705cd04}); +} + +TEST_CASE("A64: SMIN", "[a64]") { + A64TestEnv env; + A64::UserConfig jit_user_config{}; + jit_user_config.callbacks = &env; + A64::Jit jit{jit_user_config}; + + oaknut::VectorCodeGenerator code{env.code_mem, nullptr}; + code.SMIN(V8.B16(), V0.B16(), V3.B16()); + code.SMIN(V9.H8(), V1.H8(), V2.H8()); + code.SMIN(V10.S4(), V2.S4(), V3.S4()); + code.SMIN(V11.S4(), V3.S4(), V3.S4()); + code.SMIN(V12.S4(), V0.S4(), V3.S4()); + code.SMIN(V13.S4(), V1.S4(), V2.S4()); + code.SMIN(V14.S4(), V2.S4(), V1.S4()); + code.SMIN(V15.S4(), V3.S4(), V0.S4()); + + jit.SetPC(0); + jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f}); + jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff}); + jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f}); + jit.SetVector(3, Vector{0xffffffff'ffffffff, 0x96dc5c14'0705cd04}); + + env.ticks_left = 4; + CheckedRun([&]() { jit.Run(); }); + + REQUIRE(jit.GetVector(8) == Vector{0xffffffffffbaffff, 0x96dcffff94059504}); + REQUIRE(jit.GetVector(9) == Vector{0x10000000f, 0xffffffffffffffff}); + REQUIRE(jit.GetVector(10) == Vector{0xffffffffffffffff, 0x96dc5c140000007f}); +} + +TEST_CASE("A64: SMINP", "[a64]") { + A64TestEnv env; + A64::UserConfig jit_user_config{}; + jit_user_config.callbacks = &env; + A64::Jit jit{jit_user_config}; + + oaknut::VectorCodeGenerator code{env.code_mem, nullptr}; + code.SMINP(V8.B16(), V0.B16(), V3.B16()); + code.SMINP(V9.H8(), V1.H8(), V2.H8()); + code.SMINP(V10.S4(), V2.S4(), V1.S4()); + code.SMINP(V11.S4(), V3.S4(), V3.S4()); + code.SMINP(V12.S4(), V0.S4(), V3.S4()); + code.SMINP(V13.S4(), V1.S4(), V2.S4()); + code.SMINP(V14.S4(), V2.S4(), V1.S4()); + code.SMINP(V15.S4(), V3.S4(), V0.S4()); + + jit.SetPC(0); + jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f}); + jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff}); + jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f}); + jit.SetVector(3, Vector{0xffffffff'ffffffff, 0x96dc5c14'0705cd04}); + + env.ticks_left = 4; + CheckedRun([&]() { jit.Run(); }); + + REQUIRE(jit.GetVector(8) == Vector{0xffff9495ffffba6a, 0x961405cdffffffff}); + REQUIRE(jit.GetVector(9) == Vector{0xffffffff00000000, 0}); + REQUIRE(jit.GetVector(10) == Vector{0x1000000001, 0xffffffff0000000b}); + REQUIRE(jit.GetVector(11) == Vector{0x96dc5c14ffffffff, 0x96dc5c14ffffffff}); + REQUIRE(jit.GetVector(12) == Vector{0x943b954fffffffff, 0x96dc5c14ffffffff}); + REQUIRE(jit.GetVector(13) == Vector{0xffffffff0000000b, 0x1000000001}); + REQUIRE(jit.GetVector(14) == Vector{0x1000000001, 0xffffffff0000000b}); + REQUIRE(jit.GetVector(15) == Vector{0x96dc5c14ffffffff, 0x943b954fffffffff}); +} + TEST_CASE("A64: XTN", "[a64]") { A64TestEnv env; A64::UserConfig jit_user_config{}; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index e989bf6b31..75fbcaa968 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -49,7 +49,7 @@ using VideoCore::Surface::PixelFormatFromDepthFormat; using VideoCore::Surface::PixelFormatFromRenderTargetFormat; constexpr size_t NUM_STAGES = Maxwell::MaxShaderStage; -constexpr size_t MAX_IMAGE_ELEMENTS = 64; +constexpr size_t INLINE_IMAGE_ELEMENTS = 64; DescriptorLayoutBuilder MakeBuilder(const Device& device, std::span infos) { DescriptorLayoutBuilder builder{device}; @@ -264,7 +264,11 @@ GraphicsPipeline::GraphicsPipeline( stage_infos[stage] = *info; enabled_uniform_buffer_masks[stage] = info->constant_buffer_mask; std::ranges::copy(info->constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin()); + num_image_elements += Shader::NumDescriptors(info->texture_buffer_descriptors); + num_image_elements += Shader::NumDescriptors(info->image_buffer_descriptors); num_textures += Shader::NumDescriptors(info->texture_descriptors); + num_image_elements += Shader::NumDescriptors(info->texture_descriptors); + num_image_elements += Shader::NumDescriptors(info->image_descriptors); } fragment_has_color0_output = stage_infos[NUM_STAGES - 1].stores_frag_color[0]; auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool, pipeline_statistics] { @@ -310,10 +314,10 @@ void GraphicsPipeline::AddTransition(GraphicsPipeline* transition) { template bool GraphicsPipeline::ConfigureImpl(bool is_indexed) { - std::array views; - std::array samplers; - size_t sampler_index{}; - size_t view_index{}; + small_vector views; + small_vector samplers; + views.reserve(num_image_elements); + samplers.reserve(num_textures); texture_cache.SynchronizeGraphicsDescriptors(); @@ -358,11 +362,11 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) { const auto add_image{[&](const auto& desc, bool blacklist) LAMBDA_FORCEINLINE { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - views[view_index++] = { + views.push_back({ .index = handle.first, .blacklist = blacklist, .id = {} - }; + }); } }}; if constexpr (Spec::has_texture_buffers) { @@ -378,10 +382,10 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) { for (const auto& desc : info.texture_descriptors) { for (u32 index = 0; index < desc.count; ++index) { const auto handle{read_handle(desc, index)}; - views[view_index++] = {handle.first}; + views.push_back({handle.first}); VideoCommon::SamplerId sampler{texture_cache.GetGraphicsSamplerId(handle.second)}; - samplers[sampler_index++] = sampler; + samplers.push_back(sampler); } } if constexpr (Spec::has_images) { @@ -407,7 +411,9 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) { if constexpr (Spec::enabled_stages[4]) { config_stage(4); } - texture_cache.FillGraphicsImageViews(std::span(views.data(), view_index)); + ASSERT(views.size() == num_image_elements); + ASSERT(samplers.size() == num_textures); + texture_cache.FillGraphicsImageViews(std::span(views.data(), views.size())); VideoCommon::ImageViewInOut* texture_buffer_it{views.data()}; const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE { @@ -501,7 +507,8 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) { buffer_cache.any_buffer_uploaded = false; } texture_cache.UpdateRenderTargets(false); - texture_cache.CheckFeedbackLoop(views); + texture_cache.CheckFeedbackLoop(std::span{views.data(), + views.size()}); ConfigureDraw(rescaling, render_area); return true; @@ -987,7 +994,7 @@ void GraphicsPipeline::Validate() { num_images += Shader::NumDescriptors(info.texture_descriptors); num_images += Shader::NumDescriptors(info.image_descriptors); } - ASSERT(num_images <= MAX_IMAGE_ELEMENTS); + ASSERT(num_images == num_image_elements); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 34941d6e8d..1a41e50a36 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project // SPDX-License-Identifier: GPL-3.0-or-later // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project @@ -159,6 +159,7 @@ private: std::array stage_infos; std::array enabled_uniform_buffer_masks{}; VideoCommon::UniformBufferSizes uniform_buffer_sizes{}; + size_t num_image_elements{}; u32 num_textures{}; bool fragment_has_color0_output{};