diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index fe7fb5983f..7e4bf456d2 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -226,7 +226,7 @@ void ArmDynarmic64::MakeJit(Common::PageTable* page_table, std::size_t address_s
         config.only_detect_misalignment_via_page_table_on_page_boundary = true;
 
         config.fastmem_pointer = page_table->fastmem_arena ?
-            std::optional<uintptr_t>{reinterpret_cast<uintptr_t>(page_table->fastmem_arena)} :
+            std::optional<uintptr_t>{uintptr_t(page_table->fastmem_arena)} :
             std::nullopt;
         config.fastmem_address_space_bits = std::uint32_t(address_space_bits);
         config.silently_mirror_fastmem = false;
diff --git a/src/core/hle/service/jit/jit_context.cpp b/src/core/hle/service/jit/jit_context.cpp
index 522d849e6f..447d36a0d0 100644
--- a/src/core/hle/service/jit/jit_context.cpp
+++ b/src/core/hle/service/jit/jit_context.cpp
@@ -48,6 +48,8 @@ public:
           mapped_ranges{mapped_ranges_}, parent{parent_} {}
 
     std::optional<std::uint32_t> MemoryReadCode(VAddr vaddr) override {
+        if (!memory.IsValidVirtualAddressRange(vaddr, sizeof(u32)))
+            return std::nullopt;
         static_assert(Core::Memory::YUZU_PAGESIZE == Dynarmic::CODE_PAGE_SIZE);
         auto const aligned_vaddr = vaddr & ~Core::Memory::YUZU_PAGEMASK;
         if (last_code_addr != aligned_vaddr) {
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
index 80f0f9cc2f..dd9e9e4a66 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
@@ -59,8 +59,10 @@ static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) {
     UNREACHABLE();
 }
 
-A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
-        : EmitContext(reg_alloc, block), conf(conf) {}
+A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector<Xbyak::Label>& shared_labels)
+    : EmitContext(reg_alloc, block, shared_labels)
+    , conf(conf)
+{}
 
 A32::LocationDescriptor A32EmitContext::Location() const {
     return A32::LocationDescriptor{block.Location()};
@@ -109,35 +111,59 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
             gprs.reset(size_t(HostLoc::R14));
         return gprs;
     }(), any_xmm);
-    A32EmitContext ctx{conf, reg_alloc, block};
+
+    A32EmitContext ctx{conf, reg_alloc, block, shared_labels};
 
     // Start emitting.
     code.align();
     const u8* const entrypoint = code.getCurr();
+    code.mov(code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)], rbp);
+    code.lea(rbp, code.ptr[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer) - 8]);
 
     EmitCondPrelude(ctx);
-
-    for (auto iter = block.instructions.begin(); iter != block.instructions.end(); ++iter) [[likely]] {
-        auto* inst = &*iter;
-        // Call the relevant Emit* member function.
-        switch (inst->GetOpcode()) {
-#define OPCODE(name, type, ...)                     \
-        case IR::Opcode::name:                  \
-            A32EmitX64::Emit##name(ctx, inst);  \
-            break;
-#define A32OPC(name, type, ...)                     \
-        case IR::Opcode::A32##name:             \
-            A32EmitX64::EmitA32##name(ctx, inst);\
-            break;
+    typedef void (EmitX64::*EmitHandlerFn)(EmitContext& context, IR::Inst* inst);
+    constexpr EmitHandlerFn opcode_handlers[] = {
+#define OPCODE(name, type, ...) &EmitX64::Emit##name,
+#define A32OPC(name, type, ...)
+#define A64OPC(name, type, ...)
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+    };
+    typedef void (A32EmitX64::*A32EmitHandlerFn)(A32EmitContext& context, IR::Inst* inst);
+    constexpr A32EmitHandlerFn a32_handlers[] = {
+#define OPCODE(...)
+#define A32OPC(name, type, ...) &A32EmitX64::EmitA32##name,
 #define A64OPC(...)
 #include "dynarmic/ir/opcodes.inc"
 #undef OPCODE
 #undef A32OPC
+#undef A64OPC
+    };
+
+    for (auto& inst : block.instructions) {
+        auto const opcode = inst.GetOpcode();
+        // Classify the opcode; the matching Emit* member function is then invoked through the handler tables.
+        switch (opcode) {
+#define OPCODE(name, type, ...) case IR::Opcode::name: goto opcode_branch;
+#define A32OPC(name, type, ...) case IR::Opcode::A32##name: goto a32_branch;
+#define A64OPC(name, type, ...)
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
 #undef A64OPC
         default:
             UNREACHABLE();
         }
-        reg_alloc.EndOfAllocScope();
+opcode_branch:
+        (this->*opcode_handlers[size_t(opcode)])(ctx, &inst);
+        goto finish_this_inst;
+a32_branch:
+        // a32_handlers is indexed relative to the first A32 opcode; update A32SetCheckBit below if that ever changes.
+        (this->*a32_handlers[size_t(opcode) - size_t(IR::Opcode::A32SetCheckBit)])(ctx, &inst);
+finish_this_inst:
+        ctx.reg_alloc.EndOfAllocScope();
 #ifndef NDEBUG
         if (conf.very_verbose_debugging_output)
             EmitVerboseDebuggingOutput(reg_alloc);
@@ -146,15 +172,14 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
 
     reg_alloc.AssertNoMoreUses();
 
-    if (conf.enable_cycle_counting) {
+    if (conf.enable_cycle_counting)
         EmitAddCycles(block.CycleCount());
-    }
+    code.mov(rbp, code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)]);
     EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
     code.int3();
 
-    for (auto& deferred_emit : ctx.deferred_emits) {
+    for (auto& deferred_emit : ctx.deferred_emits)
         deferred_emit();
-    }
     code.int3();
 
     const size_t size = size_t(code.getCurr() - entrypoint);
@@ -167,6 +192,7 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
 
     auto const bdesc = RegisterBlock(descriptor, entrypoint, size);
     code.DisableWriting();
+    shared_labels.clear();
     return bdesc;
 }
 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
index 5ec78ff50e..8e97dc7737 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
 /* This file is part of the dynarmic project.
@@ -29,7 +29,7 @@ namespace Dynarmic::Backend::X64 {
 class RegAlloc;
 
 struct A32EmitContext final : public EmitContext {
-    A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
+    A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector<Xbyak::Label>& shared_labels);
 
     A32::LocationDescriptor Location() const;
     A32::LocationDescriptor EndLocation() const;
@@ -130,6 +130,7 @@ public:
     ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
     ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
     ankerl::unordered_dense::set<DoNotFastmemMarker> do_not_fastmem;
+    boost::container::stable_vector<Xbyak::Label> shared_labels;
     void (*memory_read_128)() = nullptr;   // Dummy
     void (*memory_write_128)() = nullptr;  // Dummy
     const void* terminal_handler_pop_rsb_hint;
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
index 832cfdcce2..8edeb29aed 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
@@ -37,8 +37,10 @@ namespace Dynarmic::Backend::X64 {
 
 using namespace Xbyak::util;
 
-A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
-        : EmitContext(reg_alloc, block), conf(conf) {}
+A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector<Xbyak::Label>& shared_labels)
+    : EmitContext(reg_alloc, block, shared_labels)
+    , conf(conf)
+{}
 
 A64::LocationDescriptor A64EmitContext::Location() const {
     return A64::LocationDescriptor{block.Location()};
@@ -83,11 +85,14 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept {
             gprs.reset(size_t(HostLoc::R14));
         return gprs;
     }(), any_xmm};
-    A64EmitContext ctx{conf, reg_alloc, block};
+
+    A64EmitContext ctx{conf, reg_alloc, block, shared_labels};
 
     // Start emitting.
     code.align();
     const auto* const entrypoint = code.getCurr();
+    code.mov(code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)], rbp);
+    code.lea(rbp, code.ptr[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer) - 8]);
 
     DEBUG_ASSERT(block.GetCondition() == IR::Cond::AL);
     typedef void (EmitX64::*EmitHandlerFn)(EmitContext& context, IR::Inst* inst);
@@ -139,16 +144,13 @@ finish_this_inst:
     }
 
     reg_alloc.AssertNoMoreUses();
-
-    if (conf.enable_cycle_counting) {
+    if (conf.enable_cycle_counting)
         EmitAddCycles(block.CycleCount());
-    }
+    code.mov(rbp, code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)]);
     EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
     code.int3();
-
-    for (auto& deferred_emit : ctx.deferred_emits) {
+    for (auto& deferred_emit : ctx.deferred_emits)
         deferred_emit();
-    }
     code.int3();
 
     const size_t size = size_t(code.getCurr() - entrypoint);
@@ -161,6 +163,7 @@ finish_this_inst:
 
     auto bdesc = RegisterBlock(descriptor, entrypoint, size);
     code.DisableWriting();
+    shared_labels.clear();
     return bdesc;
 }
 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
index dd556e36ce..d57b1d81b9 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
 /* This file is part of the dynarmic project.
@@ -27,7 +27,7 @@
 namespace Dynarmic::Backend::X64 {
 
 struct A64EmitContext final : public EmitContext {
-    A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
+    A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector<Xbyak::Label>& shared_labels);
 
     A64::LocationDescriptor Location() const;
     bool IsSingleStep() const;
@@ -126,6 +126,7 @@ public:
     ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> write_fallbacks;
     ankerl::unordered_dense::map<std::tuple<bool, size_t, int, int>, void (*)()> exclusive_write_fallbacks;
     ankerl::unordered_dense::set<DoNotFastmemMarker> do_not_fastmem;
+    boost::container::stable_vector<Xbyak::Label> shared_labels;
     const void* terminal_handler_pop_rsb_hint = nullptr;
     const void* terminal_handler_fast_dispatch_hint = nullptr;
     FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr;
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
index 4e515fef2f..4ed198e09f 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
@@ -32,8 +32,11 @@ namespace Dynarmic::Backend::X64 {
 
 using namespace Xbyak::util;
 
-EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block)
-        : reg_alloc(reg_alloc), block(block) {}
+EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector<Xbyak::Label>& shared_labels)
+    : reg_alloc(reg_alloc)
+    , block(block)
+    , shared_labels(shared_labels)
+{}
 
 EmitContext::~EmitContext() = default;
 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h
index 301f4ffc89..619945e19a 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h
@@ -16,11 +16,12 @@
 #include <type_traits>
 #include <vector>
 
-#include "dynarmic/mcl/bit.hpp"
 #include <ankerl/unordered_dense.h>
-#include "dynarmic/backend/x64/xbyak.h"
+#include <boost/container/stable_vector.hpp>
 #include <boost/container/small_vector.hpp>
 
+#include "dynarmic/backend/x64/xbyak.h"
+#include "dynarmic/mcl/bit.hpp"
 #include "dynarmic/backend/exception_handler.h"
 #include "dynarmic/backend/x64/reg_alloc.h"
 #include "dynarmic/common/fp/fpcr.h"
@@ -52,24 +53,23 @@ using VectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T>>
 template<typename T>
 using HalfVectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof<T> / 2>;
 
+using SharedLabel = Xbyak::Label*;
 struct EmitContext {
-    EmitContext(RegAlloc& reg_alloc, IR::Block& block);
+    EmitContext(RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector<Xbyak::Label>& shared_labels);
     virtual ~EmitContext();
     virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0;
     virtual bool HasOptimization(OptimizationFlag flag) const = 0;
 
-    RegAlloc& reg_alloc;
-    IR::Block& block;
+    [[nodiscard]] inline Xbyak::Label* GenSharedLabel() noexcept {
+        return &shared_labels.emplace_back();
+    }
 
     std::vector<std::function<void()>> deferred_emits;
+    RegAlloc& reg_alloc;
+    IR::Block& block;
+    boost::container::stable_vector<Xbyak::Label>& shared_labels;
 };
 
-using SharedLabel = std::shared_ptr<Xbyak::Label>;
-
-inline SharedLabel GenSharedLabel() {
-    return std::make_shared<Xbyak::Label>();
-}
-
 class EmitX64 {
 public:
     struct BlockDescriptor {
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
index d073991fbe..6a3ab005f3 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@@ -136,7 +136,7 @@ void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
 
 template<size_t fsize>
 SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) {
-    SharedLabel nan = GenSharedLabel(), end = GenSharedLabel();
+    SharedLabel nan = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
     FCODE(ucomis)(a, a);
     code.jp(*nan, code.T_NEAR);
@@ -251,7 +251,7 @@ template<size_t fsize, typename Function>
 void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    SharedLabel end = GenSharedLabel();
+    SharedLabel end = ctx.GenSharedLabel();
 
     Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
@@ -304,7 +304,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
     const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
     const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
 
-    SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+    SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
 
     code.movaps(result, op1);
     if constexpr (std::is_member_function_pointer_v<Function>) {
@@ -413,7 +413,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo
 
     DenormalsAreZero<fsize>(code, ctx, {result, operand});
 
-    SharedLabel equal = GenSharedLabel(), end = GenSharedLabel();
+    SharedLabel equal = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
     FCODE(ucomis)(result, operand);
     code.jz(*equal, code.T_NEAR);
@@ -484,7 +484,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::
             }
         };
 
-        SharedLabel end = GenSharedLabel(), z = GenSharedLabel();
+        SharedLabel end = ctx.GenSharedLabel(), z = ctx.GenSharedLabel();
 
         FCODE(ucomis)(op1, op2);
         code.jz(*z, code.T_NEAR);
@@ -632,7 +632,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo
         }
 
         if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
-            SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
+            SharedLabel fallback = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
             const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
             const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@@ -843,7 +843,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
     const Xbyak::Reg64 tmp = do_default_nan ? INVALID_REG : ctx.reg_alloc.ScratchGpr(code);
 
-    SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+    SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
 
     if (code.HasHostFeature(HostFeature::AVX)) {
         FCODE(vmuls)(result, op1, op2);
@@ -981,7 +981,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
         }
 
         if (code.HasHostFeature(HostFeature::FMA)) {
-            SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+            SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
 
             const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
             const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@@ -1129,7 +1129,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
         const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code);
         [[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
 
-        SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
+        SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
         code.movaps(value, operand);
 
@@ -1296,7 +1296,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
         }
 
         if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
-            SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+            SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
 
             const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
             const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@@ -1641,7 +1641,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                 const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(code);
 
                 if (!unsigned_) {
-                    SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel();
+                    SharedLabel saturate_max = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
                     ZeroIfNaN<64>(code, src, scratch);
 
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
index 54fc595214..4fa14d504b 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
@@ -86,7 +86,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
 
     const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
 
-    SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+    SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
     if (fastmem_marker) {
         // Use fastmem
@@ -108,7 +108,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
                     conf.recompile_on_fastmem_failure,
                 });
 
-            EmitCheckMemoryAbort(ctx, inst, end.get());
+            EmitCheckMemoryAbort(ctx, inst, end);
             code.jmp(*end, code.T_NEAR);
         });
     } else {
@@ -120,7 +120,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
         ctx.deferred_emits.emplace_back([=, this, &ctx] {
             code.L(*abort);
             code.call(wrapped_fn);
-            EmitCheckMemoryAbort(ctx, inst, end.get());
+            EmitCheckMemoryAbort(ctx, inst, end);
             code.jmp(*end, code.T_NEAR);
         });
     }
@@ -173,7 +173,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
 
     const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
 
-    SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+    SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
     if (fastmem_marker) {
         // Use fastmem
@@ -195,7 +195,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
                     conf.recompile_on_fastmem_failure,
                 });
 
-            EmitCheckMemoryAbort(ctx, inst, end.get());
+            EmitCheckMemoryAbort(ctx, inst, end);
             code.jmp(*end, code.T_NEAR);
         });
     } else {
@@ -207,7 +207,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
         ctx.deferred_emits.emplace_back([=, this, &ctx] {
             code.L(*abort);
             code.call(wrapped_fn);
-            EmitCheckMemoryAbort(ctx, inst, end.get());
+            EmitCheckMemoryAbort(ctx, inst, end);
             code.jmp(*end, code.T_NEAR);
         });
     }
@@ -352,7 +352,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in
 
     const auto fastmem_marker = ShouldFastmem(ctx, inst);
     if (fastmem_marker) {
-        SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+        SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
         bool require_abort_handling = false;
 
         const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
@@ -427,7 +427,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
 
     EmitExclusiveLock(code, conf, tmp, tmp2.cvt32());
 
-    SharedLabel end = GenSharedLabel();
+    SharedLabel end = ctx.GenSharedLabel();
 
     code.mov(status, u32(1));
     code.movzx(tmp.cvt32(), code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)]);
@@ -460,7 +460,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
 
     const auto fastmem_marker = ShouldFastmem(ctx, inst);
     if (fastmem_marker) {
-        SharedLabel abort = GenSharedLabel();
+        SharedLabel abort = ctx.GenSharedLabel();
         bool require_abort_handling = false;
 
         const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
index b354efcb51..3ac078f1d7 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
@@ -54,7 +54,7 @@ void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsi
     if (ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
         const u32 page_align_mask = static_cast<u32>(page_table_const_size - 1) & ~align_mask;
 
-        SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel();
+        SharedLabel detect_boundary = ctx.GenSharedLabel(), resume = ctx.GenSharedLabel();
 
         code.jnz(*detect_boundary, code.T_NEAR);
         code.L(*resume);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
index a0fd944041..6f53580997 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -38,33 +38,21 @@ template<typename Function>
 static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
 
     (code.*fn)(xmm_a, xmm_b);
 
     ctx.reg_alloc.DefineValue(code, inst, xmm_a);
 }
 
-template<typename Function>
-static void EmitAVXVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-
-    (code.*fn)(xmm_a, xmm_a, xmm_b);
-
-    ctx.reg_alloc.DefineValue(code, inst, xmm_a);
-}
-
 template<typename Lambda>
 static void EmitOneArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
     const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
     constexpr u32 stack_space = 2 * 16;
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
     ctx.reg_alloc.EndOfAllocScope();
 
     ctx.reg_alloc.HostCall(code, nullptr);
@@ -86,8 +74,8 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
     const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
     constexpr u32 stack_space = 2 * 16;
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
     ctx.reg_alloc.EndOfAllocScope();
 
     ctx.reg_alloc.HostCall(code, nullptr);
@@ -111,9 +99,9 @@ static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
     const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
     constexpr u32 stack_space = 3 * 16;
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
     ctx.reg_alloc.EndOfAllocScope();
 
     ctx.reg_alloc.HostCall(code, nullptr);
@@ -139,9 +127,9 @@ static void EmitTwoArgumentFallbackWithSaturationAndImmediate(BlockOfCode& code,
     const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
     constexpr u32 stack_space = 2 * 16;
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
     const u8 arg2 = args[1].GetImmediateU8();
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
     ctx.reg_alloc.EndOfAllocScope();
 
     ctx.reg_alloc.HostCall(code, nullptr);
@@ -166,9 +154,9 @@ static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Ins
     const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
     constexpr u32 stack_space = 3 * 16;
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
     ctx.reg_alloc.EndOfAllocScope();
 
     ctx.reg_alloc.HostCall(code, nullptr);
@@ -194,7 +182,7 @@ void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
 
     // TODO: DefineValue directly on Argument for index == 0
 
-    const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
     const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
@@ -218,7 +206,7 @@ void EmitX64::EmitVectorGetElement16(EmitContext& ctx, IR::Inst* inst) {
 
     // TODO: DefineValue directly on Argument for index == 0
 
-    const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
     const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
     code.pextrw(dest, source, index);
     ctx.reg_alloc.DefineValue(code, inst, dest);
@@ -234,10 +222,10 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) {
     const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
         code.pextrd(dest, source, index);
     } else {
-        const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         code.pshufd(source, source, index);
         code.movd(dest, source);
     }
@@ -253,7 +241,7 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
     if (index == 0) {
         // TODO: DefineValue directly on Argument for index == 0
         const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64();
-        const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
         code.movq(dest, source);
         ctx.reg_alloc.DefineValue(code, inst, dest);
         return;
@@ -262,10 +250,10 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
     const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64();
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
         code.pextrq(dest, source, 1);
     } else {
-        const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         code.punpckhqdq(source, source);
         code.movq(dest, source);
     }
@@ -277,7 +265,7 @@ void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
-    const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
         const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt8();
@@ -310,7 +298,7 @@ void EmitX64::EmitVectorSetElement16(EmitContext& ctx, IR::Inst* inst) {
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
 
-    const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const Xbyak::Reg16 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt16();
 
     code.pinsrw(source_vector, source_elem.cvt32(), index);
@@ -322,7 +310,7 @@ void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
-    const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
         const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt32();
@@ -345,7 +333,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
-    const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
         const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]);
@@ -355,7 +343,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
         ctx.reg_alloc.DefineValue(code, inst, source_vector);
     } else {
         const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
         code.movq(tmp, source_elem);
 
@@ -369,72 +357,53 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
     }
 }
 
-static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
-    if (code.HasHostFeature(HostFeature::SSSE3)) {
-        code.pabsb(data, data);
-    } else {
-        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
-        code.pxor(temp, temp);
-        code.psubb(temp, data);
-        code.pminub(data, temp);
-    }
-}
-
-static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
-    if (code.HasHostFeature(HostFeature::SSSE3)) {
-        code.pabsw(data, data);
-    } else {
-        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
-        code.pxor(temp, temp);
-        code.psubw(temp, data);
-        code.pmaxsw(data, temp);
-    }
-}
-
-static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
-    if (code.HasHostFeature(HostFeature::SSSE3)) {
-        code.pabsd(data, data);
-    } else {
-        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
-        code.movdqa(temp, data);
-        code.psrad(temp, 31);
-        code.pxor(data, temp);
-        code.psubd(data, temp);
-    }
-}
-
-static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        code.vpabsq(data, data);
-    } else {
-        const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
-        code.pshufd(temp, data, 0b11110101);
-        code.psrad(temp, 31);
-        code.pxor(data, temp);
-        code.psubq(data, temp);
-    }
-}
-
 static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); // modified in place; defined as the result below
     switch (esize) {
     case 8:
-        VectorAbs8(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::SSSE3)) { // pabsb is an SSSE3 instruction
+            code.pabsb(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.pxor(temp, temp); // temp = 0
+            code.psubb(temp, data); // temp = -data, per byte
+            code.pminub(data, temp); // unsigned min(x, -x) selects |x| for each byte
+        }
         break;
     case 16:
-        VectorAbs16(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::SSSE3)) { // pabsw is an SSSE3 instruction
+            code.pabsw(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.pxor(temp, temp); // temp = 0
+            code.psubw(temp, data); // temp = -data, per 16-bit element
+            code.pmaxsw(data, temp); // signed max(x, -x) selects |x| for each element
+        }
         break;
     case 32:
-        VectorAbs32(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::SSSE3)) { // pabsd is an SSSE3 instruction
+            code.pabsd(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.movdqa(temp, data);
+            code.psrad(temp, 31); // temp = sign mask: all ones where the element is negative
+            code.pxor(data, temp); // conditionally invert the bits ...
+            code.psubd(data, temp); // ... then subtract -1 (add 1): (x ^ m) - m == |x|
+        }
         break;
     case 64:
-        VectorAbs64(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { // 64-bit packed abs is only emitted on the AVX-512 path
+            code.vpabsq(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.pshufd(temp, data, 0b11110101); // copy the high dword of each qword into both of its dwords
+            code.psrad(temp, 31); // temp = 64-bit-wide sign mask for each qword
+            code.pxor(data, temp); // conditionally invert the bits ...
+            code.psubq(data, temp); // ... then subtract -1 (add 1): (x ^ m) - m == |x|
+        }
         break;
     }
-
     ctx.reg_alloc.DefineValue(code, inst, data);
 }
 
@@ -477,15 +446,15 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorAndNot(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]); // only read by pandn
+    auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); // overwritten by pandn with (~xmm_b & xmm_a); defined as the result
 
     code.pandn(xmm_b, xmm_a);
 
     ctx.reg_alloc.DefineValue(code, inst, xmm_b);
 }
 
-static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
+static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, auto const& result, u8 shift_amount) {
     if (code.HasHostFeature(HostFeature::GFNI)) {
         const u64 shift_matrix = shift_amount < 8
                                    ? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
@@ -494,7 +463,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const
         return;
     }
 
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.punpckhbw(tmp, result);
     code.punpcklbw(result, result);
@@ -506,7 +475,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const
 void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     ArithmeticShiftRightByte(ctx, code, result, shift_amount);
@@ -517,7 +486,7 @@ void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.psraw(result, shift_amount);
@@ -528,7 +497,7 @@ void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.psrad(result, shift_amount);
@@ -538,14 +507,14 @@ void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst)
 
 void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = (std::min)(args[1].GetImmediateU8(), u8(63));
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
         code.vpsraq(result, result, shift_amount);
     } else {
-        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
 
         const u64 sign_bit = 0x80000000'00000000u >> shift_amount;
 
@@ -660,12 +629,12 @@ void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::AVX2)) {
         code.vpbroadcastb(a, a);
         code.vmovq(a, a);
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(tmp, tmp);
         code.pshufb(a, tmp);
         code.movq(a, a);
@@ -678,7 +647,7 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     code.pshuflw(a, a, 0);
 
@@ -687,7 +656,7 @@ void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     code.pshuflw(a, a, 0b01000100);
 
@@ -696,11 +665,11 @@ void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::AVX2)) {
         code.vpbroadcastb(a, a);
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(tmp, tmp);
         code.pshufb(a, tmp);
     } else {
@@ -713,7 +682,7 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::AVX2)) {
         code.vpbroadcastw(a, a);
     } else {
@@ -725,7 +694,7 @@ void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::AVX2)) {
         code.vpbroadcastd(a, a);
     } else {
@@ -736,7 +705,7 @@ void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::AVX2)) {
         code.vpbroadcastq(a, a);
     } else {
@@ -747,7 +716,7 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
     ASSERT(index < 16);
@@ -758,7 +727,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
         code.vpbroadcastb(a, a);
         code.vmovq(a, a);
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(tmp, tmp);
         code.pshufb(a, tmp);
         code.movq(a, a);
@@ -771,7 +740,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
 
 void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
     ASSERT(index < 8);
@@ -784,7 +753,7 @@ void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst
 
 void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
     ASSERT(index < 4);
@@ -800,7 +769,7 @@ void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst
 
 void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
     ASSERT(index < 16);
@@ -810,7 +779,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::AVX2)) {
         code.vpbroadcastb(a, a);
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
         code.pxor(tmp, tmp);
         code.pshufb(a, tmp);
@@ -824,7 +793,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
     ASSERT(index < 8);
@@ -844,7 +813,7 @@ void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
     ASSERT(index < 4);
@@ -856,7 +825,7 @@ void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorBroadcastElement64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     ASSERT(args[1].IsImmediate());
     const u8 index = args[1].GetImmediateU8();
     ASSERT(index < 2);
@@ -1043,9 +1012,9 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
     code.pand(lhs, tmp);
@@ -1057,11 +1026,11 @@ void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
+        auto const zero = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(zero, zero);
 
         code.pblendw(lhs, zero, 0b10101010);
@@ -1082,8 +1051,8 @@ void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
     code.shufps(lhs, rhs, 0b10001000);
 
@@ -1092,8 +1061,8 @@ void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
     code.shufpd(lhs, rhs, 0b00);
 
@@ -1102,16 +1071,16 @@ void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.punpcklbw(lhs, rhs);
         code.pshufb(lhs, code.Const(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080));
     } else {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
         code.pand(lhs, tmp);
@@ -1126,15 +1095,15 @@ void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst)
 
 void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.punpcklwd(lhs, rhs);
         code.pshufb(lhs, code.Const(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080));
     } else {
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         code.pslld(lhs, 16);
         code.psrad(lhs, 16);
@@ -1152,8 +1121,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst
 
 void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
         // copy bytes 0:3 of rhs to lhs, zero out upper 8 bytes
@@ -1168,8 +1137,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst
 
 void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     code.psraw(lhs, 8);
     code.psraw(rhs, 8);
@@ -1180,8 +1149,8 @@ void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     code.psrad(lhs, 16);
     code.psrad(rhs, 16);
@@ -1192,8 +1161,8 @@ void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
     code.shufps(lhs, rhs, 0b11011101);
 
@@ -1202,8 +1171,8 @@ void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
     code.shufpd(lhs, rhs, 0b11);
 
@@ -1212,15 +1181,15 @@ void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.punpcklbw(lhs, rhs);
         code.pshufb(lhs, code.Const(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080));
     } else {
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         code.psraw(lhs, 8);
         code.psraw(rhs, 8);
@@ -1234,15 +1203,15 @@ void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst)
 
 void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.punpcklwd(lhs, rhs);
         code.pshufb(lhs, code.Const(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080));
     } else {
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         code.psrad(lhs, 16);
         code.psrad(rhs, 16);
@@ -1258,17 +1227,17 @@ void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst)
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const lhs = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         // copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes
         code.insertps(rhs, lhs, 0b01001100);
 
         ctx.reg_alloc.DefineValue(code, inst, rhs);
     } else {
-        const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
-        const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
+        auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const zero = ctx.reg_alloc.ScratchXmm(code);
 
         code.xorps(zero, zero);
         code.unpcklps(lhs, rhs);
@@ -1302,9 +1271,9 @@ void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.pcmpeqd(xmm_a, xmm_b);
     code.pshufd(tmp, xmm_a, 0b10110001);
@@ -1317,9 +1286,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
         code.pcmpeqq(xmm_a, xmm_b);
         code.pshufd(tmp, xmm_a, 0b01001110);
@@ -1327,9 +1296,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
 
         ctx.reg_alloc.DefineValue(code, inst, xmm_a);
     } else {
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
         code.pcmpeqd(xmm_a, xmm_b);
         code.pshufd(tmp, xmm_a, 0b10110001);
@@ -1353,16 +1322,16 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         code.palignr(xmm_b, xmm_a, position / 8);
         ctx.reg_alloc.DefineValue(code, inst, xmm_b);
         return;
     }
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     code.psrldq(xmm_a, position / 8);
     code.pslldq(xmm_b, (128 - position) / 8);
@@ -1374,13 +1343,13 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorExtractLower(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     const u8 position = args[2].GetImmediateU8();
     ASSERT(position % 8 == 0);
 
     if (position != 0) {
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.punpcklqdq(xmm_a, xmm_b);
         code.psrldq(xmm_a, position / 8);
@@ -1405,22 +1374,33 @@ void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE42)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq);
-        return;
+    } else {  // no pcmpgtq available: build the signed 64-bit a > b mask out of 32-bit compares
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, code.Const(xword, 0x80000000, 0x80000000));  // bit 31 of each half (presumably Const takes lower/upper u64 - confirm)
+        code.pxor(tmp0, tmp2);  // bias a: low dwords become comparable with the signed pcmpgtd
+        code.pxor(tmp1, tmp2);  // bias b the same way
+        code.movdqa(tmp2, tmp0);
+        code.pcmpeqd(tmp0, tmp1);  // tmp0 = per-dword (a == b)
+        code.pcmpgtd(tmp2, tmp1);  // tmp2 = per-dword (a > b)
+        code.pshufd(tmp1, tmp0, 245);  // 245 = dwords {1,1,3,3}: "high dwords equal" spread over each 64-bit lane
+        code.pshufd(tmp3, tmp2, 160);  // 160 = dwords {0,0,2,2}: "low dword greater" spread over each lane
+        code.pshufd(tmp0, tmp2, 245);  // "high dword greater" spread over each lane
+        code.pand(tmp1, tmp3);  // high equal AND low greater
+        code.por(tmp0, tmp1);  // result = high greater OR (high equal AND low greater)
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
-        for (size_t i = 0; i < result.size(); ++i) {
-            result[i] = (a[i] > b[i]) ? ~u64(0) : 0;
-        }
-    });
 }
 
 static void EmitVectorHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, b);
     code.pand(tmp, a);
@@ -1459,9 +1439,9 @@ void EmitX64::EmitVectorHalvingAddS32(EmitContext& ctx, IR::Inst* inst) {
 static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, b);
 
@@ -1504,12 +1484,12 @@ void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) {
 static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     switch (esize) {
     case 8: {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(tmp, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
         code.pxor(a, tmp);
         code.pxor(b, tmp);
@@ -1518,7 +1498,7 @@ static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst*
         break;
     }
     case 16: {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
         code.pxor(a, tmp);
         code.pxor(b, tmp);
@@ -1552,8 +1532,8 @@ void EmitX64::EmitVectorHalvingSubS32(EmitContext& ctx, IR::Inst* inst) {
 static void EmitVectorHalvingSubUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     switch (esize) {
     case 8:
@@ -1590,8 +1570,8 @@ void EmitX64::EmitVectorHalvingSubU32(EmitContext& ctx, IR::Inst* inst) {
 static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
 
     switch (size) {
     case 8:
@@ -1630,8 +1610,8 @@ void EmitX64::EmitVectorInterleaveLower64(EmitContext& ctx, IR::Inst* inst) {
 static void EmitVectorInterleaveUpper(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
 
     switch (size) {
     case 8:
@@ -1670,7 +1650,7 @@ void EmitX64::EmitVectorInterleaveUpper64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     if (shift_amount == 0) {
@@ -1696,7 +1676,7 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.psllw(result, shift_amount);
@@ -1707,7 +1687,7 @@ void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.pslld(result, shift_amount);
@@ -1718,7 +1698,7 @@ void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.psllq(result, shift_amount);
@@ -1729,7 +1709,7 @@ void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     if (shift_amount == 0) {
@@ -1753,7 +1733,7 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.psrlw(result, shift_amount);
@@ -1764,7 +1744,7 @@ void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.psrld(result, shift_amount);
@@ -1775,7 +1755,7 @@ void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const u8 shift_amount = args[1].GetImmediateU8();
 
     code.psrlq(result, shift_amount);
@@ -1783,41 +1763,12 @@ void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(code, inst, result);
 }
 
-template<size_t esize>
-static void EmitVectorLogicalVShiftAVX2(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
-    static_assert(esize == 32 || esize == 64);
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-
-    // store sign bit of lowest byte of each element of b to select left/right shift later
-    ICODE(vpsll)(xmm0, b, u8(esize - 8));
-
-    // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
-    code.vpabsb(b, b);
-    code.vpand(b, b, code.BConst<esize>(xword, 0xFF));
-
-    // calculate shifts
-    ICODE(vpsllv)(result, a, b);
-    ICODE(vpsrlv)(a, a, b);
-
-    // implicit argument: xmm0 (sign of lowest byte of b)
-    if (esize == 32) {
-        code.blendvps(result, a);
-    } else {
-        code.blendvpd(result, a);
-    }
-    ctx.reg_alloc.DefineValue(code, inst, result);
-}
-
 void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::GFNI)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
         const Xbyak::Opmask negative_mask = k1;
         code.pxor(tmp, tmp);
@@ -1862,10 +1813,10 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
         code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
         code.vpxord(right_shift, right_shift, right_shift);
@@ -1886,18 +1837,87 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::AVX2)) {
-        EmitVectorLogicalVShiftAVX2<32>(code, ctx, inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
+        auto const mask = ctx.reg_alloc.ScratchXmm(code);
+        // store sign bit of lowest byte of each element of b to select left/right shift later
+        code.vpslld(mask, b, u8(32 - 8));
+        // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
+        code.vpabsb(b, b);
+        code.vpand(b, b, code.BConst<32>(xword, 0xFF));
+        // calculate shifts
+        code.vpsllvd(result, a, b);
+        code.vpsrlvd(a, a, b);
+        code.vblendvps(result, result, a, mask);  // take the right-shifted lane wherever the mask sign bit is set
+        ctx.reg_alloc.DefineValue(code, inst, result);
     } else {
-        EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
-            std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u32>);
-        });
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);  // a; later reused for the right-shift results
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);  // b; later turned into the direction mask
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp6 = ctx.reg_alloc.ScratchXmm(code);
+        code.pxor(tmp3, tmp3);
+        code.movdqa(tmp2, tmp0);  // tmp2 keeps an untouched copy of a
+        code.psubb(tmp3, tmp1);  // tmp3 = 0 - b per byte
+        code.movdqa(tmp4, tmp2);
+        code.movdqa(tmp6, tmp2);
+        code.pminub(tmp3, tmp1);  // unsigned min(-b, b) per byte = |b| per byte
+        code.pslld(tmp1, 24);  // move the low byte of each dword of b to the top ...
+        code.pand(tmp3, code.Const(xword, 0x000000ff'000000ff, 0x000000ff'000000ff));  // shift count = low byte of each dword
+        code.psrad(tmp1, 31);  // ... and smear its sign: all-ones lanes mean "shift right"
+        code.pshuflw(tmp0, tmp3, 254);  // low qword = count of lane 1 (psrld by xmm uses one count for all lanes)
+        code.pshuflw(tmp5, tmp3, 84);  // low qword = count of lane 0
+        code.psrld(tmp4, tmp0);  // a >> count1
+        code.movdqa(tmp0, tmp2);
+        code.psrld(tmp0, tmp5);  // a >> count0
+        code.punpcklqdq(tmp0, tmp4);  // {lanes 0-1 of a >> count0, lanes 0-1 of a >> count1}
+        code.pshufd(tmp4, tmp3, 238);  // bring the counts of lanes 2 and 3 into the low qword
+        code.pslld(tmp3, 23);  // place each count in the float exponent field
+        code.paddd(tmp3, code.Const(xword, 0x3F80'00003F80'0000, 0x3F80'00003F80'0000));  // add the bits of 1.0f -> float 2^count
+        code.pshuflw(tmp5, tmp4, 254);  // low qword = count of lane 3
+        code.pshuflw(tmp4, tmp4, 84);  // low qword = count of lane 2
+        code.psrld(tmp6, tmp5);  // a >> count3
+        code.movdqa(tmp5, tmp2);
+        code.psrld(tmp5, tmp4);  // a >> count2
+        code.pshufd(tmp4, tmp2, 245);  // odd lanes of a, positioned for pmuludq
+        code.punpckhqdq(tmp5, tmp6);  // {lanes 2-3 of a >> count2, lanes 2-3 of a >> count3}
+        code.cvttps2dq(tmp3, tmp3);  // integer 2^count. NOTE(review): counts >= 32 look out of range here (0x80000000), giving a << 31 instead of 0 - confirm against VShift<u32>
+        code.shufps(tmp0, tmp5, 204);  // gather the per-lane right-shift results
+        code.pmuludq(tmp2, tmp3);  // even lanes: a * 2^count (left shift by multiplication)
+        code.pshufd(tmp3, tmp3, 245);
+        code.andps(tmp0, tmp1);  // keep right-shift results only where the mask is set
+        code.pmuludq(tmp3, tmp4);  // odd lanes: a * 2^count
+        code.pshufd(tmp2, tmp2, 232);  // pack the low 32 bits of the even-lane products
+        code.pshufd(tmp3, tmp3, 232);  // pack the low 32 bits of the odd-lane products
+        code.punpckldq(tmp2, tmp3);  // interleave back into lane order
+        code.pandn(tmp1, tmp2);  // keep left-shift results where the mask is clear
+        code.orps(tmp0, tmp1);  // merge both directions
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
 }
 
 void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::AVX2)) {
-        EmitVectorLogicalVShiftAVX2<64>(code, ctx, inst);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
+        auto const mask = ctx.reg_alloc.ScratchXmm(code);
+        // store sign bit of lowest byte of each element of b to select left/right shift later
+        code.vpsllq(mask, b, u8(64 - 8));
+        // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
+        code.vpabsb(b, b);
+        code.vpand(b, b, code.BConst<64>(xword, 0xFF));
+        // calculate shifts
+        code.vpsllvq(result, a, b);
+        code.vpsrlvq(a, a, b);
+        code.vblendvpd(result, result, a, mask);
+        ctx.reg_alloc.DefineValue(code, inst, result);
     } else {
         EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
             std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift<u64>);
@@ -1912,28 +1932,11 @@ enum class MinMaxOperation {
     Max,
 };
 
-// Compute the minimum/maximum of two vectors of signed 8-bit integers, using only SSE2 instructons.
-// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxS8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
-    if(op == MinMaxOperation::Min) {
-        code.movdqa(c, b);
-        code.pcmpgtb(c, a);
-    } else {
-        code.movdqa(c, a);
-        code.pcmpgtb(c, b);
-    }
-
-    code.pand(a, c);
-    code.pandn(c, b);
-    code.por(a, c);
-}
-
 // Compute the minimum/maximum of two vectors of unsigned 16-bit integers, using only SSE2 instructons.
 // The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
+void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
     if(op == MinMaxOperation::Min) {
-        const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(c, a);
         code.psubusw(c, b);
         code.psubw(a, c);
@@ -1945,8 +1948,8 @@ void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a,
 
 // Compute the minimum/maximum of two vectors of signed 32-bit integers, using only SSE2 instructons.
 // The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
     if(op == MinMaxOperation::Min) {
         code.movdqa(c, b);
         code.pcmpgtd(c, a);
@@ -1962,12 +1965,12 @@ void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a,
 
 // Compute the minimum/maximum of two vectors of unsigned 32-bit integers, using only SSE2 instructons.
 // The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
     code.movdqa(c, code.BConst<32>(xword, 0x80000000));
 
     // bias a and b by XORing their sign bits, then use the signed comparison function
-    const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+    auto const d = ctx.reg_alloc.ScratchXmm(code);
     if(op == MinMaxOperation::Min) {
         code.movdqa(d, a);
         code.pxor(d, c);
@@ -1989,11 +1992,16 @@ void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
-        FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Max);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(c, a);
+        code.pcmpgtb(c, b);
+        code.pand(a, c);
+        code.pandn(c, b);
+        code.por(a, c);
+        ctx.reg_alloc.DefineValue(code, inst, a);
     }
 }
 
@@ -2005,31 +2013,55 @@ void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
-        FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Max);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, tmp0);
+        code.pcmpgtd(tmp2, tmp1);
+        code.pand(tmp0, tmp2);
+        code.pandn(tmp2, tmp1);
+        code.por(tmp0, tmp2);
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
 }
 
 void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);  // shared by all three code paths below
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        code.vpmaxsq(xmm_a, xmm_a, xmm_b);  // native signed 64-bit max, result written over a
+        ctx.reg_alloc.DefineValue(code, inst, xmm_a);
     } else if (code.HasHostFeature(HostFeature::AVX)) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
         code.vpcmpgtq(xmm0, y, x);
         code.pblendvb(x, y);
-
         ctx.reg_alloc.DefineValue(code, inst, x);
     } else {
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
-    });
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);  // a; receives the result
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);  // b; only read below
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000));  // bit 31 of each half (presumably Const takes lower/upper u64 - confirm)
+        code.movdqa(tmp3, tmp1);
+        code.pxor(tmp3, tmp2);  // biased copy of b
+        code.pxor(tmp2, tmp0);  // biased copy of a
+        code.movdqa(tmp4, tmp2);
+        code.pcmpeqd(tmp2, tmp3);  // per-dword (a == b)
+        code.pcmpgtd(tmp4, tmp3);  // per-dword (a > b)
+        code.pshufd(tmp2, tmp2, 245);  // 245 = dwords {1,1,3,3}: "high dwords equal" per 64-bit lane
+        code.pshufd(tmp5, tmp4, 160);  // 160 = dwords {0,0,2,2}: "low dword greater" per lane
+        code.pshufd(tmp3, tmp4, 245);  // "high dword greater" per lane
+        code.pand(tmp2, tmp5);
+        code.por(tmp3, tmp2);  // tmp3 = 64-bit mask of lanes where a > b
+        code.pand(tmp0, tmp3);  // keep a where a > b
+        code.pandn(tmp3, tmp1);  // keep b elsewhere
+        code.por(tmp0, tmp3);  // merge into the per-lane maximum
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
 }
 
@@ -2041,11 +2073,11 @@ void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
         FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Max);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        ctx.reg_alloc.DefineValue(code, inst, a);
     }
 }
 
@@ -2053,35 +2085,54 @@ void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
         FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Max);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        ctx.reg_alloc.DefineValue(code, inst, a);
     }
 }
 
 void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);  // shared by all three code paths below
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        code.vpmaxuq(xmm_a, xmm_a, xmm_b);  // native unsigned 64-bit max, result written over a
+        ctx.reg_alloc.DefineValue(code, inst, xmm_a);
     } else if (code.HasHostFeature(HostFeature::AVX)) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
         code.vpsubq(tmp, y, xmm0);
         code.vpsubq(xmm0, x, xmm0);
         code.vpcmpgtq(xmm0, tmp, xmm0);
         code.pblendvb(x, y);
-
         ctx.reg_alloc.DefineValue(code, inst, x);
     } else {
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
-    });
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);  // a; receives the result
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);  // b; only read below
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000));  // bit 31 of every dword: biases both halves for an unsigned compare
+        code.movdqa(tmp3, tmp1);
+        code.pxor(tmp3, tmp2);  // biased copy of b
+        code.pxor(tmp2, tmp0);  // biased copy of a
+        code.movdqa(tmp4, tmp2);
+        code.pcmpeqd(tmp2, tmp3);  // per-dword (a == b)
+        code.pcmpgtd(tmp4, tmp3);  // per-dword (a > b) on the biased values
+        code.pshufd(tmp2, tmp2, 245);  // 245 = dwords {1,1,3,3}: "high dwords equal" per 64-bit lane
+        code.pshufd(tmp5, tmp4, 160);  // 160 = dwords {0,0,2,2}: "low dword greater" per lane
+        code.pshufd(tmp3, tmp4, 245);  // "high dword greater" per lane
+        code.pand(tmp2, tmp5);
+        code.por(tmp3, tmp2);  // tmp3 = 64-bit mask of lanes where a > b (unsigned)
+        code.pand(tmp0, tmp3);  // keep a where a > b
+        code.pandn(tmp3, tmp1);  // keep b elsewhere
+        code.por(tmp0, tmp3);  // merge into the per-lane maximum
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
 }
 
@@ -2089,11 +2140,16 @@ void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
-        FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Min);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(c, b);
+        code.pcmpgtb(c, a);
+        code.pand(a, c);
+        code.pandn(c, b);
+        code.por(a, c);
+        ctx.reg_alloc.DefineValue(code, inst, a);
     }
 }
 
@@ -2105,31 +2161,51 @@ void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
         FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Min);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        ctx.reg_alloc.DefineValue(code, inst, a);
     }
 }
 
 void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);  // shared by all three code paths below
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        code.vpminsq(xmm_a, xmm_a, xmm_b);  // native signed 64-bit min, result written over a
+        ctx.reg_alloc.DefineValue(code, inst, xmm_a);
     } else if (code.HasHostFeature(HostFeature::AVX)) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+        auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
         code.vpcmpgtq(xmm0, y, x);
         code.pblendvb(y, x);
-
         ctx.reg_alloc.DefineValue(code, inst, y);
     } else {
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s64>& result, const VectorArray<s64>& a, const VectorArray<s64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
-    });
+        auto const tmp0 = ctx.reg_alloc.UseXmm(code, args[0]);  // a; only read below
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);  // b; overwritten by the pand below, so it must be scratch
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000));  // bit 31 of each half (presumably Const takes lower/upper u64 - confirm)
+        code.movdqa(tmp3, tmp1);
+        code.pxor(tmp3, tmp2);  // biased copy of b
+        code.pxor(tmp2, tmp0);  // biased copy of a
+        code.movdqa(tmp4, tmp2);
+        code.pcmpeqd(tmp2, tmp3);  // per-dword (a == b)
+        code.pcmpgtd(tmp4, tmp3);  // per-dword (a > b)
+        code.pshufd(tmp3, tmp2, 245);  // 245 = dwords {1,1,3,3}: "high dwords equal" per 64-bit lane
+        code.pshufd(tmp5, tmp4, 160);  // 160 = dwords {0,0,2,2}: "low dword greater" per lane
+        code.pshufd(tmp2, tmp4, 245);  // "high dword greater" per lane
+        code.pand(tmp3, tmp5);
+        code.por(tmp2, tmp3);  // tmp2 = 64-bit mask of lanes where a > b
+        code.pand(tmp1, tmp2);  // keep b where a > b (writes tmp1)
+        code.pandn(tmp2, tmp0);  // keep a elsewhere
+        code.por(tmp2, tmp1);  // merge into the per-lane minimum
+        // The result is built in tmp2, so no copy back into tmp0 is needed.
+        ctx.reg_alloc.DefineValue(code, inst, tmp2);
     }
 }
 
@@ -2141,11 +2217,11 @@ void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
         FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Min);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        ctx.reg_alloc.DefineValue(code, inst, a);
     }
 }
 
@@ -2153,57 +2229,93 @@ void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
         FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Min);
-    ctx.reg_alloc.DefineValue(code, inst, a);
+        ctx.reg_alloc.DefineValue(code, inst, a);
     }
 }
 
 void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        code.vpminuq(xmm_a, xmm_a, xmm_b);
+        ctx.reg_alloc.DefineValue(code, inst, xmm_a);
     } else if (code.HasHostFeature(HostFeature::AVX)) {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+        auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
         code.vpsubq(tmp, y, xmm0);
         code.vpsubq(xmm0, x, xmm0);
         code.vpcmpgtq(xmm0, tmp, xmm0);
         code.pblendvb(y, x);
-
         ctx.reg_alloc.DefineValue(code, inst, y);
     } else {
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u64>& result, const VectorArray<u64>& a, const VectorArray<u64>& b) {
-        std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
-    });
+        // SSE2 fallback: unsigned 64-bit a > b built from 32-bit compares on sign-biased dwords.
+        auto const tmp0 = ctx.reg_alloc.UseXmm(code, args[0]);         // a: only ever read
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);  // b: overwritten by pand below
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+        code.movdqa(tmp3, tmp1);
+        code.pxor(tmp3, tmp2);         // tmp3 = biased b
+        code.pxor(tmp2, tmp0);         // tmp2 = biased a
+        code.movdqa(tmp4, tmp2);
+        code.pcmpeqd(tmp2, tmp3);      // per-dword a == b
+        code.pcmpgtd(tmp4, tmp3);      // per-dword a > b
+        code.pshufd(tmp3, tmp2, 245);  // high-dword equality, duplicated across each qword
+        code.pshufd(tmp5, tmp4, 160);  // low-dword greater-than, duplicated across each qword
+        code.pshufd(tmp2, tmp4, 245);  // high-dword greater-than, duplicated across each qword
+        code.pand(tmp3, tmp5);
+        code.por(tmp2, tmp3);          // tmp2 = per-qword mask of a > b
+        code.pand(tmp1, tmp2);         // b where a > b
+        code.pandn(tmp2, tmp0);        // a everywhere else
+        code.por(tmp2, tmp1);
+        ctx.reg_alloc.DefineValue(code, inst, tmp2);
     }
 }
 
 void EmitX64::EmitVectorMultiply8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(code);
-
-    // TODO: Optimize
-    code.movdqa(tmp_a, a);
-    code.movdqa(tmp_b, b);
-    code.pmullw(a, b);
-    code.psrlw(tmp_a, 8);
-    code.psrlw(tmp_b, 8);
-    code.pmullw(tmp_a, tmp_b);
-    code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
-    code.psllw(tmp_a, 8);
-    code.por(a, tmp_a);
-
-    ctx.reg_alloc.DefineValue(code, inst, a);
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);  // only a source operand in this path
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        code.vbroadcastss(tmp3, code.Const(dword, 0x00ff'00ff));
+        code.vpmullw(tmp2, tmp1, tmp0);     // low byte of each word = even-byte product
+        code.vpandn(tmp0, tmp3, tmp0);      // keep only the odd bytes of the first operand
+        code.vpand(tmp2, tmp2, tmp3);
+        code.vpmaddubsw(tmp0, tmp1, tmp0);  // odd-byte products; the even-byte terms are zero
+        code.vpsllw(tmp0, tmp0, 8);
+        code.vpor(tmp0, tmp2, tmp0);
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);  // written by punpcklbw below
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, tmp0);
+        code.movdqa(tmp3, tmp1);
+        code.movdqa(tmp4, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+        code.punpckhbw(tmp2, tmp2);  // widen the upper eight bytes of each operand to words
+        code.punpckhbw(tmp3, tmp3);
+        code.punpcklbw(tmp0, tmp0);  // widen the lower eight bytes of each operand to words
+        code.punpcklbw(tmp1, tmp1);
+        code.pmullw(tmp3, tmp2);
+        code.pmullw(tmp0, tmp1);
+        code.pand(tmp3, tmp4);       // keep the low byte of every product so packuswb cannot saturate
+        code.pand(tmp0, tmp4);
+        code.packuswb(tmp0, tmp3);
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
+    }
 }
 
 void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) {
@@ -2214,31 +2326,32 @@ void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld);
     } else {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
-    code.movdqa(tmp, a);
-    code.psrlq(a, 32);
-    code.pmuludq(tmp, b);
-    code.psrlq(b, 32);
-    code.pmuludq(a, b);
-    code.pshufd(tmp, tmp, 0b00001000);
-    code.pshufd(b, a, 0b00001000);
-    code.punpckldq(tmp, b);
-
-    ctx.reg_alloc.DefineValue(code, inst, tmp);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp, a);
+        code.psrlq(a, 32);
+        code.pmuludq(tmp, b);
+        code.psrlq(b, 32);
+        code.pmuludq(a, b);
+        code.pshufd(tmp, tmp, 0b00001000);
+        code.pshufd(b, a, 0b00001000);
+        code.punpckldq(tmp, b);
+        ctx.reg_alloc.DefineValue(code, inst, tmp);
     }
 }
 
 void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
-    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
-        EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq);
-    } else if (code.HasHostFeature(HostFeature::SSE41)) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+    if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        code.vpmullq(xmm_a, xmm_a, xmm_b);
+        ctx.reg_alloc.DefineValue(code, inst, xmm_a);
+    } else if (code.HasHostFeature(HostFeature::SSE41)) {
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
         const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr(code);
         const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(code);
 
@@ -2253,29 +2366,28 @@ void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
 
         ctx.reg_alloc.DefineValue(code, inst, a);
     } else {
-        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
 
-    code.movdqa(tmp1, a);
-    code.movdqa(tmp2, a);
-    code.movdqa(tmp3, b);
+        code.movdqa(tmp1, a);
+        code.movdqa(tmp2, a);
+        code.movdqa(tmp3, b);
 
-    code.psrlq(tmp1, 32);
-    code.psrlq(tmp3, 32);
+        code.psrlq(tmp1, 32);
+        code.psrlq(tmp3, 32);
 
-    code.pmuludq(tmp2, b);
-    code.pmuludq(tmp3, a);
-    code.pmuludq(b, tmp1);
+        code.pmuludq(tmp2, b);
+        code.pmuludq(tmp3, a);
+        code.pmuludq(b, tmp1);
 
-    code.paddq(b, tmp3);
-    code.psllq(b, 32);
-    code.paddq(tmp2, b);
+        code.paddq(b, tmp3);
+        code.psllq(b, 32);
+        code.paddq(tmp2, b);
 
-    ctx.reg_alloc.DefineValue(code, inst, tmp2);
+        ctx.reg_alloc.DefineValue(code, inst, tmp2);
     }
 }
 
@@ -2307,15 +2419,15 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpmovwb(result, a);
 
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else {
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const zeros = ctx.reg_alloc.ScratchXmm(code);
 
     code.pxor(zeros, zeros);
     code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
@@ -2328,13 +2440,13 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         code.vpmovdw(result, a);
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else {
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const zeros = ctx.reg_alloc.ScratchXmm(code);
     code.pxor(zeros, zeros);
     if (code.HasHostFeature(HostFeature::SSE41)) {
         code.pblendw(a, zeros, 0b10101010);
@@ -2352,15 +2464,15 @@ void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpmovqd(result, a);
 
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else {
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const zeros = ctx.reg_alloc.ScratchXmm(code);
 
     code.pxor(zeros, zeros);
     code.shufps(a, zeros, 0b00001000);
@@ -2373,13 +2485,13 @@ void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
+        auto const operand = ctx.reg_alloc.UseXmm(code, args[0]);
         code.vpternlogq(result, operand, operand, u8(~Tern::c));
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else {
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.ScratchXmm(code);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_b = ctx.reg_alloc.ScratchXmm(code);
     code.pcmpeqw(xmm_b, xmm_b);
     code.pxor(xmm_a, xmm_b);
     ctx.reg_alloc.DefineValue(code, inst, xmm_a);
@@ -2393,9 +2505,9 @@ void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.punpcklqdq(xmm_a, xmm_b);
     code.movdqa(tmp, xmm_a);
@@ -2411,9 +2523,9 @@ void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.punpcklqdq(xmm_a, xmm_b);
     if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -2434,9 +2546,9 @@ void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.punpcklqdq(xmm_a, xmm_b);
     if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -2456,10 +2568,10 @@ void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
+    auto const d = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(c, a);
     code.movdqa(d, b);
@@ -2478,17 +2590,17 @@ void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.phaddw(a, b);
 
         ctx.reg_alloc.DefineValue(code, inst, a);
     } else {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
+        auto const d = ctx.reg_alloc.ScratchXmm(code);
 
         code.movdqa(c, a);
         code.movdqa(d, b);
@@ -2508,17 +2620,17 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.phaddd(a, b);
 
         ctx.reg_alloc.DefineValue(code, inst, a);
     } else {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
+        auto const d = ctx.reg_alloc.ScratchXmm(code);
 
         code.movdqa(c, a);
         code.movdqa(d, b);
@@ -2535,9 +2647,9 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(c, a);
     code.punpcklqdq(a, b);
@@ -2550,8 +2662,8 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(c, a);
     code.psllw(a, 8);
@@ -2565,8 +2677,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(c, a);
     code.pslld(a, 16);
@@ -2580,18 +2692,18 @@ void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
-        const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
         code.vpsraq(c, a, 32);
         code.vpsllq(a, a, 32);
         code.vpsraq(a, a, 32);
         code.vpaddq(a, a, c);
     } else {
-        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
 
         code.movdqa(c, a);
         code.psllq(a, 32);
@@ -2613,8 +2725,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst)
 void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(c, a);
     code.psllw(a, 8);
@@ -2628,8 +2740,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst
 void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(c, a);
     code.pslld(a, 16);
@@ -2643,8 +2755,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* ins
 void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const c = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(c, a);
     code.psllq(a, 32);
@@ -2658,14 +2770,10 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* ins
 template<typename T, typename Function>
 static void PairedOperation(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y, Function fn) {
     const size_t range = x.size() / 2;
-
-    for (size_t i = 0; i < range; i++) {
+    for (size_t i = 0; i < range; i++)
         result[i] = fn(x[2 * i], x[2 * i + 1]);
-    }
-
-    for (size_t i = 0; i < range; i++) {
+    for (size_t i = 0; i < range; i++)
         result[range + i] = fn(y[2 * i], y[2 * i + 1]);
-    }
 }
 
 template<typename T, typename Function>
@@ -2686,11 +2794,6 @@ static void PairedMax(VectorArray<T>& result, const VectorArray<T>& x, const Vec
     PairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
 }
 
-template<typename T>
-static void PairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
-    PairedOperation(result, x, y, [](auto a, auto b) { return (std::min)(a, b); });
-}
-
 template<typename T>
 static void LowerPairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
     LowerPairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
@@ -2705,19 +2808,16 @@ template<typename Function>
 static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
     code.pshufb(x, tmp);
     code.pshufb(y, tmp);
-
     code.movaps(tmp, x);
     code.shufps(tmp, y, 0b01'00'01'00);
-
     code.shufps(x, y, 0b11'10'11'10);
-
     if constexpr (std::is_member_function_pointer_v<Function>) {
         (code.*fn)(x, tmp);
     } else {
@@ -2730,21 +2830,17 @@ static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Ins
 template<typename Function>
 static void EmitVectorPairedMinMaxLower8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
     code.punpcklqdq(x, y);
     code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
     code.movhlps(y, x);
     code.movq(x, x);
-
     if constexpr (std::is_member_function_pointer_v<Function>) {
         (code.*fn)(x, y);
     } else {
         fn(x, y);
     }
-
     ctx.reg_alloc.DefineValue(code, inst, x);
 }
 
@@ -2752,9 +2848,9 @@ template<typename Function>
 static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     // swap idxs 1 and 2 within 64-bit lanes so that both registers contain [even, odd, even, odd]-indexed pairs of elements
     code.pshuflw(x, x, 0b11'01'10'00);
@@ -2780,63 +2876,49 @@ static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::In
     ctx.reg_alloc.DefineValue(code, inst, x);
 }
 
-template<typename Function>
-static void EmitVectorPairedMinMaxLower16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
-    // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
-    code.pshuflw(x, x, 0b11'01'10'00);
-    code.pshuflw(y, y, 0b11'01'10'00);
-
-    // move pairs of even/odd-indexed elements into one register each
-
-    // tmp = x[0, 2], y[0, 2], 0s...
-    code.movaps(tmp, y);
-    code.insertps(tmp, x, 0b01001100);
-    // x   = x[1, 3], y[1, 3], 0s...
-    code.insertps(x, y, 0b00011100);
-
-    (code.*fn)(x, tmp);
-
-    ctx.reg_alloc.DefineValue(code, inst, x);
-}
-
-static void EmitVectorPairedMinMaxLower32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
-    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
-    // tmp = x[1], y[1], 0, 0
-    code.movaps(tmp, y);
-    code.insertps(tmp, x, 0b01001100);
-    // x   = x[0], y[0], 0, 0
-    code.insertps(x, y, 0b00011100);
-
-    (code.*fn)(x, tmp);
-
-    ctx.reg_alloc.DefineValue(code, inst, x);
-}
 
 void EmitX64::EmitVectorPairedMaxS8(EmitContext& ctx, IR::Inst* inst) {
+    // Pairwise signed-byte maximum: adjacent pairs of arg0 fill the low half, pairs of arg1 the high half.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+    // De-interleave so that tmp = {x evens, y evens} and x = {x odds, y odds}.
+    if (code.HasHostFeature(HostFeature::SSSE3)) {
+        code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+        code.pshufb(x, tmp);
+        code.pshufb(y, tmp);
+        code.movaps(tmp, x);
+        code.shufps(tmp, y, 0b01'00'01'00);
+        code.shufps(x, y, 0b11'10'11'10);
+    } else {
+        // pshufb is an SSSE3 instruction; on SSE2-only hosts sign-extend the bytes to words and re-pack.
+        // packsswb cannot saturate here because every word already lies within the s8 range.
+        auto const y_even = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp, x);
+        code.movdqa(y_even, y);
+        code.psllw(tmp, 8);
+        code.psllw(y_even, 8);
+        code.psraw(tmp, 8);
+        code.psraw(y_even, 8);
+        code.psraw(x, 8);
+        code.psraw(y, 8);
+        code.packsswb(tmp, y_even);
+        code.packsswb(x, y);
+    }
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
-        return;
-    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
-            FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max);
-        });
-        return;
+        code.pmaxsb(x, tmp);
+    } else {
+        auto const a = x;
+        auto const b = tmp;
+        auto const c = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(c, a);
+        code.pcmpgtb(c, b);
+        code.pand(a, c);
+        code.pandn(c, b);
+        code.por(a, c);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
-        PairedMax(result, a, b);
-    });
+    ctx.reg_alloc.DefineValue(code, inst, x);
 }
 
 void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) {
@@ -2846,9 +2910,9 @@ void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, x);
     code.shufps(tmp, y, 0b10001000);
@@ -2866,12 +2930,24 @@ void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedMaxU8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSSE3)) {
         EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub);
-        return;
+    } else {  // no SSSE3: split even/odd bytes with mask + shift, then take the unsigned byte max
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);  // keeps the low byte of every 16-bit lane
+        code.movdqa(tmp2, constant_00ff);
+        code.movdqa(tmp3, tmp1);
+        code.pand(tmp3, tmp2);  // tmp3 = even-indexed bytes of args[1], one per 16-bit lane
+        code.pand(tmp2, tmp0);  // tmp2 = even-indexed bytes of args[0], one per 16-bit lane
+        code.packuswb(tmp2, tmp3);  // tmp2 = even bytes of args[0] (low half) then args[1] (high half); lanes are <= 0xFF so no saturation
+        code.psrlw(tmp1, 8);  // odd-indexed bytes of args[1], one per 16-bit lane
+        code.psrlw(tmp0, 8);  // odd-indexed bytes of args[0], one per 16-bit lane
+        code.packuswb(tmp0, tmp1);  // tmp0 = odd bytes of args[0] (low half) then args[1] (high half)
+        code.pmaxub(tmp0, tmp2);  // max of each adjacent (even, odd) byte pair
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
-        PairedMax(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) {
@@ -2887,9 +2963,9 @@ void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, x);
     code.shufps(tmp, y, 0b10001000);
@@ -2907,14 +2983,15 @@ void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedMinS8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
-    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
-            FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min);
-        });
     } else {
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
-        PairedMin(result, a, b);
-    });
+        EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& a, const auto& b) {  // NOTE(review): the SSSE3 check was dropped here - confirm the helper is safe on SSE2-only hosts
+            auto const c = ctx.reg_alloc.ScratchXmm(code);  // signed-byte min without pminsb: compare, then select
+            code.movdqa(c, b);
+            code.pcmpgtb(c, a);  // c = all-ones in every byte where b > a (signed)
+            code.pand(a, c);  // keep a where a is the smaller value
+            code.pandn(c, b);  // keep b everywhere else
+            code.por(a, c);  // a = min(a, b) per signed byte
+        });
     }
 }
 
@@ -2925,9 +3002,9 @@ void EmitX64::EmitVectorPairedMinS16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, x);
     code.shufps(tmp, y, 0b10001000);
@@ -2943,12 +3020,25 @@ void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedMinU8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSSE3)) {
         EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminub);
-        return;
+    } else {  // no SSSE3: split even/odd bytes with shift + mask, then take the unsigned byte min
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);  // keeps the low byte of every 16-bit lane
+        code.movdqa(tmp2, tmp1);
+        code.psrlw(tmp2, 8);  // tmp2 = odd-indexed bytes of args[1], one per 16-bit lane
+        code.movdqa(tmp3, tmp0);
+        code.psrlw(tmp3, 8);  // tmp3 = odd-indexed bytes of args[0], one per 16-bit lane
+        code.packuswb(tmp3, tmp2);  // tmp3 = odd bytes of args[0] (low half) then args[1] (high half)
+        code.movdqa(tmp2, constant_00ff);
+        code.pand(tmp1, tmp2);  // even-indexed bytes of args[1]
+        code.pand(tmp0, tmp2);  // even-indexed bytes of args[0]
+        code.packuswb(tmp0, tmp1);  // tmp0 = even bytes of args[0] (low half) then args[1] (high half)
+        code.pminub(tmp0, tmp3);  // min of each adjacent (even, odd) byte pair
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
-        PairedMin(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) {
@@ -2964,9 +3054,9 @@ void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, x);
     code.shufps(tmp, y, 0b10001000);
@@ -2982,41 +3072,88 @@ void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorPairedMaxLowerS8(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
-        return;
-    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
-            FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max);
-        });
-        return;
+        code.punpcklqdq(x, y);  // x = low 8 bytes of args[0] followed by low 8 bytes of args[1]
+        code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));  // even-indexed and odd-indexed bytes go to separate 64-bit halves
+        code.movhlps(y, x);  // low half of y = high half of x
+        code.movq(x, x);  // clear the upper 64 bits of x
+        code.pmaxsb(x, y);  // signed max of each adjacent byte pair
+    } else {
+        // SSE2-only path: pshufb needs SSSE3, which is not guaranteed when SSE4.1 is absent.
+        // Sign-extend the even/odd bytes of every 16-bit lane, take the word max, then narrow.
+        code.punpcklqdq(x, y);  // x = low 8 bytes of args[0] followed by low 8 bytes of args[1]
+        code.movdqa(y, x);
+        code.psllw(x, 8);
+        code.psraw(x, 8);  // x = even-indexed bytes, sign-extended to 16 bits
+        code.psraw(y, 8);  // y = odd-indexed bytes, sign-extended to 16 bits
+        code.pmaxsw(x, y);  // signed max per pair; the result always fits in a signed byte
+        code.packsswb(x, x);  // narrow the 8 words to 8 bytes (no saturation can occur)
+        code.movq(x, x);  // clear the upper 64 bits
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
-        LowerPairedMax(result, a, b);
-    });
+    ctx.reg_alloc.DefineValue(code, inst, x);
 }
 
 void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+        code.pshuflw(x, x, 0b11'01'10'00);
+        code.pshuflw(y, y, 0b11'01'10'00);
+        // move pairs of even/odd-indexed elements into one register each
+        // tmp = x[1, 3], y[1, 3], 0s...
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0, 2], y[0, 2], 0s...
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pmaxsw(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        code.punpcklwd(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1,a2,b2,a3,b3 (a = args[0], b = args[1], 16-bit elements)
+        code.pshufd(tmp1, tmp0, 232);  // 0b11'10'10'00: low dwords = (a0,b0),(a2,b2)
+        code.pshuflw(tmp1, tmp1, 216);  // 0b11'01'10'00: low qword = a0,a2,b0,b2 (even elements)
+        code.pshufd(tmp0, tmp0, 231);  // 0b11'10'01'11: low dwords = (a3,b3),(a1,b1)
+        code.pshuflw(tmp0, tmp0, 114);  // 0b01'11'00'10: low qword = a1,a3,b1,b3 (odd elements)
+        code.pmaxsw(tmp0, tmp1);  // signed max of each (even, odd) pair
+        code.movq(tmp0, tmp0);  // clear the upper 64 bits
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
-        LowerPairedMax(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // tmp = x[1], y[1], 0, 0
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0], y[0], 0, 0
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pmaxsd(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        code.punpckldq(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1 (a = args[0], b = args[1], 32-bit elements)
+        code.pshufd(tmp1, tmp0, 238);  // 0b11'10'11'10: tmp1 = a1,b1,a1,b1
+        code.movdqa(tmp2, tmp0);
+        code.pcmpgtd(tmp2, tmp1);  // tmp2 = all-ones where tmp0 > tmp1 (signed)
+        code.pand(tmp0, tmp2);  // keep tmp0 where it is the larger value
+        code.pandn(tmp2, tmp1);  // keep tmp1 everywhere else
+        code.por(tmp2, tmp0);  // tmp2 = signed max per lane
+        code.movq(tmp0, tmp2);  // result = low 64 bits, upper 64 bits cleared
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
-        LowerPairedMax(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) {
@@ -3031,63 +3168,143 @@ void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorPairedMaxLowerU16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+        code.pshuflw(x, x, 0b11'01'10'00);
+        code.pshuflw(y, y, 0b11'01'10'00);
+        // move pairs of even/odd-indexed elements into one register each
+        // tmp = x[1, 3], y[1, 3], 0s...
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0, 2], y[0, 2], 0s...
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pmaxuw(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        code.punpcklwd(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1,a2,b2,a3,b3 (a = args[0], b = args[1], 16-bit elements)
+        code.pshufd(tmp1, tmp0, 232);  // 0b11'10'10'00: low dwords = (a0,b0),(a2,b2)
+        code.pshuflw(tmp1, tmp1, 216);  // 0b11'01'10'00: low qword = a0,a2,b0,b2 (even elements)
+        code.pshufd(tmp0, tmp0, 231);  // 0b11'10'01'11: low dwords = (a3,b3),(a1,b1)
+        code.pshuflw(tmp0, tmp0, 114);  // 0b01'11'00'10: low qword = a1,a3,b1,b3 (odd elements)
+        code.psubusw(tmp0, tmp1);  // saturating odd - even: 0 wherever even >= odd
+        code.paddw(tmp0, tmp1);  // add even back: unsigned max(odd, even) without pmaxuw
+        code.movq(tmp0, tmp0);  // clear the upper 64 bits
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
-        LowerPairedMax(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMaxLowerU32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // tmp = x[1], y[1], 0, 0
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0], y[0], 0, 0
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pmaxud(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        code.punpckldq(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1 (a = args[0], b = args[1], 32-bit elements)
+        code.pshufd(tmp1, tmp0, 238);  // 0b11'10'11'10: tmp1 = a1,b1,a1,b1
+        code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000));  // 0x80000000 in every 32-bit lane (sign bit)
+        code.movdqa(tmp3, tmp0);
+        code.pxor(tmp3, tmp2);  // flip sign bits so the signed compare below orders values as unsigned
+        code.pxor(tmp2, tmp1);
+        code.pcmpgtd(tmp3, tmp2);  // tmp3 = all-ones where tmp0 > tmp1 (unsigned)
+        code.pand(tmp0, tmp3);  // keep tmp0 where it is the larger value
+        code.pandn(tmp3, tmp1);  // keep tmp1 everywhere else
+        code.por(tmp3, tmp0);  // tmp3 = unsigned max per lane
+        code.movq(tmp0, tmp3);  // result = low 64 bits, upper 64 bits cleared
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
-        LowerPairedMax(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMinLowerS8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
-        return;
-    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
-        EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
-            FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min);
+    } else {  // NOTE(review): the SSSE3 check was dropped here - confirm the helper is safe on SSE2-only hosts
+        EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& a, const auto& b) {
+            auto const c = ctx.reg_alloc.ScratchXmm(code);  // signed-byte min without pminsb: compare, then select
+            code.movdqa(c, b);
+            code.pcmpgtb(c, a);  // c = all-ones in every byte where b > a (signed)
+            code.pand(a, c);  // keep a where a is the smaller value
+            code.pandn(c, b);  // keep b everywhere else
+            code.por(a, c);  // a = min(a, b) per signed byte
         });
-        return;
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
-        LowerPairedMin(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMinLowerS16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+        code.pshuflw(x, x, 0b11'01'10'00);
+        code.pshuflw(y, y, 0b11'01'10'00);
+        // move pairs of even/odd-indexed elements into one register each
+        // tmp = x[1, 3], y[1, 3], 0s...
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0, 2], y[0, 2], 0s...
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pminsw(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        code.punpcklwd(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1,a2,b2,a3,b3 (a = args[0], b = args[1], 16-bit elements)
+        code.pshufd(tmp1, tmp0, 231);  // 0b11'10'01'11: low dwords = (a3,b3),(a1,b1)
+        code.pshuflw(tmp1, tmp1, 114);  // 0b01'11'00'10: low qword = a1,a3,b1,b3 (odd elements)
+        code.pshufd(tmp0, tmp0, 232);  // 0b11'10'10'00: low dwords = (a0,b0),(a2,b2)
+        code.pshuflw(tmp0, tmp0, 216);  // 0b11'01'10'00: low qword = a0,a2,b0,b2 (even elements)
+        code.pminsw(tmp0, tmp1);  // signed min of each (even, odd) pair
+        code.movq(tmp0, tmp0);  // clear the upper 64 bits
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
-        LowerPairedMin(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMinLowerS32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // tmp = x[1], y[1], 0, 0
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0], y[0], 0, 0
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pminsd(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        code.punpckldq(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1 (a = args[0], b = args[1], 32-bit elements)
+        code.pshufd(tmp1, tmp0, 238);  // 0b11'10'11'10: tmp1 = a1,b1,a1,b1
+        code.movdqa(tmp2, tmp0);
+        code.pcmpgtd(tmp2, tmp1);  // tmp2 = all-ones where tmp0 > tmp1 (signed)
+        code.pand(tmp1, tmp2);  // keep tmp1 where it is the smaller value
+        code.pandn(tmp2, tmp0);  // keep tmp0 everywhere else
+        code.por(tmp2, tmp1);  // tmp2 = signed min per lane
+        code.movq(tmp0, tmp2);  // result = low 64 bits, upper 64 bits cleared
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
-        LowerPairedMin(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) {
@@ -3102,50 +3319,91 @@ void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitVectorPairedMinLowerU16(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+        code.pshuflw(x, x, 0b11'01'10'00);
+        code.pshuflw(y, y, 0b11'01'10'00);
+        // move pairs of even/odd-indexed elements into one register each
+        // tmp = x[1, 3], y[1, 3], 0s...
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0, 2], y[0, 2], 0s...
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pminuw(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        code.punpcklwd(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1,a2,b2,a3,b3 (a = args[0], b = args[1], 16-bit elements)
+        code.pshufd(tmp1, tmp0, 231);  // 0b11'10'01'11: low dwords = (a3,b3),(a1,b1)
+        code.pshuflw(tmp1, tmp1, 114);  // 0b01'11'00'10: low qword = a1,a3,b1,b3 (odd elements)
+        code.pshufd(tmp0, tmp0, 232);  // 0b11'10'10'00: low dwords = (a0,b0),(a2,b2)
+        code.pshuflw(tmp0, tmp0, 216);  // 0b11'01'10'00: low qword = a0,a2,b0,b2 (even elements)
+        code.movdqa(tmp2, tmp1);
+        code.psubusw(tmp2, tmp0);  // saturating odd - even: 0 wherever even >= odd
+        code.psubw(tmp1, tmp2);  // odd minus that excess: unsigned min(odd, even) without pminuw
+        code.movq(tmp0, tmp1);  // result = low 64 bits, upper 64 bits cleared
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
-        LowerPairedMin(result, a, b);
-    });
 }
 
 void EmitX64::EmitVectorPairedMinLowerU32(EmitContext& ctx, IR::Inst* inst) {
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
-        return;
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        // tmp = x[1], y[1], 0, 0
+        code.movaps(tmp, y);
+        code.insertps(tmp, x, 0b01001100);  // src dword 1 -> dst dword 0, zero dwords 2 and 3
+        // x   = x[0], y[0], 0, 0
+        code.insertps(x, y, 0b00011100);  // src dword 0 -> dst dword 1, zero dwords 2 and 3
+        code.pminud(x, tmp);
+        ctx.reg_alloc.DefineValue(code, inst, x);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        code.punpckldq(tmp0, tmp1);  // tmp0 = a0,b0,a1,b1 (a = args[0], b = args[1], 32-bit elements)
+        code.pshufd(tmp1, tmp0, 238);  // 0b11'10'11'10: tmp1 = a1,b1,a1,b1
+        code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000));  // 0x80000000 in every 32-bit lane (sign bit)
+        code.movdqa(tmp3, tmp0);
+        code.pxor(tmp3, tmp2);  // flip sign bits so the signed compare below orders values as unsigned
+        code.pxor(tmp2, tmp1);
+        code.pcmpgtd(tmp3, tmp2);  // tmp3 = all-ones where tmp0 > tmp1 (unsigned)
+        code.pand(tmp1, tmp3);  // keep tmp1 where it is the smaller value
+        code.pandn(tmp3, tmp0);  // keep tmp0 everywhere else
+        code.por(tmp3, tmp1);  // tmp3 = unsigned min per lane
+        code.movq(tmp0, tmp3);  // result = low 64 bits, upper 64 bits cleared
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
     }
-
-    EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
-        LowerPairedMin(result, a, b);
-    });
 }
 
 template<typename D, typename T>
 static D PolynomialMultiply(T lhs, T rhs) {
     constexpr size_t bit_size = mcl::bitsizeof<T>;
     const std::bitset<bit_size> operand(lhs);
-
     D res = 0;
-    for (size_t i = 0; i < bit_size; i++) {
-        if (operand[i]) {
+    for (size_t i = 0; i < bit_size; i++)  // carry-less multiply: XOR together rhs shifted by each set bit of lhs
+        if (operand[i])  // bit i of lhs contributes rhs << i to the product
             res ^= rhs << i;
-        }
-    }
-
     return res;
 }
 
 void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
+        auto const alternate = ctx.reg_alloc.ScratchXmm(code);
+        auto const mask = ctx.reg_alloc.ScratchXmm(code);
         const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32();
 
         Xbyak::Label loop;
@@ -3183,11 +3441,11 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
+        auto const alternate = ctx.reg_alloc.ScratchXmm(code);
+        auto const mask = ctx.reg_alloc.ScratchXmm(code);
         const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32();
 
         Xbyak::Label loop;
@@ -3229,8 +3487,8 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
 void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::PCLMULQDQ)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
 
         code.pclmulqdq(xmm_a, xmm_b, 0x00);
 
@@ -3260,7 +3518,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* ins
 void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::AVX512BITALG)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-        const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
         code.vpopcntb(data, data);
 
@@ -3271,10 +3529,10 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSSE3)) {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-        const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm high_a = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const high_a = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
 
         code.movdqa(high_a, low_a);
         code.psrlw(high_a, 4);
@@ -3303,12 +3561,12 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::GFNI)) {
         code.gf2p8affineqb(data, code.Const(xword, 0x8040201008040201, 0x8040201008040201), 0);
     } else {
-        const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm(code);
+        auto const high_nibble_reg = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(high_nibble_reg, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
         code.pand(high_nibble_reg, data);
         code.pxor(data, high_nibble_reg);
@@ -3316,7 +3574,7 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
 
         if (code.HasHostFeature(HostFeature::SSSE3)) {
             // High lookup
-            const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm(code);
+            auto const high_reversed_reg = ctx.reg_alloc.ScratchXmm(code);
             code.movdqa(high_reversed_reg, code.Const(xword, 0xE060A020C0408000, 0xF070B030D0509010));
             code.pshufb(high_reversed_reg, data);
 
@@ -3350,8 +3608,8 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(tmp, data);
     code.psllw(tmp, 8);
@@ -3363,13 +3621,13 @@ void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst*
 
 void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::AVX)) {
         code.vpshufb(data, data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b));
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
         code.pshufb(data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b));
     } else {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(tmp, data);
         code.psllw(tmp, 8);
         code.psrlw(data, 8);
@@ -3382,7 +3640,7 @@ void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst*
 
 void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     code.pshuflw(data, data, 0b10110001);
     code.pshufhw(data, data, 0b10110001);
     ctx.reg_alloc.DefineValue(code, inst, data);
@@ -3390,13 +3648,13 @@ void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst
 
 void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::AVX)) {
         code.vpshufb(data, data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f));
     } else if (code.HasHostFeature(HostFeature::SSSE3)) {
         code.pshufb(data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f));
     } else {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(tmp, data);
         code.psllw(tmp, 8);
         code.psrlw(data, 8);
@@ -3410,7 +3668,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst*
 void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     code.pshuflw(data, data, 0b00011011);
     code.pshufhw(data, data, 0b00011011);
@@ -3421,7 +3679,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst
 void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     code.pshuflw(data, data, 0b01001110);
     code.pshufhw(data, data, 0b01001110);
@@ -3432,8 +3690,8 @@ void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst
 void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm temp = xmm0;
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const temp = xmm0;
 
     // Add upper elements to lower elements
     code.pshufd(temp, data, 0b01'00'11'10);
@@ -3453,8 +3711,8 @@ void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm temp = xmm0;
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const temp = xmm0;
 
     if (code.HasHostFeature(HostFeature::SSSE3)) {
         code.pxor(temp, temp);
@@ -3484,8 +3742,8 @@ void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm temp = xmm0;
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const temp = xmm0;
 
     // Add upper elements to lower elements(reversed)
     code.pshufd(temp, data, 0b00'01'10'11);
@@ -3508,8 +3766,8 @@ void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm temp = xmm0;
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const temp = xmm0;
 
     // Add upper elements to lower elements
     code.pshufd(temp, data, 0b01'00'11'10);
@@ -3524,8 +3782,8 @@ void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const operand = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
     const u8 shift_amount = args[1].GetImmediateU8();
     ASSERT(shift_amount % 32 == 0);
     const u8 shuffle_imm = std::rotr<u8>(0b11100100, shift_amount / 32 * 2);
@@ -3538,12 +3796,12 @@ void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst)
 static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     switch (esize) {
     case 8: {
-        const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm(code);
+        auto const vec_128 = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(vec_128, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
 
         code.paddb(a, vec_128);
@@ -3553,7 +3811,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
         break;
     }
     case 16: {
-        const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm(code);
+        auto const vec_32768 = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(vec_32768, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
 
         code.paddw(a, vec_32768);
@@ -3563,7 +3821,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
         break;
     }
     case 32: {
-        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
         code.movdqa(tmp1, a);
 
         code.por(a, b);
@@ -3603,9 +3861,9 @@ static void EmitVectorRoundingHalvingAddUnsigned(size_t esize, EmitContext& ctx,
     case 32: {
         auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
 
         code.movdqa(tmp1, a);
 
@@ -3669,18 +3927,18 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
     static_assert(esize == 32 || esize == 64);
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
 
     // positive values of b are left shifts, while negative values are (positive) rounding right shifts
     // only the lowest byte of each element is read as the shift amount
     // conveniently, the behavior of bit shifts greater than element width is the same in NEON and SSE/AVX - filled with zeros
-    const Xbyak::Xmm shift_amount = ctx.reg_alloc.ScratchXmm(code);
+    auto const shift_amount = ctx.reg_alloc.ScratchXmm(code);
     code.vpabsb(shift_amount, b);
     code.vpand(shift_amount, shift_amount, code.BConst<esize>(xword, 0xFF));
 
     // if b is positive, do a normal left shift
-    const Xbyak::Xmm left_shift = ctx.reg_alloc.ScratchXmm(code);
+    auto const left_shift = ctx.reg_alloc.ScratchXmm(code);
     ICODE(vpsllv)(left_shift, a, shift_amount);
 
     // if b is negative, compute the rounding right shift
@@ -3691,7 +3949,7 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
     // tmp = (a >> (b - 1)) & 1
     // res = (a >> b) + tmp
     // to add the value of the last bit to be shifted off to the result of the right shift
-    const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
+    auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
     code.vmovdqa(xmm0, code.BConst<esize>(xword, 1));
 
     // find value of last bit to be shifted off
@@ -3775,12 +4033,12 @@ void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         code.pmovsxbw(a, a);
         ctx.reg_alloc.DefineValue(code, inst, a);
     } else {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(result, result);
         code.punpcklbw(result, a);
         code.psraw(result, 8);
@@ -3791,12 +4049,12 @@ void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
         code.pmovsxwd(a, a);
         ctx.reg_alloc.DefineValue(code, inst, a);
     } else {
-        const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(result, result);
         code.punpcklwd(result, a);
         code.psrad(result, 16);
@@ -3806,12 +4064,12 @@ void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
         code.pmovsxdq(a, a);
     } else {
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
         code.movaps(tmp, a);
         code.psrad(tmp, 31);
@@ -3824,7 +4082,7 @@ void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const Xbyak::Reg64 gpr_tmp = ctx.reg_alloc.ScratchGpr(code);
 
     code.movq(gpr_tmp, data);
@@ -3833,7 +4091,7 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
     if (code.HasHostFeature(HostFeature::SSE41)) {
         code.pinsrq(data, gpr_tmp, 1);
     } else {
-        const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
 
         code.movq(xmm_tmp, gpr_tmp);
         code.punpcklqdq(data, xmm_tmp);
@@ -3844,9 +4102,9 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
 
 static void EmitVectorSignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     // only signed 16-bit min/max are available below SSE4.1
     if (code.HasHostFeature(HostFeature::SSE41) || esize == 16) {
@@ -3912,11 +4170,11 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
     const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
 
     if (upper_inst) {
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         if (code.HasHostFeature(HostFeature::AVX)) {
             code.vpmulhw(result, x, y);
         } else {
@@ -3928,7 +4186,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (lower_inst) {
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         if (code.HasHostFeature(HostFeature::AVX)) {
             code.vpmullw(result, x, y);
         } else {
@@ -3946,9 +4204,9 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpmulld(result, x, y);
 
@@ -3957,16 +4215,16 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (code.HasHostFeature(HostFeature::AVX)) {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         if (lower_inst) {
-            const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+            auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
             code.vpmulld(lower_result, x, y);
             ctx.reg_alloc.DefineValue(code, lower_inst, lower_result);
         }
 
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpmuldq(result, x, y);
         code.vpsrlq(x, x, 32);
@@ -3978,12 +4236,12 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
         return;
     }
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const sign_correction = ctx.reg_alloc.ScratchXmm(code);
+    auto const upper_result = ctx.reg_alloc.ScratchXmm(code);
+    auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
 
     // calculate sign correction
     code.movdqa(tmp, x);
@@ -4026,7 +4284,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
 static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32();
 
     // SSE absolute value functions return an unsigned result
@@ -4038,21 +4296,34 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
     // or shift in sign bits to create a mask of (msb == 1 ? -1 : 0), then add to the result vector
     switch (esize) {
     case 8: {
-        VectorAbs8(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::SSSE3)) {
+            code.pabsb(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.pxor(temp, temp);
+            code.psubb(temp, data);
+            code.pminub(data, temp);
+        }
         code.pmovmskb(bit, data);
-
         code.pminub(data, code.BConst<8>(xword, 0x7F));
         break;
     }
     case 16: {
-        VectorAbs16(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::SSSE3)) {
+            code.pabsw(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.pxor(temp, temp);
+            code.psubw(temp, data);
+            code.pmaxsw(data, temp);
+        }
         code.pmovmskb(bit, data);
         code.and_(bit, 0xAAAA);  // toggle mask bits that aren't the msb of an int16 to 0
 
         if (code.HasHostFeature(HostFeature::SSE41)) {
             code.pminuw(data, code.BConst<16>(xword, 0x7FFF));
         } else {
-            const Xbyak::Xmm tmp = xmm0;
+            auto const tmp = xmm0;
             code.movdqa(tmp, data);
             code.psraw(data, 15);
             code.paddw(data, tmp);
@@ -4060,13 +4331,21 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
         break;
     }
     case 32: {
-        VectorAbs32(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::SSSE3)) {
+            code.pabsd(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.movdqa(temp, data);
+            code.psrad(temp, 31);
+            code.pxor(data, temp);
+            code.psubd(data, temp);
+        }
         code.movmskps(bit, data);
 
         if (code.HasHostFeature(HostFeature::SSE41)) {
             code.pminud(data, code.BConst<32>(xword, 0x7FFFFFFF));
         } else {
-            const Xbyak::Xmm tmp = xmm0;
+            auto const tmp = xmm0;
             code.movdqa(tmp, data);
             code.psrad(data, 31);
             code.paddd(data, tmp);
@@ -4074,10 +4353,18 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
         break;
     }
     case 64: {
-        VectorAbs64(code, ctx, data);
+        if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+            code.vpabsq(data, data);
+        } else {
+            auto const temp = ctx.reg_alloc.ScratchXmm(code);
+            code.pshufd(temp, data, 0b11110101);
+            code.psrad(temp, 31);
+            code.pxor(data, temp);
+            code.psubq(data, temp);
+        }
         code.movmskpd(bit, data);
 
-        const Xbyak::Xmm tmp = xmm0;
+        auto const tmp = xmm0;
         if (code.HasHostFeature(HostFeature::SSE42)) {
             // create a -1 mask if msb is set
             code.pxor(tmp, tmp);
@@ -4119,13 +4406,13 @@ template<size_t bit_width>
 static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
     code.movdqa(xmm0, y);
     ctx.reg_alloc.Release(y);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
 
     switch (bit_width) {
     case 8:
@@ -4182,7 +4469,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
     switch (bit_width) {
     case 8:
         if (code.HasHostFeature(HostFeature::AVX)) {
-            const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
             code.pcmpeqb(tmp2, tmp2);
             code.pxor(tmp, tmp);
             code.vpblendvb(xmm0, tmp, tmp2, xmm0);
@@ -4262,10 +4549,10 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR
 template<bool is_rounding>
 static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-    const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const upper_tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const lower_tmp = ctx.reg_alloc.ScratchXmm(code);
 
     if (code.HasHostFeature(HostFeature::AVX)) {
         code.vpmulhw(upper_tmp, x, y);
@@ -4284,7 +4571,7 @@ static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitC
     ctx.reg_alloc.Release(x);
     ctx.reg_alloc.Release(y);
 
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
 
     if (code.HasHostFeature(HostFeature::AVX)) {
         if constexpr (is_rounding) {
@@ -4334,10 +4621,10 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (code.HasHostFeature(HostFeature::AVX)) {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm(code);
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const odds = ctx.reg_alloc.ScratchXmm(code);
+        auto const even = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpmuldq(odds, x, y);
         code.vpsrlq(x, x, 32);
@@ -4350,7 +4637,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
         code.vpaddq(odds, odds, odds);
         code.vpaddq(even, even, even);
 
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         if constexpr (is_rounding) {
             code.vmovdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000));
@@ -4361,7 +4648,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
         code.vpsrlq(result, odds, 32);
         code.vblendps(result, result, even, 0b1010);
 
-        const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+        auto const mask = ctx.reg_alloc.ScratchXmm(code);
         const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32();
 
         code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
@@ -4376,11 +4663,11 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
         return;
     }
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const sign_correction = ctx.reg_alloc.ScratchXmm(code);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
 
     // calculate sign correction
     code.movdqa(tmp, x);
@@ -4439,8 +4726,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding32(EmitContex
 void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     code.punpcklwd(x, x);
     code.punpcklwd(y, y);
@@ -4465,8 +4752,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx,
 void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
     if (code.HasHostFeature(HostFeature::AVX)) {
         code.vpmovsxdq(x, x);
@@ -4517,10 +4804,10 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
 
 static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm(code);
+    auto const src = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const dest = ctx.reg_alloc.ScratchXmm(code);
+    auto const reconstructed = ctx.reg_alloc.ScratchXmm(code);
+    auto const sign = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(dest, src);
     code.pxor(xmm0, xmm0);
@@ -4577,9 +4864,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::In
 
 static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code);
+    auto const src = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const dest = ctx.reg_alloc.ScratchXmm(code);
+    auto const reconstructed = ctx.reg_alloc.ScratchXmm(code);
 
     code.movdqa(dest, src);
     code.pxor(xmm0, xmm0);
@@ -4647,9 +4934,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::
 static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
-    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+    auto const data = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const zero = ctx.reg_alloc.ScratchXmm(code);
+    auto const tmp = ctx.reg_alloc.ScratchXmm(code);
     const Xbyak::Address mask = [esize, &code] {
         switch (esize) {
         case 8:
@@ -4665,7 +4952,7 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo
         }
     }();
 
-    const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) {
+    const auto vector_equality = [esize, &code](auto const& x, const auto& y) {
         switch (esize) {
         case 8:
             code.pcmpeqb(x, y);
@@ -4810,33 +5097,23 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* i
     EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft<s64>);
 }
 
-template<typename T, typename U = std::make_unsigned_t<T>>
+// Scalar reference used by the fallback emitters: shifts every signed lane left by shift_amount and
+// saturates it to the unsigned range of the lane. Lanes <= 0 yield 0, lanes that lose bits yield
+// all ones. Returns true (QC) if any lane was negative or lost bits.
+template<typename T>
 static bool VectorSignedSaturatedShiftLeftUnsigned(VectorArray<T>& dst, const VectorArray<T>& data, u8 shift_amount) {
+    using U = std::make_unsigned_t<T>;
     static_assert(std::is_signed_v<T>, "T must be signed.");
-
     bool qc_flag = false;
     for (size_t i = 0; i < dst.size(); i++) {
-        const T element = data[i];
-        const T shift = static_cast<T>(shift_amount);
-
-        if (element == 0) {
-            dst[i] = 0;
-        } else if (element < 0) {
-            dst[i] = 0;
-            qc_flag = true;
-        } else {
-            const U shifted = static_cast<U>(element) << static_cast<U>(shift);
-            const U shifted_test = shifted >> static_cast<U>(shift);
-
-            if (shifted_test != static_cast<U>(element)) {
-                dst[i] = static_cast<T>((std::numeric_limits<U>::max)());
-                qc_flag = true;
-            } else {
-                dst[i] = shifted;
-            }
-        }
+        auto const element = data[i];
+        // Narrow back to U after the shift: integral promotion widens s8/s16 to int and would hide lost bits.
+        U const shifted = U(U(element) << shift_amount);
+        bool const saturated = element > 0 && U(shifted >> shift_amount) != U(element);
+        // Select in T rather than int so that 64-bit lanes keep their upper half.
+        dst[i] = saturated ? T((std::numeric_limits<U>::max)()) : (element > 0 ? T(shifted) : T(0));
+        qc_flag |= element < 0 || saturated;
     }
-
     return qc_flag;
 }
 
@@ -4849,7 +5126,97 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned16(EmitContext& ctx, IR:
 }
 
 void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned32(EmitContext& ctx, IR::Inst* inst) {
-    EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s32>);
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);  // inline JIT sequence: four 32-bit lanes of args[0], shifted left by the immediate in args[1]
+    auto const imm8 = args[1].GetImmediateU8();  // shift amount, known at emit time
+    if (code.HasHostFeature(HostFeature::AVX2)) {  // VEX-encoded three-operand variant
+        auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code);  // receives the saturation flag OR-ed into fpsr_qc below
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);  // input lanes; overwritten with the result
+        if (imm8 == 0) {  // no shift: only negative lanes need clamping
+            auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+            code.vpshufd(tmp1, tmp0, 85);  // 85 = 0b01'01'01'01: lane 1 copied to every lane
+            code.vpshufd(tmp2, tmp0, 238);  // 238 = 0b11'10'11'10: lanes 2,3 copied to lanes 0,1
+            code.vpor(tmp1, tmp1, tmp2);
+            code.vpshufd(tmp2, tmp0, 255);  // 255 = 0b11'11'11'11: lane 3 copied to every lane
+            code.vpor(tmp2, tmp2, tmp0);
+            code.vpor(tmp1, tmp1, tmp2);  // lane 0 of tmp1 = OR of all four input lanes
+            code.vmovd(tmp_flag.cvt32(), tmp1);
+            code.shr(tmp_flag.cvt32(), 31);  // flag = sign bit of the OR: 1 if any lane is negative
+            code.vpxor(tmp1, tmp1, tmp1);  // tmp1 = 0
+            code.vpmaxsd(tmp0, tmp0, tmp1);  // signed max with 0 clamps negative lanes to 0
+        } else {
+            auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+            auto const cmp_value = u32(1ULL << 31) >> (imm8 - 1);  // 2^(32 - imm8): smallest lane value whose shift loses bits
+            code.vpshufd(tmp1, tmp0, 238);
+            code.vpor(tmp1, tmp1, tmp0);
+            code.vpshufd(tmp2, tmp1, 85);
+            code.vpor(tmp1, tmp1, tmp2);  // lane 0 of tmp1 = OR of all four input lanes
+            code.vmovd(tmp_flag.cvt32(), tmp1);
+            code.cmp(tmp_flag.cvt32(), cmp_value);  // result is read by setae below; the vector ops in between must leave EFLAGS untouched
+            code.vpslld(tmp1, tmp0, imm8);  // tmp1 = lanes << imm8
+            code.vpbroadcastd(tmp2, code.Const(dword, cmp_value - 2));
+            code.vpbroadcastd(tmp3, code.Const(dword, cmp_value - 1));
+            code.vpcmpgtd(tmp3, tmp0, tmp3);  // tmp3 = all ones where lane > cmp_value - 1 (signed), else 0
+            code.vpcmpeqd(tmp4, tmp4, tmp4);  // tmp4 = all ones (-1 per lane)
+            code.vpaddd(tmp0, tmp0, tmp4);  // tmp0 = lanes - 1
+            code.vpminud(tmp2, tmp0, tmp2);  // tmp2 = unsigned min(lanes - 1, cmp_value - 2)
+            code.vpcmpeqd(tmp0, tmp0, tmp2);  // mask: lanes - 1 <= cmp_value - 2 (unsigned), i.e. 1 <= lane < cmp_value
+            code.vblendvps(tmp0, tmp3, tmp1, tmp0);  // masked lanes take the shifted value, the rest take tmp3 (all ones or 0)
+            code.setae(tmp_flag.cvt8());  // flag byte = 1 when the OR of all lanes >= cmp_value (unsigned)
+        }
+        code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8());  // OR the flag into the fpsr_qc byte of the JIT state
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
+    } else {  // same algorithm using two-operand, non-VEX instructions
+        auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code);  // receives the saturation flag OR-ed into fpsr_qc below
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);  // input lanes; overwritten with the result
+        if (imm8 == 0) {  // no shift: only negative lanes need clamping
+            auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+            code.pshufd(tmp1, tmp0, 85);  // lane 1 copied to every lane
+            code.pshufd(tmp2, tmp0, 238);  // lanes 2,3 copied to lanes 0,1
+            code.por(tmp2, tmp1);
+            code.pshufd(tmp1, tmp0, 255);  // lane 3 copied to every lane
+            code.por(tmp1, tmp0);
+            code.por(tmp1, tmp2);  // lane 0 of tmp1 = OR of all four input lanes
+            code.movd(tmp_flag.cvt32(), tmp1);
+            code.shr(tmp_flag.cvt32(), 31);  // flag = sign bit of the OR: 1 if any lane is negative
+            code.pxor(tmp1, tmp1);  // tmp1 = 0
+            code.movdqa(tmp2, tmp0);
+            code.pcmpgtd(tmp2, tmp1);  // tmp2 = all ones where lane > 0 (signed)
+            code.pand(tmp0, tmp2);  // keep positive lanes, zero the rest
+        } else {
+            auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+            auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+            u64 const cmp_value = u64(1ULL << 31) >> (imm8 - 1);  // 2^(32 - imm8): smallest lane value whose shift loses bits
+            u64 const cmp_one = cmp_value - 1;  // largest lane value that still fits after the shift
+            u64 const cmp_add = (cmp_value - 2) + 0x80000000;  // cmp_value - 2 biased by 0x80000000 to match the pxor below
+            code.pshufd(tmp1, tmp0, 238);
+            code.por(tmp1, tmp0);
+            code.pshufd(tmp2, tmp1, 85);
+            code.por(tmp2, tmp1);  // lane 0 of tmp2 = OR of all four input lanes
+            code.movd(tmp_flag.cvt32(), tmp2);
+            code.cmp(tmp_flag.cvt32(), cmp_value);  // result is read by setae below; the vector ops in between must leave EFLAGS untouched
+            code.movdqa(tmp1, tmp0);
+            code.pslld(tmp1, imm8);  // tmp1 = lanes << imm8
+            code.movdqa(tmp2, tmp0);
+            code.pcmpgtd(tmp2, code.Const(xword, cmp_one | (cmp_one << 32), cmp_one | (cmp_one << 32)));  // tmp2 = all ones where lane > cmp_one (signed)
+            code.pcmpeqd(tmp3, tmp3);  // tmp3 = all ones (-1 per lane)
+            code.paddd(tmp0, tmp3);  // tmp0 = lanes - 1
+            code.pxor(tmp0, code.Const(xword, 0x80000000'80000000, 0x80000000'80000000));  // flip sign bits so the signed compare below orders values as unsigned
+            code.pcmpgtd(tmp0, code.Const(xword, cmp_add | (cmp_add << 32), cmp_add | (cmp_add << 32)));  // mask: lanes - 1 > cmp_value - 2 (unsigned), i.e. lane outside [1, cmp_value)
+            code.pand(tmp2, tmp0);  // all ones for lanes that overflow
+            code.pandn(tmp0, tmp1);  // tmp0 = ~mask & shifted: shifted value for in-range lanes, 0 elsewhere
+            code.por(tmp0, tmp2);  // merge in all ones for overflowing lanes
+            code.setae(tmp_flag.cvt8());  // flag byte = 1 when the OR of all lanes >= cmp_value (unsigned)
+        }
+        code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8());  // OR the flag into the fpsr_qc byte of the JIT state
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
+//        EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned<s32>);
+    }
 }
 
 void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned64(EmitContext& ctx, IR::Inst* inst) {
@@ -4887,7 +5254,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
     const bool is_defaults_zero = inst->GetArg(0).IsZero();
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI)) {
-        const Xbyak::Xmm indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]);
 
         const u64 index_count = mcl::bit::replicate_element<u8, u64>(static_cast<u8>(table_size * 8));
 
@@ -4895,43 +5262,43 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
 
         switch (table_size) {
         case 1: {
-            const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+            auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
             if (is_defaults_zero) {
-                const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+                auto const result = ctx.reg_alloc.ScratchXmm(code);
                 code.vpermb(result | k1 | T_z, indicies, xmm_table0);
                 ctx.reg_alloc.DefineValue(code, inst, result);
             } else {
-                const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+                auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
                 code.vpermb(result | k1, indicies, xmm_table0);
                 ctx.reg_alloc.DefineValue(code, inst, result);
             }
             break;
         }
         case 2: {
-            const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
-            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+            auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+            auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
             code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
             if (is_defaults_zero) {
-                const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+                auto const result = ctx.reg_alloc.ScratchXmm(code);
                 code.vpermb(result | k1 | T_z, indicies, xmm0);
                 ctx.reg_alloc.DefineValue(code, inst, result);
             } else {
-                const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+                auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
                 code.vpermb(result | k1, indicies, xmm0);
                 ctx.reg_alloc.DefineValue(code, inst, result);
             }
             break;
         }
         case 3: {
-            const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
-            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
-            const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]);
+            auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+            auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+            auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]);
             code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
             if (is_defaults_zero) {
                 code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
                 ctx.reg_alloc.DefineValue(code, inst, indicies);
             } else {
-                const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+                auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
                 code.vpermi2b(indicies, xmm0, xmm_table1);
                 code.vmovdqu8(result | k1, indicies);
                 ctx.reg_alloc.DefineValue(code, inst, result);
@@ -4939,17 +5306,17 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
             break;
         }
         case 4: {
-            const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
-            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
-            const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
-            const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+            auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+            auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+            auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+            auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
             code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
             code.vpunpcklqdq(xmm_table1, xmm_table1, xmm_table1_upper);
             if (is_defaults_zero) {
                 code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
                 ctx.reg_alloc.DefineValue(code, inst, indicies);
             } else {
-                const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+                auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
                 code.vpermi2b(indicies, xmm0, xmm_table1);
                 code.vmovdqu8(result | k1, indicies);
                 ctx.reg_alloc.DefineValue(code, inst, result);
@@ -4972,9 +5339,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
     };
 
     if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         code.xorps(result, result);
         code.movsd(result, xmm_table0);
@@ -4986,9 +5353,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 2) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
-        const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+        auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
 
         code.punpcklqdq(xmm_table0, xmm_table0_upper);
         code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
@@ -4999,12 +5366,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (code.HasHostFeature(HostFeature::SSE41) && table_size <= 2) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+        auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+        auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
 
         if (table_size == 2) {
-            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+            auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
             code.punpcklqdq(xmm_table0, xmm_table0_upper);
             ctx.reg_alloc.Release(xmm_table0_upper);
         }
@@ -5023,12 +5390,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
-        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+        auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
 
         {
-            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+            auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
             code.punpcklqdq(xmm_table0, xmm_table0_upper);
             ctx.reg_alloc.Release(xmm_table0_upper);
         }
@@ -5037,7 +5404,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
             code.punpcklqdq(xmm_table1, xmm0);
         } else {
             ASSERT(table_size == 4);
-            const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+            auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
             code.punpcklqdq(xmm_table1, xmm_table1_upper);
             ctx.reg_alloc.Release(xmm_table1_upper);
         }
@@ -5058,18 +5425,18 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
-        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
-        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+        auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
 
         {
-            const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+            auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
             code.punpcklqdq(xmm_table0, xmm_table0_upper);
             ctx.reg_alloc.Release(xmm_table0_upper);
         }
         if (table_size == 4) {
-            const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+            auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
             code.punpcklqdq(xmm_table1, xmm_table1_upper);
             ctx.reg_alloc.Release(xmm_table1_upper);
         }
@@ -5098,37 +5465,31 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
     const u32 stack_space = static_cast<u32>(6 * 8);
     ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
     for (size_t i = 0; i < table_size; ++i) {
-        const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
+        auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]);
         code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value);
         ctx.reg_alloc.Release(table_value);
     }
-    const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-    const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+    auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+    auto const result = ctx.reg_alloc.ScratchXmm(code);
     ctx.reg_alloc.EndOfAllocScope();
     ctx.reg_alloc.HostCall(code, nullptr);
-
     code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
     code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]);
     code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]);
     code.mov(code.ABI_PARAM4.cvt32(), table_size);
     code.movq(qword[code.ABI_PARAM2], defaults);
     code.movq(qword[code.ABI_PARAM3], indicies);
-
-    code.CallLambda(
-        [](const HalfVectorArray<u8>* table, HalfVectorArray<u8>& result, const HalfVectorArray<u8>& indicies, size_t table_size) {
-            for (size_t i = 0; i < result.size(); ++i) {
-                const size_t index = indicies[i] / table[0].size();
-                const size_t elem = indicies[i] % table[0].size();
-                if (index < table_size) {
-                    result[i] = table[index][elem];
-                }
-            }
-        });
-
+    code.CallLambda([](const HalfVectorArray<u8>* table, HalfVectorArray<u8>& result, const HalfVectorArray<u8>& indicies, size_t table_size) {
+        for (size_t i = 0; i < result.size(); ++i) {
+            const size_t index = indicies[i] / table[0].size();
+            const size_t elem = indicies[i] % table[0].size();
+            if (index < table_size)
+                result[i] = table[index][elem];
+        }
+    });
     code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]);
     ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
-
     ctx.reg_alloc.DefineValue(code, inst, result);
 }
 
@@ -5142,14 +5503,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
     const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector;
 
     if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
 
         code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
         code.vpcmpub(k2, indicies, code.BConst<8>(xword, 4 * 16), CmpInt::LessThan);
 
         // Handle vector-table 0,1
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
-        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+        auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+        auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
 
         code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
 
@@ -5157,8 +5518,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         ctx.reg_alloc.Release(xmm_table1);
 
         // Handle vector-table 2,3
-        const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
-        const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]);
+        auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
+        auto const xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]);
 
         code.kandnw(k1, k1, k2);
         code.vpermi2b(indicies | k1, xmm_table2, xmm_table3);
@@ -5167,19 +5528,19 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
             code.vmovdqu8(indicies | k2 | T_z, indicies);
             ctx.reg_alloc.DefineValue(code, inst, indicies);
         } else {
-            const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+            auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
             code.vmovdqu8(defaults | k2, indicies);
             ctx.reg_alloc.DefineValue(code, inst, defaults);
         }
     } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
 
         code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
         code.vpcmpub(k2, indicies, code.BConst<8>(xword, 3 * 16), CmpInt::LessThan);
 
         // Handle vector-table 0,1
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
-        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+        auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+        auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
 
         code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
 
@@ -5187,7 +5548,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         ctx.reg_alloc.Release(xmm_table1);
 
         // Handle vector-table 2
-        const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
+        auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
 
         code.kandnw(k1, k1, k2);
         code.vpermb(indicies | k1, indicies, xmm_table2);
@@ -5196,14 +5557,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
             code.vmovdqu8(indicies | k2 | T_z, indicies);
             ctx.reg_alloc.DefineValue(code, inst, indicies);
         } else {
-            const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+            auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
             code.vmovdqu8(defaults | k2, indicies);
             ctx.reg_alloc.DefineValue(code, inst, defaults);
         }
     } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
-        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+        auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
 
         code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
 
@@ -5211,36 +5572,36 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
             code.vpermi2b(indicies | k1 | T_z, xmm_table0, xmm_table1);
             ctx.reg_alloc.DefineValue(code, inst, indicies);
         } else {
-            const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+            auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
             code.vpermi2b(indicies, xmm_table0, xmm_table1);
             code.vmovdqu8(result | k1, indicies);
             ctx.reg_alloc.DefineValue(code, inst, result);
         }
     } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+        auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+        auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
         code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan);
         if (is_defaults_zero) {
-            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+            auto const result = ctx.reg_alloc.ScratchXmm(code);
             code.vpermb(result | k1 | T_z, indicies, xmm_table0);
             ctx.reg_alloc.DefineValue(code, inst, result);
         } else {
-            const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+            auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
             code.vpermb(result | k1, indicies, xmm_table0);
             ctx.reg_alloc.DefineValue(code, inst, result);
         }
     } else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
 
         code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
         code.pshufb(xmm_table0, indicies);
 
         ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
     } else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+        auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+        auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
 
         if (code.HasHostFeature(HostFeature::AVX)) {
             code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
@@ -5253,9 +5614,9 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
 
         ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
     } else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
-        const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
-        const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
+        auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+        auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+        auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
 
         if (code.HasHostFeature(HostFeature::AVX)) {
             code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
@@ -5271,14 +5632,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
         return;
     } else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
+        auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const masked = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpandd(masked, indicies, code.Const(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
 
         for (size_t i = 0; i < table_size; ++i) {
-            const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
+            auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
             const Xbyak::Opmask table_mask = k1;
             const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16);
 
@@ -5295,15 +5656,15 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
 
         ctx.reg_alloc.DefineValue(code, inst, result);
     } else if (code.HasHostFeature(HostFeature::SSE41)) {
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
+        auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+        auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const masked = ctx.reg_alloc.ScratchXmm(code);
 
         code.movaps(masked, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
         code.pand(masked, indicies);
 
         for (size_t i = 0; i < table_size; ++i) {
-            const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
+            auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
 
             const u64 table_index = mcl::bit::replicate_element<u8, u64>(i * 16);
 
@@ -5327,13 +5688,13 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
         const u32 stack_space = static_cast<u32>((table_size + 2) * 16);
         ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
         for (size_t i = 0; i < table_size; ++i) {
-            const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
+            auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]);
             code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
             ctx.reg_alloc.Release(table_value);
         }
-        const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         ctx.reg_alloc.EndOfAllocScope();
         ctx.reg_alloc.HostCall(code, nullptr);
         code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
@@ -5360,8 +5721,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
     const bool part = args[2].GetImmediateU1();
 
     if (!part) {
@@ -5379,8 +5740,8 @@ void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+    auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
     const bool part = args[2].GetImmediateU1();
 
     if (!part) {
@@ -5398,8 +5759,8 @@ void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const upper = ctx.reg_alloc.UseXmm(code, args[1]);
     const bool part = args[2].GetImmediateU1();
 
     code.shufps(lower, upper, !part ? 0b10001000 : 0b11011101);
@@ -5411,8 +5772,8 @@ void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) {
 void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
-    const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const upper = ctx.reg_alloc.UseXmm(code, args[1]);
     const bool part = args[2].GetImmediateU1();
 
     code.shufpd(lower, upper, !part ? 0b00 : 0b11);
@@ -5420,89 +5781,87 @@ void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) {
     ctx.reg_alloc.DefineValue(code, inst, lower);
 }
 
-static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+// Per-byte unsigned absolute difference: each result lane is |args[0] - args[1]|.
+void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-    const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
-
-    switch (esize) {
-    case 8: {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        code.vpminub(tmp2, tmp0, tmp1);  // tmp2 = unsigned min(x, y)
+        code.vpmaxub(tmp0, tmp0, tmp1);  // tmp0 = unsigned max(x, y)
+        code.vpsubb(tmp0, tmp0, tmp2);   // max - min == |x - y|
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
+    } else {  // no AVX: OR the two unsigned-saturating differences (one of them is always zero)
+        auto const temp = ctx.reg_alloc.ScratchXmm(code);
+        auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);  // scratch: y is overwritten below
         code.movdqa(temp, x);
         code.psubusb(temp, y);
         code.psubusb(y, x);
         code.por(temp, y);
-        break;
+        ctx.reg_alloc.DefineValue(code, inst, temp);
     }
-    case 16: {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+}
 
+void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {  // per-16-bit-lane unsigned |args[0] - args[1]|
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        code.vpminuw(tmp2, tmp0, tmp1);  // tmp2 = unsigned min(x, y)
+        code.vpmaxuw(tmp0, tmp0, tmp1);  // tmp0 = unsigned max(x, y)
+        code.vpsubw(tmp0, tmp0, tmp2);   // max - min == |x - y|
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
+    } else {  // no AVX: OR the two unsigned-saturating differences (one of them is always zero)
+        auto const temp = ctx.reg_alloc.ScratchXmm(code);
+        auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);  // scratch: y is overwritten below
         code.movdqa(temp, x);
         code.psubusw(temp, y);
         code.psubusw(y, x);
         code.por(temp, y);
-        break;
+        ctx.reg_alloc.DefineValue(code, inst, temp);
     }
-    case 32:
-        // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267
-        if (code.HasHostFeature(HostFeature::SSE41)) {
-            const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-            const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-
-            code.movdqa(temp, x);
-            code.pminud(x, y);
-            code.pmaxud(temp, y);
-            code.psubd(temp, x);
-        } else {
-            const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-            const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-            if (ctx.HasOptimization(OptimizationFlag::CodeSpeed)) {
-                // About 45 bytes
-                const Xbyak::Xmm temp_x = ctx.reg_alloc.ScratchXmm(code);
-                const Xbyak::Xmm temp_y = ctx.reg_alloc.ScratchXmm(code);
-                code.pcmpeqd(temp, temp);
-                code.pslld(temp, 31);
-                code.movdqa(temp_x, x);
-                code.movdqa(temp_y, y);
-                code.paddd(temp_x, x);
-                code.paddd(temp_y, y);
-                code.pcmpgtd(temp_y, temp_x);
-                code.psubd(x, y);
-                code.pandn(temp, temp_y);
-                code.pxor(x, y);
-                code.psubd(x, y);
-            } else {
-                // Smaller code size - about 36 bytes
-                code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
-                code.pxor(x, temp);
-                code.pxor(y, temp);
-                code.movdqa(temp, x);
-                code.psubd(temp, y);
-                code.pcmpgtd(y, x);
-                code.psrld(y, 1);
-                code.pxor(temp, y);
-                code.psubd(temp, y);
-            }
-        }
-        break;
-    }
-
-    ctx.reg_alloc.DefineValue(code, inst, temp);
-}
-
-void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorUnsignedAbsoluteDifference(8, ctx, inst, code);
-}
-
-void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorUnsignedAbsoluteDifference(16, ctx, inst, code);
 }
 
 void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) {
-    EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code);
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);  // per-32-bit-lane unsigned |args[0] - args[1]|
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        code.vpminud(tmp2, tmp0, tmp1);  // tmp2 = unsigned min(x, y)
+        code.vpmaxud(tmp0, tmp0, tmp1);  // tmp0 = unsigned max(x, y)
+        code.vpsubd(tmp0, tmp0, tmp2);   // max - min == |x - y|
+        ctx.reg_alloc.DefineValue(code, inst, tmp0);
+    } else if (code.HasHostFeature(HostFeature::SSE41)) {
+        // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267
+        auto const temp = ctx.reg_alloc.ScratchXmm(code);
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        code.movdqa(temp, x);
+        code.pminud(x, y);     // x = unsigned min(x, y)
+        code.pmaxud(temp, y);  // temp = unsigned max(x, y)
+        code.psubd(temp, x);   // max - min == |x - y|
+        ctx.reg_alloc.DefineValue(code, inst, temp);
+    } else {
+        auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);  // only read below, so no scratch copy is needed
+        auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+        auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+        code.movdqa(tmp2, code.Const(xword, 0x80000000'80000000, 0x80000000'80000000));  // sign bit of every lane
+        code.movdqa(tmp3, tmp1);
+        code.pxor(tmp3, tmp2);     // tmp3 = y with the sign bit flipped
+        code.pxor(tmp2, tmp0);     // tmp2 = x with the sign bit flipped
+        code.pcmpgtd(tmp2, tmp3);  // signed compare of the biased values == unsigned x > y; tmp2 = all-ones or zero
+        code.psubd(tmp0, tmp1);    // tmp0 = x - y (wrapping)
+        code.pxor(tmp0, tmp2);
+        // mask - ((x - y) ^ mask) negates conditionally: x - y where x > y, y - x elsewhere.
+        code.psubd(tmp2, tmp0);
+        ctx.reg_alloc.DefineValue(code, inst, tmp2);
+    }
 }
 
 void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
@@ -5510,11 +5869,11 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
     const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
 
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+    auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+    auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
 
     if (upper_inst) {
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         if (code.HasHostFeature(HostFeature::AVX)) {
             code.vpmulhuw(result, x, y);
         } else {
@@ -5526,7 +5885,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
     }
 
     if (lower_inst) {
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
         if (code.HasHostFeature(HostFeature::AVX)) {
             code.vpmullw(result, x, y);
         } else {
@@ -5544,24 +5903,24 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
 
     if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpmulld(result, x, y);
 
         ctx.reg_alloc.DefineValue(code, lower_inst, result);
     } else if (code.HasHostFeature(HostFeature::AVX)) {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
 
         if (lower_inst) {
-            const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+            auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
             code.vpmulld(lower_result, x, y);
             ctx.reg_alloc.DefineValue(code, lower_inst, lower_result);
         }
 
-        const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+        auto const result = ctx.reg_alloc.ScratchXmm(code);
 
         code.vpmuludq(result, x, y);
         code.vpsrlq(x, x, 32);
@@ -5571,11 +5930,11 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
 
         ctx.reg_alloc.DefineValue(code, upper_inst, result);
     } else {
-        const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-        const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-        const Xbyak::Xmm upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
-        const Xbyak::Xmm lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
+        auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+        auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+        auto const upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
+        auto const lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
 
         // calculate unsigned multiply
         code.movdqa(tmp, x);
@@ -5792,11 +6151,11 @@ void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst*
 
 void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::SSE41)) {
         code.pmovzxbw(a, a);
     } else {
-        const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+        auto const zeros = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(zeros, zeros);
         code.punpcklbw(a, zeros);
     }
@@ -5805,11 +6164,11 @@ void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::SSE41)) {
         code.pmovzxwd(a, a);
     } else {
-        const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+        auto const zeros = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(zeros, zeros);
         code.punpcklwd(a, zeros);
     }
@@ -5818,11 +6177,11 @@ void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
     if (code.HasHostFeature(HostFeature::SSE41)) {
         code.pmovzxdq(a, a);
     } else {
-        const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+        auto const zeros = ctx.reg_alloc.ScratchXmm(code);
         code.pxor(zeros, zeros);
         code.punpckldq(a, zeros);
     }
@@ -5831,8 +6190,8 @@ void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-    const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const zeros = ctx.reg_alloc.ScratchXmm(code);
     code.pxor(zeros, zeros);
     code.punpcklqdq(a, zeros);
     ctx.reg_alloc.DefineValue(code, inst, a);
@@ -5840,7 +6199,7 @@ void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) {
 
 void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+    auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
 
     code.movq(a, a);  // TODO: !IsLastUse
 
@@ -5848,7 +6207,7 @@ void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
 }
 
 void EmitX64::EmitZeroVector(EmitContext& ctx, IR::Inst* inst) {
-    const Xbyak::Xmm a = ctx.reg_alloc.ScratchXmm(code);
+    auto const a = ctx.reg_alloc.ScratchXmm(code);  // register that is xor-cleared below and then defined as inst's value
     code.pxor(a, a);
     ctx.reg_alloc.DefineValue(code, inst, a);
 }
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
index 70edfbd0bc..046ecc78d6 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@@ -24,6 +24,7 @@
 #include "dynarmic/common/fp/fpcr.h"
 #include "dynarmic/common/fp/info.h"
 #include "dynarmic/common/fp/op.h"
+#include "dynarmic/common/fp/rounding_mode.h"
 #include "dynarmic/common/fp/util.h"
 #include "dynarmic/interface/optimization_flags.h"
 #include "dynarmic/ir/basic_block.h"
@@ -93,7 +94,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
         code.cmp(bitmask, 0);
     }
 
-    SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+    SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
 
     code.jnz(*nan, code.T_NEAR);
     code.L(*end);
@@ -188,23 +189,6 @@ void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
     }
 }
 
-template<size_t fsize>
-void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
-    const Xbyak::Xmm nan_mask = xmm0;
-    if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
-        constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
-                                             FpFixup::PosZero);
-        FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
-    } else if (code.HasHostFeature(HostFeature::AVX)) {
-        FCODE(vcmpordp)(nan_mask, result, result);
-        FCODE(vandp)(result, result, nan_mask);
-    } else {
-        code.movaps(nan_mask, result);
-        FCODE(cmpordp)(nan_mask, nan_mask);
-        code.andps(result, nan_mask);
-    }
-}
-
 template<size_t fsize>
 void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
     if (fpcr.FZ()) {
@@ -1330,7 +1314,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
             const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
             const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
 
-            SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+            SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
 
             MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
                 code.movaps(result, xmm_a);
@@ -1603,7 +1587,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
             const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
             const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
 
-            SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+            SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
 
             MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
                 code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
@@ -1776,7 +1760,7 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
             const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
             const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code);
 
-            SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
+            SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
 
             code.movaps(value, operand);
 
@@ -1867,7 +1851,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
             const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
             const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
 
-            SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+            SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
 
             MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
                 code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
@@ -2004,120 +1988,123 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
 template<size_t fsize, bool unsigned_>
 void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
     const size_t fbits = inst->GetArg(1).GetU8();
-    const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
+    const auto rounding = FP::RoundingMode(inst->GetArg(2).GetU8());
     [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
 
-    if constexpr (fsize != 16) {
-        if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-            const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-
-            MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
-                const int round_imm = [&] {
-                    switch (rounding) {
-                    case FP::RoundingMode::ToNearest_TieEven:
-                    default:
-                        return 0b00;
-                    case FP::RoundingMode::TowardsPlusInfinity:
-                        return 0b10;
-                    case FP::RoundingMode::TowardsMinusInfinity:
-                        return 0b01;
-                    case FP::RoundingMode::TowardsZero:
-                        return 0b11;
-                    }
-                }();
-
-                const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) {
-                    // MSVC doesn't allow us to use a [&] capture, so we have to do this instead.
-                    (void)ctx;
-
-                    if constexpr (fsize == 32) {
-                        code.cvttps2dq(src, src);
-                    } else {
-                        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
-                            code.vcvttpd2qq(src, src);
-                        } else {
-                            const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code);
-                            const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code);
-
-                            code.cvttsd2si(lo, src);
-                            code.punpckhqdq(src, src);
-                            code.cvttsd2si(hi, src);
-                            code.movq(src, lo);
-                            code.pinsrq(src, hi, 1);
-
-                            ctx.reg_alloc.Release(hi);
-                            ctx.reg_alloc.Release(lo);
-                        }
-                    }
-                };
-
-                if (fbits != 0) {
-                    const u64 scale_factor = fsize == 32
-                                               ? static_cast<u64>(fbits + 127) << 23
-                                               : static_cast<u64>(fbits + 1023) << 52;
-                    FCODE(mulp)(src, GetVectorOf<fsize>(code, scale_factor));
+    if (code.HasHostFeature(HostFeature::SSE41) && fsize != 16 && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+        MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+            const int round_imm = [&] {
+                switch (rounding) {
+                case FP::RoundingMode::ToNearest_TieEven:
+                default:
+                    return 0b00;
+                case FP::RoundingMode::TowardsPlusInfinity:
+                    return 0b10;
+                case FP::RoundingMode::TowardsMinusInfinity:
+                    return 0b01;
+                case FP::RoundingMode::TowardsZero:
+                    return 0b11;
                 }
+            }();
+            const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) {
+                // MSVC doesn't allow us to use a [&] capture, so we have to do this instead.
+                (void)ctx;
 
-                FCODE(roundp)(src, src, static_cast<u8>(round_imm));
-                ZeroIfNaN<fsize>(code, src);
-
-                constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000;
-                [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
-
-                if constexpr (unsigned_) {
+                if constexpr (fsize == 32) {
+                    code.cvttps2dq(src, src);
+                } else {
                     if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
-                        // Mask positive values
-                        code.xorps(xmm0, xmm0);
-                        FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
-
-                        // Convert positive values to unsigned integers, write 0 anywhere else
-                        // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
-                        if constexpr (fsize == 32) {
-                            code.vcvttps2udq(src | k1 | T_z, src);
-                        } else {
-                            code.vcvttpd2uqq(src | k1 | T_z, src);
-                        }
+                        code.vcvttpd2qq(src, src);
                     } else {
-                        // Zero is minimum
-                        code.xorps(xmm0, xmm0);
-                        FCODE(cmplep)(xmm0, src);
-                        FCODE(andp)(src, xmm0);
+                        const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code);
+                        const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code);
 
-                        // Will we exceed unsigned range?
-                        const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code);
-                        code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
-                        FCODE(cmplep)(exceed_unsigned, src);
+                        code.cvttsd2si(lo, src);
+                        code.punpckhqdq(src, src);
+                        code.cvttsd2si(hi, src);
+                        code.movq(src, lo);
+                        code.pinsrq(src, hi, 1);
 
-                        // Will be exceed signed range?
-                        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-                        code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
-                        code.movaps(xmm0, tmp);
-                        FCODE(cmplep)(xmm0, src);
-                        FCODE(andp)(tmp, xmm0);
-                        FCODE(subp)(src, tmp);
-                        perform_conversion(src);
-                        ICODE(psll)(xmm0, u8(fsize - 1));
-                        FCODE(orp)(src, xmm0);
+                        ctx.reg_alloc.Release(hi);
+                        ctx.reg_alloc.Release(lo);
+                    }
+                }
+            };
+            if (fbits != 0) {
+                const u64 scale_factor = fsize == 32
+                    ? u64(fbits + 127) << 23
+                    : u64(fbits + 1023) << 52;
+                FCODE(mulp)(src, GetVectorOf<fsize>(code, scale_factor));
+            }
 
-                        // Saturate to max
-                        FCODE(orp)(src, exceed_unsigned);
+            FCODE(roundp)(src, src, u8(round_imm));
+            const Xbyak::Xmm nan_mask = xmm0;
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                static constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, FpFixup::PosZero);
+                FCODE(vfixupimmp)(src, src, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
+            } else if (code.HasHostFeature(HostFeature::AVX)) {
+                FCODE(vcmpordp)(nan_mask, src, src);
+                FCODE(vandp)(src, src, nan_mask);
+            } else {
+                code.movaps(nan_mask, src);
+                FCODE(cmpordp)(nan_mask, nan_mask);
+                code.andps(src, nan_mask);
+            }
+
+            constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000;
+            [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
+
+            if constexpr (unsigned_) {
+                if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                    // Mask positive values
+                    code.xorps(xmm0, xmm0);
+                    FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
+
+                    // Convert positive values to unsigned integers, write 0 anywhere else
+                    // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
+                    if (fsize == 32) {
+                        code.vcvttps2udq(src | k1 | T_z, src);
+                    } else {
+                        code.vcvttpd2uqq(src | k1 | T_z, src);
                     }
                 } else {
-                    using FPT = mcl::unsigned_integer_of_size<fsize>;  // WORKAROUND: For issue 678 on MSVC
-                    constexpr u64 integer_max = FPT((std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max)());
-
-                    code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code));
+                    // Zero is minimum
+                    code.xorps(xmm0, xmm0);
                     FCODE(cmplep)(xmm0, src);
-                    perform_conversion(src);
-                    FCODE(blendvp)(src, GetVectorOf<fsize, integer_max>(code));
-                }
-            });
+                    FCODE(andp)(src, xmm0);
 
-            ctx.reg_alloc.DefineValue(code, inst, src);
-            return;
-        }
+                    // Will we exceed unsigned range?
+                    const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code);
+                    code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
+                    FCODE(cmplep)(exceed_unsigned, src);
+
+                    // Will we exceed signed range?
+                    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+                    code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
+                    code.movaps(xmm0, tmp);
+                    FCODE(cmplep)(xmm0, src);
+                    FCODE(andp)(tmp, xmm0);
+                    FCODE(subp)(src, tmp);
+                    perform_conversion(src);
+                    ICODE(psll)(xmm0, u8(fsize - 1));
+                    FCODE(orp)(src, xmm0);
+
+                    // Saturate to max
+                    FCODE(orp)(src, exceed_unsigned);
+                }
+            } else {
+                using FPT = mcl::unsigned_integer_of_size<fsize>;  // WORKAROUND: For issue 678 on MSVC
+                constexpr u64 integer_max = FPT((std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max)());
+                code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code));
+                FCODE(cmplep)(xmm0, src);
+                perform_conversion(src);
+                FCODE(blendvp)(src, GetVectorOf<fsize, integer_max>(code));
+            }
+        });
+        ctx.reg_alloc.DefineValue(code, inst, src);
+        return;
     }
 
     using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
diff --git a/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp b/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
index 3ae553bccd..bae397ff2b 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
 /* This file is part of the dynarmic project.
@@ -176,7 +176,7 @@ struct ExceptionHandler::Impl final {
 
         code.align(16);
         const u8* exception_handler_without_cb = code.getCurr<u8*>();
-        code.mov(code.eax, static_cast<u32>(ExceptionContinueSearch));
+        code.mov(code.eax, u32(ExceptionContinueSearch));
         code.ret();
 
         code.align(16);
@@ -192,20 +192,18 @@ struct ExceptionHandler::Impl final {
         code.lea(code.rsp, code.ptr[code.rsp - 8]);
         code.mov(code.ABI_PARAM1, std::bit_cast<u64>(&cb));
         code.mov(code.ABI_PARAM2, code.ABI_PARAM3);
-        code.CallLambda(
-            [](const std::function<FakeCall(u64)>& cb_, PCONTEXT ctx) {
-                FakeCall fc = cb_(ctx->Rip);
-
-                ctx->Rsp -= sizeof(u64);
-                *std::bit_cast<u64*>(ctx->Rsp) = fc.ret_rip;
-                ctx->Rip = fc.call_rip;
-            });
+        code.CallLambda([](const std::function<FakeCall(u64)>& cb_, PCONTEXT ctx) {
+            FakeCall fc = cb_(ctx->Rip);
+            ctx->Rsp -= sizeof(u64);
+            *std::bit_cast<u64*>(ctx->Rsp) = fc.ret_rip;
+            ctx->Rip = fc.call_rip;
+        });
         code.add(code.rsp, 8);
-        code.mov(code.eax, static_cast<u32>(ExceptionContinueExecution));
+        code.mov(code.eax, u32(ExceptionContinueExecution));
         code.ret();
 
-        exception_handler_without_cb_offset = static_cast<ULONG>(exception_handler_without_cb - code.getCode<u8*>());
-        exception_handler_with_cb_offset = static_cast<ULONG>(exception_handler_with_cb - code.getCode<u8*>());
+        exception_handler_without_cb_offset = ULONG(exception_handler_without_cb - code.getCode<u8*>());
+        exception_handler_with_cb_offset = ULONG(exception_handler_with_cb - code.getCode<u8*>());
 
         code.align(16);
         UNWIND_INFO* unwind_info = static_cast<UNWIND_INFO*>(code.AllocateFromCodeSpace(sizeof(UNWIND_INFO)));
diff --git a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
index 5c5ed25131..2cfa14ae18 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/reg_alloc.cpp
@@ -417,7 +417,8 @@ HostLoc RegAlloc::SelectARegister(std::bitset<32> desired_locations) const noexc
             // While R13 and R14 are technically available, we avoid allocating for them
             // at all costs, because theoretically skipping them is better than spilling
             // all over the place - i also fixes bugs with high reg pressure
-            } else if (i >= HostLoc::R13 && i <= HostLoc::R15) {
+            // %rbp must not be trashed, so skip it as well
+            } else if (i == HostLoc::RBP || (i >= HostLoc::R13 && i <= HostLoc::R15)) {
                 // skip, do not touch
             // Intel recommends to reuse registers as soon as they're overwritable (DO NOT SPILL)
             } else if (loc_info.IsEmpty()) {
diff --git a/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h b/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h
index 50737f12eb..43a3fc7ab2 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/stack_layout.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
 /* This file is part of the dynarmic project.
@@ -22,14 +22,13 @@ constexpr size_t SpillCount = 64;
 #endif
 
 struct alignas(16) StackLayout {
+    // Needs alignment for VMOV and XMM spills
+    alignas(16) std::array<std::array<u64, 2>, SpillCount> spill;  // SpillCount slots of two u64 each; now the first member
     s64 cycles_remaining;
     s64 cycles_to_run;
-
-    std::array<std::array<u64, 2>, SpillCount> spill;
-
     u32 save_host_MXCSR;
-
     bool check_bit;
+    u64 abi_base_pointer;  // NOTE(review): presumably the saved host %rbp - confirm against the code that writes it
 };
 
 #ifdef _MSC_VER
diff --git a/src/dynarmic/src/dynarmic/ir/opcodes.inc b/src/dynarmic/src/dynarmic/ir/opcodes.inc
index 6f57f278a3..b1ba5b2993 100644
--- a/src/dynarmic/src/dynarmic/ir/opcodes.inc
+++ b/src/dynarmic/src/dynarmic/ir/opcodes.inc
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // First we list common shared opcodes
 // Since we give priority to A64 performance, we include them first, this is so we
 // can discard all A32 opcodes instead of having a "hole" in our checks
@@ -710,6 +713,8 @@ A64OPC(ExclusiveWriteMemory32,                              U32,            U64,
 A64OPC(ExclusiveWriteMemory64,                              U32,            U64,            U64,            U64,            AccType         )
 A64OPC(ExclusiveWriteMemory128,                             U32,            U64,            U64,            U128,           AccType         )
 
+// Remember to update:
+// - a32_emit_x64.cpp
 
 // A32 Context getters/setters
 A32OPC(SetCheckBit,                                         Void,           U1                                                              )
diff --git a/src/dynarmic/tests/A64/a64.cpp b/src/dynarmic/tests/A64/a64.cpp
index 4d4484e53e..e85986ea5a 100644
--- a/src/dynarmic/tests/A64/a64.cpp
+++ b/src/dynarmic/tests/A64/a64.cpp
@@ -415,6 +415,105 @@ TEST_CASE("A64: URSHL", "[a64]") {
     CHECK(jit.GetVector(9) == Vector{0x0000000000000002, 0x12db8b8280e0ba});
 }
 
+TEST_CASE("A64: SQSHLU", "[a64]") {  // SQSHLU (immediate) over B16/H8/S4/D2 arrangements, including shift amounts of 0
+    A64TestEnv env;
+    A64::UserConfig jit_user_config{};
+    jit_user_config.callbacks = &env;
+    A64::Jit jit{jit_user_config};
+
+    oaknut::VectorCodeGenerator code{env.code_mem, nullptr};  // instructions are assembled into env.code_mem
+    code.SQSHLU(V8.B16(), V0.B16(), 1);
+    code.SQSHLU(V9.H8(), V1.H8(), 2);
+    code.SQSHLU(V10.S4(), V2.S4(), 28);
+    code.SQSHLU(V11.D2(), V3.D2(), 4);
+    code.SQSHLU(V12.S4(), V0.S4(), 1);
+    code.SQSHLU(V13.S4(), V1.S4(), 3);
+    code.SQSHLU(V14.S4(), V2.S4(), 0);
+    code.SQSHLU(V15.S4(), V3.S4(), 0);
+
+    jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});  // NOTE(review): no jit.SetPC(0) here, unlike the SMIN/SMINP cases - confirm the default PC is 0
+    jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
+    jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f});
+    jit.SetVector(3, Vector{0xffffffffffffffff, 0x96dc5c140705cd04});
+
+    env.ticks_left = env.code_mem.size();  // tick budget sized from the emitted code
+    CheckedRun([&]() { jit.Run(); });
+
+    CHECK(jit.GetVector(8) == Vector{0x3000d4d4, 0xfe0000000076009e});
+    CHECK(jit.GetVector(9) == Vector{0x2c0000003c, 0});
+    CHECK(jit.GetVector(10) == Vector{0x10000000'ffffffff, 0xffffffff'ffffffff});
+    CHECK(jit.GetVector(11) == Vector{0, 0});  // both 64-bit lanes of V3 have the sign bit set, so they clamp to 0
+    CHECK(jit.GetVector(12) == Vector{0x3174d4d4, 0xfffffffe00000000});
+    CHECK(jit.GetVector(13) == Vector{0x5800000078, 0});
+    CHECK(jit.GetVector(14) == Vector{0x1000000ff, 0x100000007f});  // shift by 0 leaves the non-negative lanes of V2 unchanged
+    CHECK(jit.GetVector(15) == Vector{0, 0x705cd04});
+}
+
+TEST_CASE("A64: SMIN", "[a64]") {  // SMIN (vector) over B16/H8/S4 arrangements
+    A64TestEnv env;
+    A64::UserConfig jit_user_config{};
+    jit_user_config.callbacks = &env;
+    A64::Jit jit{jit_user_config};
+
+    oaknut::VectorCodeGenerator code{env.code_mem, nullptr};  // instructions are assembled into env.code_mem
+    code.SMIN(V8.B16(), V0.B16(), V3.B16());
+    code.SMIN(V9.H8(), V1.H8(), V2.H8());
+    code.SMIN(V10.S4(), V2.S4(), V3.S4());
+    code.SMIN(V11.S4(), V3.S4(), V3.S4());
+    code.SMIN(V12.S4(), V0.S4(), V3.S4());
+    code.SMIN(V13.S4(), V1.S4(), V2.S4());
+    code.SMIN(V14.S4(), V2.S4(), V1.S4());
+    code.SMIN(V15.S4(), V3.S4(), V0.S4());
+
+    jit.SetPC(0);
+    jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});
+    jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
+    jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f});
+    jit.SetVector(3, Vector{0xffffffff'ffffffff, 0x96dc5c14'0705cd04});
+
+    env.ticks_left = env.code_mem.size();  // was 4: size the budget from the eight emitted instructions, as the SQSHLU case does
+    CheckedRun([&]() { jit.Run(); });
+
+    REQUIRE(jit.GetVector(8) == Vector{0xffffffffffbaffff, 0x96dcffff94059504});
+    REQUIRE(jit.GetVector(9) == Vector{0x10000000f, 0xffffffffffffffff});
+    REQUIRE(jit.GetVector(10) == Vector{0xffffffffffffffff, 0x96dc5c140000007f});  // NOTE(review): V11-V15 are written above but never asserted
+}
+
+TEST_CASE("A64: SMINP", "[a64]") {  // SMINP (pairwise) over B16/H8/S4 arrangements
+    A64TestEnv env;
+    A64::UserConfig jit_user_config{};
+    jit_user_config.callbacks = &env;
+    A64::Jit jit{jit_user_config};
+
+    oaknut::VectorCodeGenerator code{env.code_mem, nullptr};  // instructions are assembled into env.code_mem
+    code.SMINP(V8.B16(), V0.B16(), V3.B16());
+    code.SMINP(V9.H8(), V1.H8(), V2.H8());
+    code.SMINP(V10.S4(), V2.S4(), V1.S4());
+    code.SMINP(V11.S4(), V3.S4(), V3.S4());
+    code.SMINP(V12.S4(), V0.S4(), V3.S4());
+    code.SMINP(V13.S4(), V1.S4(), V2.S4());
+    code.SMINP(V14.S4(), V2.S4(), V1.S4());
+    code.SMINP(V15.S4(), V3.S4(), V0.S4());
+
+    jit.SetPC(0);
+    jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});
+    jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
+    jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f});
+    jit.SetVector(3, Vector{0xffffffff'ffffffff, 0x96dc5c14'0705cd04});
+
+    env.ticks_left = env.code_mem.size();  // was 4, although the results of all eight instructions (V8-V15) are asserted below
+    CheckedRun([&]() { jit.Run(); });
+
+    REQUIRE(jit.GetVector(8) == Vector{0xffff9495ffffba6a, 0x961405cdffffffff});
+    REQUIRE(jit.GetVector(9) == Vector{0xffffffff00000000, 0});
+    REQUIRE(jit.GetVector(10) == Vector{0x1000000001, 0xffffffff0000000b});
+    REQUIRE(jit.GetVector(11) == Vector{0x96dc5c14ffffffff, 0x96dc5c14ffffffff});
+    REQUIRE(jit.GetVector(12) == Vector{0x943b954fffffffff, 0x96dc5c14ffffffff});
+    REQUIRE(jit.GetVector(13) == Vector{0xffffffff0000000b, 0x1000000001});
+    REQUIRE(jit.GetVector(14) == Vector{0x1000000001, 0xffffffff0000000b});
+    REQUIRE(jit.GetVector(15) == Vector{0x96dc5c14ffffffff, 0x943b954fffffffff});
+}
+
 TEST_CASE("A64: XTN", "[a64]") {
     A64TestEnv env;
     A64::UserConfig jit_user_config{};
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index e989bf6b31..75fbcaa968 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -49,7 +49,7 @@ using VideoCore::Surface::PixelFormatFromDepthFormat;
 using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
 
 constexpr size_t NUM_STAGES = Maxwell::MaxShaderStage;
-constexpr size_t MAX_IMAGE_ELEMENTS = 64;
+constexpr size_t INLINE_IMAGE_ELEMENTS = 64; // small_vector size argument used in ConfigureImpl
 
 DescriptorLayoutBuilder MakeBuilder(const Device& device, std::span<const Shader::Info> infos) {
     DescriptorLayoutBuilder builder{device};
@@ -264,7 +264,11 @@ GraphicsPipeline::GraphicsPipeline(
         stage_infos[stage] = *info;
         enabled_uniform_buffer_masks[stage] = info->constant_buffer_mask;
         std::ranges::copy(info->constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin());
+        num_image_elements += Shader::NumDescriptors(info->texture_buffer_descriptors);
+        num_image_elements += Shader::NumDescriptors(info->image_buffer_descriptors);
         num_textures += Shader::NumDescriptors(info->texture_descriptors);
+        num_image_elements += Shader::NumDescriptors(info->texture_descriptors);
+        num_image_elements += Shader::NumDescriptors(info->image_descriptors);
     }
     fragment_has_color0_output = stage_infos[NUM_STAGES - 1].stores_frag_color[0];
     auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool, pipeline_statistics] {
@@ -310,10 +314,10 @@ void GraphicsPipeline::AddTransition(GraphicsPipeline* transition) {
 
 template <typename Spec>
 bool GraphicsPipeline::ConfigureImpl(bool is_indexed) {
-    std::array<VideoCommon::ImageViewInOut, MAX_IMAGE_ELEMENTS> views;
-    std::array<VideoCommon::SamplerId, MAX_IMAGE_ELEMENTS> samplers;
-    size_t sampler_index{};
-    size_t view_index{};
+    small_vector<VideoCommon::ImageViewInOut, INLINE_IMAGE_ELEMENTS> views;
+    small_vector<VideoCommon::SamplerId, INLINE_IMAGE_ELEMENTS> samplers;
+    views.reserve(num_image_elements);
+    samplers.reserve(num_textures);
 
     texture_cache.SynchronizeGraphicsDescriptors();
 
@@ -358,11 +362,11 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) {
         const auto add_image{[&](const auto& desc, bool blacklist) LAMBDA_FORCEINLINE {
             for (u32 index = 0; index < desc.count; ++index) {
                 const auto handle{read_handle(desc, index)};
-                views[view_index++] = {
+                views.push_back({
                     .index = handle.first,
                     .blacklist = blacklist,
                     .id = {}
-                };
+                });
             }
         }};
         if constexpr (Spec::has_texture_buffers) {
@@ -378,10 +382,10 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) {
         for (const auto& desc : info.texture_descriptors) {
             for (u32 index = 0; index < desc.count; ++index) {
                 const auto handle{read_handle(desc, index)};
-                views[view_index++] = {handle.first};
+                views.push_back({handle.first});
 
                 VideoCommon::SamplerId sampler{texture_cache.GetGraphicsSamplerId(handle.second)};
-                samplers[sampler_index++] = sampler;
+                samplers.push_back(sampler);
             }
         }
         if constexpr (Spec::has_images) {
@@ -407,7 +411,9 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) {
     if constexpr (Spec::enabled_stages[4]) {
         config_stage(4);
     }
-    texture_cache.FillGraphicsImageViews<Spec::has_images>(std::span(views.data(), view_index));
+    ASSERT(views.size() == num_image_elements);
+    ASSERT(samplers.size() == num_textures);
+    texture_cache.FillGraphicsImageViews<Spec::has_images>(std::span(views.data(), views.size()));
 
     VideoCommon::ImageViewInOut* texture_buffer_it{views.data()};
     const auto bind_stage_info{[&](size_t stage) LAMBDA_FORCEINLINE {
@@ -501,7 +507,8 @@ bool GraphicsPipeline::ConfigureImpl(bool is_indexed) {
         buffer_cache.any_buffer_uploaded = false;
     }
     texture_cache.UpdateRenderTargets(false);
-    texture_cache.CheckFeedbackLoop(views);
+    texture_cache.CheckFeedbackLoop(std::span<const VideoCommon::ImageViewInOut>{views.data(),
+                                                                                 views.size()});
     ConfigureDraw(rescaling, render_area);
 
     return true;
@@ -987,7 +994,7 @@ void GraphicsPipeline::Validate() {
         num_images += Shader::NumDescriptors(info.texture_descriptors);
         num_images += Shader::NumDescriptors(info.image_descriptors);
     }
-    ASSERT(num_images <= MAX_IMAGE_ELEMENTS);
+    ASSERT(num_images == num_image_elements);
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
index 34941d6e8d..1a41e50a36 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later
 
 // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
@@ -159,6 +159,7 @@ private:
     std::array<Shader::Info, NUM_STAGES> stage_infos;
     std::array<u32, 5> enabled_uniform_buffer_masks{};
     VideoCommon::UniformBufferSizes uniform_buffer_sizes{};
+    size_t num_image_elements{}; // Descriptor total summed in the constructor; sizes the views list
     u32 num_textures{};
     bool fragment_has_color0_output{};