diff --git a/dist/dev.eden_emu.eden.svg b/dist/dev.eden_emu.eden.svg
index b125f4fb80..f88b52f625 100644
--- a/dist/dev.eden_emu.eden.svg
+++ b/dist/dev.eden_emu.eden.svg
@@ -6,191 +6,225 @@
viewBox="0 0 512 512"
version="1.1"
id="svg7"
- sodipodi:docname="saintpatrick2026_named.svg"
- inkscape:version="1.4.2 (ebf0e940d0, 2025-05-08)"
- xml:space="preserve"
- inkscape:export-filename="dev.eden_emu.eden.png"
+ sodipodi:docname="base.svg.2026_01_12_14_43_47.0.svg"
+ inkscape:version="1.4.2 (ebf0e94, 2025-05-08)"
+ inkscape:export-filename="base.svg.2026_01_12_14_43_47.0.svg"
inkscape:export-xdpi="96"
inkscape:export-ydpi="96"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns="http://www.w3.org/2000/svg"
- xmlns:svg="http://www.w3.org/2000/svg"
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
- xmlns:cc="http://creativecommons.org/ns#"
- xmlns:dc="http://purl.org/dc/elements/1.1/">
-
-
-
-
-
-
- Madeline_Dev
- mailto:madelvidel@gmail.com
-
-
- 2025
-
- 2025 Eden Emulator Project
- https://git.eden-emu.dev
-
-
-
+ xmlns:svg="http://www.w3.org/2000/svg">
+ id="defs7">
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dist/eden.bmp b/dist/eden.bmp
index 498d6f3893..888138ccf7 100644
Binary files a/dist/eden.bmp and b/dist/eden.bmp differ
diff --git a/dist/eden.ico b/dist/eden.ico
index 187013ae63..45120ef312 100644
Binary files a/dist/eden.ico and b/dist/eden.ico differ
diff --git a/dist/qt_themes/default/icons/256x256/eden.png b/dist/qt_themes/default/icons/256x256/eden.png
index fbee9f1836..3c4bd566a1 100644
Binary files a/dist/qt_themes/default/icons/256x256/eden.png and b/dist/qt_themes/default/icons/256x256/eden.png differ
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt
index b20d75ef0a..96632b4606 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt
@@ -25,6 +25,7 @@ import org.yuzu.yuzu_emu.model.AddonViewModel
import org.yuzu.yuzu_emu.model.HomeViewModel
import org.yuzu.yuzu_emu.utils.AddonUtil
import org.yuzu.yuzu_emu.utils.FileUtil.copyFilesTo
+import org.yuzu.yuzu_emu.utils.InstallableActions
import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins
import org.yuzu.yuzu_emu.utils.collect
import java.io.File
@@ -107,6 +108,12 @@ class AddonsFragment : Fragment() {
).show(parentFragmentManager, MessageDialogFragment.TAG)
}
}
+ parentFragmentManager.setFragmentResultListener(
+ ContentTypeSelectionDialogFragment.REQUEST_INSTALL_GAME_UPDATE,
+ viewLifecycleOwner
+ ) { _, _ ->
+ installGameUpdate.launch(arrayOf("*/*"))
+ }
binding.buttonInstall.setOnClickListener {
ContentTypeSelectionDialogFragment().show(
@@ -130,7 +137,7 @@ class AddonsFragment : Fragment() {
super.onDestroy()
}
- val installAddon =
+ private val installAddon =
registerForActivityResult(ActivityResultContracts.OpenDocumentTree()) { result ->
if (result == null) {
return@registerForActivityResult
@@ -175,6 +182,17 @@ class AddonsFragment : Fragment() {
}
}
+ /**
+  * Launcher for the multi-document picker used when installing updates/DLC.
+  * The picked documents are handed to [InstallableActions.verifyAndInstallContent]
+  * together with the program ID read from this fragment's `args.game`.
+  */
+ private val installGameUpdate =
+     registerForActivityResult(ActivityResultContracts.OpenMultipleDocuments()) { pickedDocuments ->
+         val hostActivity = requireActivity()
+         InstallableActions.verifyAndInstallContent(
+             activity = hostActivity,
+             fragmentManager = parentFragmentManager,
+             addonViewModel = addonViewModel,
+             documents = pickedDocuments,
+             programId = args.game.programId
+         )
+     }
+
private fun setInsets() =
ViewCompat.setOnApplyWindowInsetsListener(
binding.root
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/ContentTypeSelectionDialogFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/ContentTypeSelectionDialogFragment.kt
index 880c2ff3bf..fb59a3a52c 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/ContentTypeSelectionDialogFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/ContentTypeSelectionDialogFragment.kt
@@ -8,18 +8,14 @@ package org.yuzu.yuzu_emu.fragments
import android.app.Dialog
import android.content.DialogInterface
-import android.net.Uri
import android.os.Bundle
-import androidx.activity.result.contract.ActivityResultContracts
import androidx.fragment.app.DialogFragment
import androidx.fragment.app.activityViewModels
import androidx.preference.PreferenceManager
import com.google.android.material.dialog.MaterialAlertDialogBuilder
-import org.yuzu.yuzu_emu.NativeLibrary
import org.yuzu.yuzu_emu.R
import org.yuzu.yuzu_emu.YuzuApplication
import org.yuzu.yuzu_emu.model.AddonViewModel
-import org.yuzu.yuzu_emu.utils.InstallableActions
class ContentTypeSelectionDialogFragment : DialogFragment() {
private val addonViewModel: AddonViewModel by activityViewModels()
@@ -29,52 +25,6 @@ class ContentTypeSelectionDialogFragment : DialogFragment() {
private var selectedItem = 0
- private val installGameUpdateLauncher =
- registerForActivityResult(ActivityResultContracts.OpenMultipleDocuments()) { documents ->
- if (documents.isEmpty()) {
- return@registerForActivityResult
- }
-
- val game = addonViewModel.game
- if (game == null) {
- installContent(documents)
- return@registerForActivityResult
- }
-
- ProgressDialogFragment.newInstance(
- requireActivity(),
- R.string.verifying_content,
- false
- ) { _, _ ->
- var updatesMatchProgram = true
- for (document in documents) {
- val valid = NativeLibrary.doesUpdateMatchProgram(
- game.programId,
- document.toString()
- )
- if (!valid) {
- updatesMatchProgram = false
- break
- }
- }
-
- requireActivity().runOnUiThread {
- if (updatesMatchProgram) {
- installContent(documents)
- } else {
- MessageDialogFragment.newInstance(
- requireActivity(),
- titleId = R.string.content_install_notice,
- descriptionId = R.string.content_install_notice_description,
- positiveAction = { installContent(documents) },
- negativeAction = {}
- ).show(parentFragmentManager, MessageDialogFragment.TAG)
- }
- }
- return@newInstance Any()
- }.show(parentFragmentManager, ProgressDialogFragment.TAG)
- }
-
override fun onCreateDialog(savedInstanceState: Bundle?): Dialog {
val launchOptions =
arrayOf(getString(R.string.updates_and_dlc), getString(R.string.mods_and_cheats))
@@ -87,7 +37,10 @@ class ContentTypeSelectionDialogFragment : DialogFragment() {
.setTitle(R.string.select_content_type)
.setPositiveButton(android.R.string.ok) { _: DialogInterface, _: Int ->
when (selectedItem) {
- 0 -> installGameUpdateLauncher.launch(arrayOf("*/*"))
+ 0 -> parentFragmentManager.setFragmentResult(
+ REQUEST_INSTALL_GAME_UPDATE,
+ Bundle()
+ )
else -> {
if (!preferences.getBoolean(MOD_NOTICE_SEEN, false)) {
preferences.edit().putBoolean(MOD_NOTICE_SEEN, true).apply()
@@ -112,17 +65,9 @@ class ContentTypeSelectionDialogFragment : DialogFragment() {
companion object {
const val TAG = "ContentTypeSelectionDialogFragment"
+ const val REQUEST_INSTALL_GAME_UPDATE = "RequestInstallGameUpdate"
private const val SELECTED_ITEM = "SelectedItem"
private const val MOD_NOTICE_SEEN = "ModNoticeSeen"
}
-
- private fun installContent(documents: List) {
- InstallableActions.installContent(
- activity = requireActivity(),
- fragmentManager = parentFragmentManager,
- addonViewModel = addonViewModel,
- documents = documents
- )
- }
}
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt
index 89a6362dc6..877097dc80 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt
@@ -142,6 +142,17 @@ class DriverManagerFragment : Fragment() {
driverViewModel.onCloseDriverManager(args.game)
}
+ /** Refreshes the driver list each time the fragment is resumed. */
+ override fun onResume() {
+     super.onResume()
+     refreshDriverList()
+ }
+
+ /**
+  * Asks the view model to reload its driver data and, when the list view is backed
+  * by a [DriverAdapter], replaces the adapter contents with the reloaded list.
+  */
+ private fun refreshDriverList() {
+     driverViewModel.reloadDriverData()
+     // Nothing to refresh when the list has no DriverAdapter attached.
+     val driverAdapter = binding.listDrivers.adapter as? DriverAdapter ?: return
+     driverAdapter.replaceList(driverViewModel.driverList.value)
+ }
+
private fun setInsets() =
ViewCompat.setOnApplyWindowInsetsListener(
binding.root
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt
index 10862c37b4..6510c069e3 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt
@@ -227,66 +227,13 @@ class InstallableFragment : Fragment() {
private val installGameUpdateLauncher =
registerForActivityResult(ActivityResultContracts.OpenMultipleDocuments()) { documents ->
- if (documents.isEmpty()) {
- return@registerForActivityResult
- }
-
- if (addonViewModel.game == null) {
- InstallableActions.installContent(
- activity = requireActivity(),
- fragmentManager = parentFragmentManager,
- addonViewModel = addonViewModel,
- documents = documents
- )
- return@registerForActivityResult
- }
-
- ProgressDialogFragment.newInstance(
- requireActivity(),
- R.string.verifying_content,
- false
- ) { _, _ ->
- var updatesMatchProgram = true
- for (document in documents) {
- val valid = NativeLibrary.doesUpdateMatchProgram(
- addonViewModel.game!!.programId,
- document.toString()
- )
- if (!valid) {
- updatesMatchProgram = false
- break
- }
- }
-
- if (updatesMatchProgram) {
- requireActivity().runOnUiThread {
- InstallableActions.installContent(
- activity = requireActivity(),
- fragmentManager = parentFragmentManager,
- addonViewModel = addonViewModel,
- documents = documents
- )
- }
- } else {
- requireActivity().runOnUiThread {
- MessageDialogFragment.newInstance(
- requireActivity(),
- titleId = R.string.content_install_notice,
- descriptionId = R.string.content_install_notice_description,
- positiveAction = {
- InstallableActions.installContent(
- activity = requireActivity(),
- fragmentManager = parentFragmentManager,
- addonViewModel = addonViewModel,
- documents = documents
- )
- },
- negativeAction = {}
- ).show(parentFragmentManager, MessageDialogFragment.TAG)
- }
- }
- return@newInstance Any()
- }.show(parentFragmentManager, ProgressDialogFragment.TAG)
+ InstallableActions.verifyAndInstallContent(
+ activity = requireActivity(),
+ fragmentManager = parentFragmentManager,
+ addonViewModel = addonViewModel,
+ documents = documents,
+ programId = addonViewModel.game?.programId
+ )
}
private val importUserDataLauncher =
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/DriverViewModel.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/DriverViewModel.kt
index cd5792b33a..fc7fbc9bfc 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/DriverViewModel.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/DriverViewModel.kt
@@ -71,6 +71,8 @@ class DriverViewModel : ViewModel() {
fun reloadDriverData() {
_areDriversLoading.value = true
driverData = GpuDriverHelper.getDrivers()
+ .filterNot { driversToDelete.contains(it.first) }
+ .toMutableList()
updateDriverList()
_areDriversLoading.value = false
}
@@ -167,26 +169,25 @@ class DriverViewModel : ViewModel() {
fun onCloseDriverManager(game: Game?) {
_isDeletingDrivers.value = true
- updateDriverNameForGame(game)
- if (game == null) {
- NativeConfig.saveGlobalConfig()
- } else {
- NativeConfig.savePerGameConfig()
- NativeConfig.unloadPerGameConfig()
- NativeConfig.reloadGlobalConfig()
- }
-
- viewModelScope.launch {
- withContext(Dispatchers.IO) {
- driversToDelete.forEach {
- val driver = File(it)
- if (driver.exists()) {
- driver.delete()
- }
- }
- driversToDelete.clear()
- _isDeletingDrivers.value = false
+ try {
+ updateDriverNameForGame(game)
+ if (game == null) {
+ NativeConfig.saveGlobalConfig()
+ } else {
+ NativeConfig.savePerGameConfig()
+ NativeConfig.unloadPerGameConfig()
+ NativeConfig.reloadGlobalConfig()
}
+
+ driversToDelete.forEach {
+ val driver = File(it)
+ if (driver.exists()) {
+ driver.delete()
+ }
+ }
+ driversToDelete.clear()
+ } finally {
+ _isDeletingDrivers.value = false
}
}
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
index f0806df786..3a771edfcb 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt
@@ -32,7 +32,6 @@ import org.yuzu.yuzu_emu.databinding.ActivityMainBinding
import org.yuzu.yuzu_emu.dialogs.NetPlayDialog
import org.yuzu.yuzu_emu.features.settings.model.Settings
import org.yuzu.yuzu_emu.fragments.AddGameFolderDialogFragment
-import org.yuzu.yuzu_emu.fragments.ProgressDialogFragment
import org.yuzu.yuzu_emu.fragments.MessageDialogFragment
import org.yuzu.yuzu_emu.model.AddonViewModel
import org.yuzu.yuzu_emu.model.DriverViewModel
@@ -479,49 +478,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider {
)
}
- val installGameUpdate = registerForActivityResult(
- ActivityResultContracts.OpenMultipleDocuments()
- ) { documents: List ->
- if (documents.isEmpty()) {
- return@registerForActivityResult
- }
-
- if (addonViewModel.game == null) {
- installContent(documents)
- return@registerForActivityResult
- }
-
- ProgressDialogFragment.newInstance(
- this@MainActivity,
- R.string.verifying_content,
- false
- ) { _, _ ->
- var updatesMatchProgram = true
- for (document in documents) {
- val valid = NativeLibrary.doesUpdateMatchProgram(
- addonViewModel.game!!.programId,
- document.toString()
- )
- if (!valid) {
- updatesMatchProgram = false
- break
- }
- }
-
- if (updatesMatchProgram) {
- homeViewModel.setContentToInstall(documents)
- } else {
- MessageDialogFragment.newInstance(
- this@MainActivity,
- titleId = R.string.content_install_notice,
- descriptionId = R.string.content_install_notice_description,
- positiveAction = { homeViewModel.setContentToInstall(documents) },
- negativeAction = {}
- )
- }
- }.show(supportFragmentManager, ProgressDialogFragment.TAG)
- }
-
private fun installContent(documents: List) {
InstallableActions.installContent(
activity = this,
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/InstallableActions.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/InstallableActions.kt
index d385e2a095..882bae965b 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/InstallableActions.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/InstallableActions.kt
@@ -26,6 +26,78 @@ import java.util.zip.ZipEntry
import java.util.zip.ZipInputStream
object InstallableActions {
+ /**
+  * Checks the selected update/DLC documents against [programId] before installing.
+  *
+  * Nothing happens when [documents] is empty. When [programId] is null the check is
+  * skipped and [onInstallConfirmed] runs immediately. Otherwise each document is
+  * passed to [NativeLibrary.doesUpdateMatchProgram] inside the task handed to
+  * [ProgressDialogFragment]; if all of them match, [onInstallConfirmed] runs on the
+  * UI thread, and if any does not, a notice dialog is shown whose positive action
+  * is [onInstallConfirmed].
+  *
+  * @param activity host used for the dialogs and for posting back to the UI thread
+  * @param fragmentManager manager the progress and notice dialogs are shown with
+  * @param documents URIs chosen in the document picker
+  * @param programId program the content is expected to belong to, or null to skip the check
+  * @param onInstallConfirmed performs the actual installation
+  */
+ private fun verifyGameContentAndInstall(
+     activity: FragmentActivity,
+     fragmentManager: FragmentManager,
+     documents: List<Uri>,
+     programId: String?,
+     onInstallConfirmed: () -> Unit
+ ) {
+     if (documents.isEmpty()) {
+         return
+     }
+
+     if (programId == null) {
+         onInstallConfirmed()
+         return
+     }
+
+     ProgressDialogFragment.newInstance(
+         activity,
+         R.string.verifying_content,
+         false
+     ) { _, _ ->
+         // `all` short-circuits at the first document that does not match.
+         val updatesMatchProgram = documents.all { document ->
+             NativeLibrary.doesUpdateMatchProgram(programId, document.toString())
+         }
+
+         activity.runOnUiThread {
+             if (updatesMatchProgram) {
+                 onInstallConfirmed()
+             } else {
+                 // Mismatch: let the user decide whether to install anyway.
+                 MessageDialogFragment.newInstance(
+                     activity,
+                     titleId = R.string.content_install_notice,
+                     descriptionId = R.string.content_install_notice_description,
+                     positiveAction = onInstallConfirmed,
+                     negativeAction = {}
+                 ).show(fragmentManager, MessageDialogFragment.TAG)
+             }
+         }
+         return@newInstance Any()
+     }.show(fragmentManager, ProgressDialogFragment.TAG)
+ }
+
+ /**
+  * Verifies [documents] against [programId] and then installs them.
+  *
+  * Delegates the check to [verifyGameContentAndInstall]; the install step it is
+  * given calls [installContent] with the same activity, fragment manager,
+  * view model and documents.
+  *
+  * @param activity host activity passed through to verification and installation
+  * @param fragmentManager manager passed through for showing dialogs
+  * @param addonViewModel view model forwarded to [installContent]
+  * @param documents URIs chosen in the document picker
+  * @param programId program the content is expected to belong to, or null to skip the check
+  */
+ fun verifyAndInstallContent(
+     activity: FragmentActivity,
+     fragmentManager: FragmentManager,
+     addonViewModel: AddonViewModel,
+     documents: List<Uri>,
+     programId: String?
+ ) {
+     verifyGameContentAndInstall(
+         activity = activity,
+         fragmentManager = fragmentManager,
+         documents = documents,
+         programId = programId
+     ) {
+         installContent(
+             activity = activity,
+             fragmentManager = fragmentManager,
+             addonViewModel = addonViewModel,
+             documents = documents
+         )
+     }
+ }
+
fun processKey(
activity: FragmentActivity,
fragmentManager: FragmentManager,
diff --git a/src/android/app/src/main/jni/android_settings.h b/src/android/app/src/main/jni/android_settings.h
index 606ce2ce84..8628021f75 100644
--- a/src/android/app/src/main/jni/android_settings.h
+++ b/src/android/app/src/main/jni/android_settings.h
@@ -56,7 +56,7 @@ namespace AndroidSettings {
Settings::Setting theme{linkage, 0, "theme", Settings::Category::Android};
Settings::Setting theme_mode{linkage, -1, "theme_mode", Settings::Category::Android};
- Settings::Setting static_theme_color{linkage, 5, "static_theme_color", Settings::Category::Android};
+ Settings::Setting static_theme_color{linkage, 0, "static_theme_color", Settings::Category::Android};
Settings::Setting black_backgrounds{linkage, false, "black_backgrounds",
Settings::Category::Android};
Settings::Setting app_language{linkage, 0, "app_language", Settings::Category::Android};
diff --git a/src/android/app/src/main/res/drawable/ic_launcher_foreground.png b/src/android/app/src/main/res/drawable/ic_launcher_foreground.png
index 1ccbbd4a5d..53f1cace9b 100644
Binary files a/src/android/app/src/main/res/drawable/ic_launcher_foreground.png and b/src/android/app/src/main/res/drawable/ic_launcher_foreground.png differ
diff --git a/src/android/app/src/main/res/drawable/ic_yuzu.png b/src/android/app/src/main/res/drawable/ic_yuzu.png
index c03a370305..fce02afa1f 100644
Binary files a/src/android/app/src/main/res/drawable/ic_yuzu.png and b/src/android/app/src/main/res/drawable/ic_yuzu.png differ
diff --git a/src/android/app/src/main/res/drawable/ic_yuzu_splash.png b/src/android/app/src/main/res/drawable/ic_yuzu_splash.png
index 2500b856b5..0e43cb9374 100644
Binary files a/src/android/app/src/main/res/drawable/ic_yuzu_splash.png and b/src/android/app/src/main/res/drawable/ic_yuzu_splash.png differ
diff --git a/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png
index f83b149c8e..23bc2897c3 100644
Binary files a/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png
index a790c42402..f630e793e3 100644
Binary files a/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png
index 2719752e3c..1daa3c624f 100644
Binary files a/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png
index c9df40719b..7fc64e1393 100644
Binary files a/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png
index 919453824f..53ed9b9914 100644
Binary files a/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/values/colors.xml b/src/android/app/src/main/res/values/colors.xml
index 7c3dd1a8c2..472567b323 100644
--- a/src/android/app/src/main/res/values/colors.xml
+++ b/src/android/app/src/main/res/values/colors.xml
@@ -1 +1 @@
-#3cce5bff
+#1F143C
diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml
index 83c04ad40e..e9bd0f1d1c 100644
--- a/src/android/app/src/main/res/values/strings.xml
+++ b/src/android/app/src/main/res/values/strings.xml
@@ -1227,7 +1227,7 @@
Blue
Cyan
Red
- Green (Default)
+ Green
Yellow
Orange
Pink
diff --git a/src/core/hle/api_version.h b/src/core/hle/api_version.h
index a28930a59a..bffd30d863 100644
--- a/src/core/hle/api_version.h
+++ b/src/core/hle/api_version.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
@@ -14,25 +14,25 @@ namespace HLE::ApiVersion {
// Horizon OS version constants.
-constexpr u8 HOS_VERSION_MAJOR = 21;
-constexpr u8 HOS_VERSION_MINOR = 2;
+constexpr u8 HOS_VERSION_MAJOR = 22;
+constexpr u8 HOS_VERSION_MINOR = 0;
constexpr u8 HOS_VERSION_MICRO = 0;
// NintendoSDK version constants.
-constexpr u8 SDK_REVISION_MAJOR = 1;
+constexpr u8 SDK_REVISION_MAJOR = 3;
constexpr u8 SDK_REVISION_MINOR = 0;
constexpr char PLATFORM_STRING[] = "NX";
-constexpr char VERSION_HASH[] = "ff8d6ddacae7c7fd1287e22c3c88bb961acb290c";
-constexpr char DISPLAY_VERSION[] = "21.2.0";
-constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 21.2.0-1.0";
+constexpr char VERSION_HASH[] = "da42070c4ad25840c9ee25344bde9d0a8584f5a9";
+constexpr char DISPLAY_VERSION[] = "22.0.0";
+constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 22.0.0-3.0";
// Atmosphere version constants.
constexpr u8 ATMOSPHERE_RELEASE_VERSION_MAJOR = 1;
-constexpr u8 ATMOSPHERE_RELEASE_VERSION_MINOR = 9;
-constexpr u8 ATMOSPHERE_RELEASE_VERSION_MICRO = 1;
+constexpr u8 ATMOSPHERE_RELEASE_VERSION_MINOR = 10;
+constexpr u8 ATMOSPHERE_RELEASE_VERSION_MICRO = 2;
constexpr u32 AtmosphereTargetFirmwareWithRevision(u8 major, u8 minor, u8 micro, u8 rev) {
return u32{major} << 24 | u32{minor} << 16 | u32{micro} << 8 | u32{rev};
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
index 1f96939d88..704cc621d4 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -40,33 +40,21 @@ template
static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
(code.*fn)(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
-template
-static void EmitAVXVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-
- (code.*fn)(xmm_a, xmm_a, xmm_b);
-
- ctx.reg_alloc.DefineValue(code, inst, xmm_a);
-}
-
template
static void EmitOneArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 2 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -88,8 +76,8 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 2 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -113,9 +101,9 @@ static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 3 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -141,9 +129,9 @@ static void EmitTwoArgumentFallbackWithSaturationAndImmediate(BlockOfCode& code,
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 2 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
const u8 arg2 = args[1].GetImmediateU8();
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -168,9 +156,9 @@ static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Ins
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 3 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -196,7 +184,7 @@ void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
// TODO: DefineValue directly on Argument for index == 0
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
if (code.HasHostFeature(HostFeature::SSE41)) {
@@ -220,7 +208,7 @@ void EmitX64::EmitVectorGetElement16(EmitContext& ctx, IR::Inst* inst) {
// TODO: DefineValue directly on Argument for index == 0
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.pextrw(dest, source, index);
ctx.reg_alloc.DefineValue(code, inst, dest);
@@ -236,10 +224,10 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
code.pextrd(dest, source, index);
} else {
- const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshufd(source, source, index);
code.movd(dest, source);
}
@@ -255,7 +243,7 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
if (index == 0) {
// TODO: DefineValue directly on Argument for index == 0
const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64();
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
code.movq(dest, source);
ctx.reg_alloc.DefineValue(code, inst, dest);
return;
@@ -264,10 +252,10 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64();
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
code.pextrq(dest, source, 1);
} else {
- const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.punpckhqdq(source, source);
code.movq(dest, source);
}
@@ -279,7 +267,7 @@ void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt8();
@@ -312,7 +300,7 @@ void EmitX64::EmitVectorSetElement16(EmitContext& ctx, IR::Inst* inst) {
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Reg16 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt16();
code.pinsrw(source_vector, source_elem.cvt32(), index);
@@ -324,7 +312,7 @@ void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt32();
@@ -347,7 +335,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]);
@@ -357,7 +345,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, source_vector);
} else {
const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movq(tmp, source_elem);
@@ -371,72 +359,53 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
}
}
-static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::SSSE3)) {
- code.pabsb(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.pxor(temp, temp);
- code.psubb(temp, data);
- code.pminub(data, temp);
- }
-}
-
-static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::SSSE3)) {
- code.pabsw(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.pxor(temp, temp);
- code.psubw(temp, data);
- code.pmaxsw(data, temp);
- }
-}
-
-static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::SSSE3)) {
- code.pabsd(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.movdqa(temp, data);
- code.psrad(temp, 31);
- code.pxor(data, temp);
- code.psubd(data, temp);
- }
-}
-
-static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- code.vpabsq(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.pshufd(temp, data, 0b11110101);
- code.psrad(temp, 31);
- code.pxor(data, temp);
- code.psubq(data, temp);
- }
-}
-
static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]); // operand register; modified in place and defined as the result after the switch
switch (esize) {
case 8:
- VectorAbs8(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) { // pabsb is only emitted when the host reports SSSE3
+ code.pabsb(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code); // scratch register for the negated copy
+ code.pxor(temp, temp); // temp = 0
+ code.psubb(temp, data); // temp = 0 - data (per byte)
+ code.pminub(data, temp); // unsigned minimum of x and -x is |x| for bytes
+ }
break;
case 16:
- VectorAbs16(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) { // pabsw is only emitted when the host reports SSSE3
+ code.pabsw(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code); // scratch register for the negated copy
+ code.pxor(temp, temp); // temp = 0
+ code.psubw(temp, data); // temp = 0 - data (per word)
+ code.pmaxsw(data, temp); // signed maximum of x and -x
+ }
break;
case 32:
- VectorAbs32(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) { // pabsd is only emitted when the host reports SSSE3
+ code.pabsd(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code); // scratch register for the sign mask
+ code.movdqa(temp, data);
+ code.psrad(temp, 31); // temp = all ones where the dword is negative, else zero
+ code.pxor(data, temp); // (x ^ mask) - mask negates exactly the negative dwords
+ code.psubd(data, temp);
+ }
break;
case 64:
- VectorAbs64(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) { // vpabsq is only emitted behind the AVX512_Ortho feature check
+ code.vpabsq(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code); // scratch register for the sign mask
+ code.pshufd(temp, data, 0b11110101); // copy each qword's high dword into both of its dword slots
+ code.psrad(temp, 31); // temp = all ones where the qword is negative, else zero
+ code.pxor(data, temp); // (x ^ mask) - mask negates exactly the negative qwords
+ code.psubq(data, temp);
+ }
break;
}
-
ctx.reg_alloc.DefineValue(code, inst, data);
}
@@ -479,15 +448,15 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorAndNot(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]); // source operand of pandn; only read
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]); // pandn destination: inverted, ANDed with xmm_a, then defined as the result
code.pandn(xmm_b, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_b);
}
-static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
+static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, auto const& result, u8 shift_amount) {
if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = shift_amount < 8
? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
@@ -496,7 +465,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const
return;
}
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpckhbw(tmp, result);
code.punpcklbw(result, result);
@@ -508,7 +477,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const
void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
ArithmeticShiftRightByte(ctx, code, result, shift_amount);
@@ -519,7 +488,7 @@ void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psraw(result, shift_amount);
@@ -530,7 +499,7 @@ void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrad(result, shift_amount);
@@ -540,14 +509,14 @@ void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = (std::min)(args[1].GetImmediateU8(), u8(63));
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vpsraq(result, result, shift_amount);
} else {
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
const u64 sign_bit = 0x80000000'00000000u >> shift_amount;
@@ -662,12 +631,12 @@ void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
code.vmovq(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
code.movq(a, a);
@@ -680,7 +649,7 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(a, a, 0);
@@ -689,7 +658,7 @@ void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(a, a, 0b01000100);
@@ -698,11 +667,11 @@ void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
} else {
@@ -715,7 +684,7 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastw(a, a);
} else {
@@ -727,7 +696,7 @@ void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastd(a, a);
} else {
@@ -738,7 +707,7 @@ void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastq(a, a);
} else {
@@ -749,7 +718,7 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 16);
@@ -760,7 +729,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
code.vpbroadcastb(a, a);
code.vmovq(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
code.movq(a, a);
@@ -773,7 +742,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 8);
@@ -786,7 +755,7 @@ void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 4);
@@ -802,7 +771,7 @@ void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 16);
@@ -812,7 +781,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
@@ -826,7 +795,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 8);
@@ -846,7 +815,7 @@ void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 4);
@@ -858,7 +827,7 @@ void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElement64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 2);
@@ -1045,9 +1014,9 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
code.pand(lhs, tmp);
@@ -1059,11 +1028,11 @@ void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
+ auto const zero = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zero, zero);
code.pblendw(lhs, zero, 0b10101010);
@@ -1084,8 +1053,8 @@ void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufps(lhs, rhs, 0b10001000);
@@ -1094,8 +1063,8 @@ void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufpd(lhs, rhs, 0b00);
@@ -1104,16 +1073,16 @@ void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklbw(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080));
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
code.pand(lhs, tmp);
@@ -1128,15 +1097,15 @@ void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklwd(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080));
} else {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.pslld(lhs, 16);
code.psrad(lhs, 16);
@@ -1154,8 +1123,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
if (code.HasHostFeature(HostFeature::SSE41)) {
// copy bytes 0:3 of rhs to lhs, zero out upper 8 bytes
@@ -1170,8 +1139,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psraw(lhs, 8);
code.psraw(rhs, 8);
@@ -1182,8 +1151,8 @@ void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psrad(lhs, 16);
code.psrad(rhs, 16);
@@ -1194,8 +1163,8 @@ void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufps(lhs, rhs, 0b11011101);
@@ -1204,8 +1173,8 @@ void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufpd(lhs, rhs, 0b11);
@@ -1214,15 +1183,15 @@ void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklbw(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080));
} else {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psraw(lhs, 8);
code.psraw(rhs, 8);
@@ -1236,15 +1205,15 @@ void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklwd(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080));
} else {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psrad(lhs, 16);
code.psrad(rhs, 16);
@@ -1260,17 +1229,17 @@ void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst)
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
// copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes
code.insertps(rhs, lhs, 0b01001100);
ctx.reg_alloc.DefineValue(code, inst, rhs);
} else {
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const zero = ctx.reg_alloc.ScratchXmm(code);
code.xorps(zero, zero);
code.unpcklps(lhs, rhs);
@@ -1304,9 +1273,9 @@ void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqd(xmm_a, xmm_b);
code.pshufd(tmp, xmm_a, 0b10110001);
@@ -1319,9 +1288,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqq(xmm_a, xmm_b);
code.pshufd(tmp, xmm_a, 0b01001110);
@@ -1329,9 +1298,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqd(xmm_a, xmm_b);
code.pshufd(tmp, xmm_a, 0b10110001);
@@ -1355,16 +1324,16 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.palignr(xmm_b, xmm_a, position / 8);
ctx.reg_alloc.DefineValue(code, inst, xmm_b);
return;
}
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psrldq(xmm_a, position / 8);
code.pslldq(xmm_b, (128 - position) / 8);
@@ -1376,13 +1345,13 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorExtractLower(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 position = args[2].GetImmediateU8();
ASSERT(position % 8 == 0);
if (position != 0) {
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklqdq(xmm_a, xmm_b);
code.psrldq(xmm_a, position / 8);
@@ -1407,22 +1376,33 @@ void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE42)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq);
- return;
+ } else { // no pcmpgtq available: build the signed 64-bit compare from 32-bit compares
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); // first operand; ends up holding the result mask
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); // second operand; clobbered below
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x80000000, 0x80000000)); // presumably the two 64-bit halves of the constant: bit 31 of each lane's low dword
+ code.pxor(tmp0, tmp2); // flip that bit in both operands so the signed pcmpgtd below
+ code.pxor(tmp1, tmp2); // orders the low dwords as unsigned values
+ code.movdqa(tmp2, tmp0); // keep a copy of the biased first operand for the greater-than compare
+ code.pcmpeqd(tmp0, tmp1); // tmp0 = per-dword "first == second"
+ code.pcmpgtd(tmp2, tmp1); // tmp2 = per-dword "first > second" (signed)
+ code.pshufd(tmp1, tmp0, 245); // 245 == 0b11110101: high-dword equality copied to both dwords of each lane
+ code.pshufd(tmp3, tmp2, 160); // 160 == 0b10100000: low-dword greater-than copied to both dwords of each lane
+ code.pshufd(tmp0, tmp2, 245); // high-dword greater-than copied to both dwords of each lane
+ code.pand(tmp1, tmp3); // high dwords equal AND low dword greater
+ code.por(tmp0, tmp1); // result = high greater OR (high equal AND low greater)
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- for (size_t i = 0; i < result.size(); ++i) {
- result[i] = (a[i] > b[i]) ? ~u64(0) : 0;
- }
- });
}
static void EmitVectorHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, b);
code.pand(tmp, a);
@@ -1461,9 +1441,9 @@ void EmitX64::EmitVectorHalvingAddS32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, b);
@@ -1506,12 +1486,12 @@ void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
switch (esize) {
case 8: {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
code.pxor(a, tmp);
code.pxor(b, tmp);
@@ -1520,7 +1500,7 @@ static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst*
break;
}
case 16: {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
code.pxor(a, tmp);
code.pxor(b, tmp);
@@ -1554,8 +1534,8 @@ void EmitX64::EmitVectorHalvingSubS32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorHalvingSubUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
switch (esize) {
case 8:
@@ -1592,8 +1572,8 @@ void EmitX64::EmitVectorHalvingSubU32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
switch (size) {
case 8:
@@ -1632,8 +1612,8 @@ void EmitX64::EmitVectorInterleaveLower64(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorInterleaveUpper(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
switch (size) {
case 8:
@@ -1672,7 +1652,7 @@ void EmitX64::EmitVectorInterleaveUpper64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
if (shift_amount == 0) {
@@ -1698,7 +1678,7 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psllw(result, shift_amount);
@@ -1709,7 +1689,7 @@ void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.pslld(result, shift_amount);
@@ -1720,7 +1700,7 @@ void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psllq(result, shift_amount);
@@ -1731,7 +1711,7 @@ void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
if (shift_amount == 0) {
@@ -1755,7 +1735,7 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrlw(result, shift_amount);
@@ -1766,7 +1746,7 @@ void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrld(result, shift_amount);
@@ -1777,7 +1757,7 @@ void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrlq(result, shift_amount);
@@ -1785,41 +1765,12 @@ void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, result);
}
-template
-static void EmitVectorLogicalVShiftAVX2(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
- static_assert(esize == 32 || esize == 64);
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-
- // store sign bit of lowest byte of each element of b to select left/right shift later
- ICODE(vpsll)(xmm0, b, u8(esize - 8));
-
- // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
- code.vpabsb(b, b);
- code.vpand(b, b, code.BConst(xword, 0xFF));
-
- // calculate shifts
- ICODE(vpsllv)(result, a, b);
- ICODE(vpsrlv)(a, a, b);
-
- // implicit argument: xmm0 (sign of lowest byte of b)
- if (esize == 32) {
- code.blendvps(result, a);
- } else {
- code.blendvpd(result, a);
- }
- ctx.reg_alloc.DefineValue(code, inst, result);
-}
-
void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::GFNI)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Opmask negative_mask = k1;
code.pxor(tmp, tmp);
@@ -1864,10 +1815,10 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
code.vpxord(right_shift, right_shift, right_shift);
@@ -1888,18 +1839,87 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
+ // Each 32-bit lane of args[0] is shifted by the signed low byte of the matching lane of args[1]:
+ // left when that byte is non-negative, right by its magnitude when it is negative.
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX2)) {
- EmitVectorLogicalVShiftAVX2<32>(code, ctx, inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
+ // store sign bit of lowest byte of each element of b to select left/right shift later
+ code.vpslld(mask, b, u8(32 - 8));
+ // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
+ code.vpabsb(b, b);
+ code.vpand(b, b, code.BConst<32>(xword, 0xFF));
+ // calculate shifts
+ code.vpsllvd(result, a, b);
+ code.vpsrlvd(a, a, b);
+ code.vblendvps(result, result, a, mask);
+ ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift);
- });
+ // SSE2-only path. tmp2 keeps the untouched value, tmp3 the shift magnitude n of each lane and
+ // tmp1 the mask of lanes that take the right-shift result.
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp6 = ctx.reg_alloc.ScratchXmm(code);
+ // tmp3 = min(0 - b, b) as unsigned bytes, then masked down to the low byte of every lane
+ code.pxor(tmp3, tmp3);
+ code.movdqa(tmp2, tmp0);
+ code.psubb(tmp3, tmp1);
+ code.movdqa(tmp4, tmp2);
+ code.movdqa(tmp6, tmp2);
+ code.pminub(tmp3, tmp1);
+ code.pslld(tmp1, 24);
+ code.pand(tmp3, code.Const(xword, 0x000000ff'000000ff, 0x000000ff'000000ff));
+ // tmp1 = all-ones in lanes whose count byte has its sign bit set
+ code.psrad(tmp1, 31);
+ // The multiply-based left shift below is only valid for counts 0..31: cvttps2dq returns
+ // 0x80000000 for 2^n once n >= 32, which would produce a << 31 instead of 0. Send those lanes
+ // through the right-shift result as well; psrld yields 0 for any count above 31, which is the
+ // same value vpsllvd produces in the AVX2 path. tmp5 is not live yet at this point.
+ code.movdqa(tmp5, tmp3);
+ code.pcmpgtd(tmp5, code.Const(xword, 0x0000001f'0000001f, 0x0000001f'0000001f));
+ code.por(tmp1, tmp5);
+ // psrld takes one count for the whole register, so every lane is shifted on its own and the
+ // four results are stitched back together in tmp0
+ code.pshuflw(tmp0, tmp3, 254);
+ code.pshuflw(tmp5, tmp3, 84);
+ code.psrld(tmp4, tmp0);
+ code.movdqa(tmp0, tmp2);
+ code.psrld(tmp0, tmp5);
+ code.punpcklqdq(tmp0, tmp4);
+ code.pshufd(tmp4, tmp3, 238);
+ // left shift: build the float 2^n from n (exponent field), convert it to an integer and multiply
+ code.pslld(tmp3, 23);
+ code.paddd(tmp3, code.Const(xword, 0x3F80'00003F80'0000, 0x3F80'00003F80'0000));
+ code.pshuflw(tmp5, tmp4, 254);
+ code.pshuflw(tmp4, tmp4, 84);
+ code.psrld(tmp6, tmp5);
+ code.movdqa(tmp5, tmp2);
+ code.psrld(tmp5, tmp4);
+ code.pshufd(tmp4, tmp2, 245);
+ code.punpckhqdq(tmp5, tmp6);
+ code.cvttps2dq(tmp3, tmp3);
+ code.shufps(tmp0, tmp5, 204);
+ code.pmuludq(tmp2, tmp3);
+ code.pshufd(tmp3, tmp3, 245);
+ code.andps(tmp0, tmp1);
+ code.pmuludq(tmp3, tmp4);
+ code.pshufd(tmp2, tmp2, 232);
+ code.pshufd(tmp3, tmp3, 232);
+ code.punpckldq(tmp2, tmp3);
+ // blend: right-shift result where tmp1 is set, left-shift result elsewhere
+ code.pandn(tmp1, tmp2);
+ code.orps(tmp0, tmp1);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
- EmitVectorLogicalVShiftAVX2<64>(code, ctx, inst);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
+ // store sign bit of lowest byte of each element of b to select left/right shift later
+ code.vpsllq(mask, b, u8(64 - 8));
+ // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
+ code.vpabsb(b, b);
+ code.vpand(b, b, code.BConst<64>(xword, 0xFF));
+ // calculate shifts
+ code.vpsllvq(result, a, b);
+ code.vpsrlvq(a, a, b);
+ code.vblendvpd(result, result, a, mask);
+ ctx.reg_alloc.DefineValue(code, inst, result);
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift);
@@ -1914,28 +1934,11 @@ enum class MinMaxOperation {
Max,
};
-// Compute the minimum/maximum of two vectors of signed 8-bit integers, using only SSE2 instructons.
-// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxS8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- if(op == MinMaxOperation::Min) {
- code.movdqa(c, b);
- code.pcmpgtb(c, a);
- } else {
- code.movdqa(c, a);
- code.pcmpgtb(c, b);
- }
-
- code.pand(a, c);
- code.pandn(c, b);
- code.por(a, c);
-}
-
// Compute the minimum/maximum of two vectors of unsigned 16-bit integers, using only SSE2 instructons.
// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
+void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
if(op == MinMaxOperation::Min) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psubusw(c, b);
code.psubw(a, c);
@@ -1947,8 +1950,8 @@ void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a,
// Compute the minimum/maximum of two vectors of signed 32-bit integers, using only SSE2 instructons.
// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
if(op == MinMaxOperation::Min) {
code.movdqa(c, b);
code.pcmpgtd(c, a);
@@ -1964,12 +1967,12 @@ void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a,
// Compute the minimum/maximum of two vectors of unsigned 32-bit integers, using only SSE2 instructons.
// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, code.BConst<32>(xword, 0x80000000));
// bias a and b by XORing their sign bits, then use the signed comparison function
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
if(op == MinMaxOperation::Min) {
code.movdqa(d, a);
code.pxor(d, c);
@@ -1991,11 +1994,16 @@ void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, a);
+ code.pcmpgtb(c, b);
+ code.pand(a, c);
+ code.pandn(c, b);
+ code.por(a, c);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2007,31 +2015,55 @@ void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1);
+ code.pand(tmp0, tmp2);
+ code.pandn(tmp2, tmp1);
+ code.por(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // fetched once, used by all three paths below
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpmaxsq(xmm_a, xmm_a, xmm_b); // result replaces the scratch copy of args[0]
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
code.vpcmpgtq(xmm0, y, x);
code.pblendvb(x, y);
-
ctx.reg_alloc.DefineValue(code, inst, x);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
- });
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); // a; overwritten with the result
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); // b; only read
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000)); // bit 31 of the low dword of each 64-bit lane
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2); // tmp3 = b with that bit flipped
+ code.pxor(tmp2, tmp0); // tmp2 = a with that bit flipped
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3); // per-dword: a == b
+ code.pcmpgtd(tmp4, tmp3); // per-dword signed: a > b
+ code.pshufd(tmp2, tmp2, 245); // copy the high-dword "equal" flag across each 64-bit lane
+ code.pshufd(tmp5, tmp4, 160); // copy the low-dword "greater" flag across each 64-bit lane
+ code.pshufd(tmp3, tmp4, 245); // copy the high-dword "greater" flag across each 64-bit lane
+ code.pand(tmp2, tmp5); // high equal AND low greater
+ code.por(tmp3, tmp2); // tmp3 = 64-bit a > b mask
+ code.pand(tmp0, tmp3); // keep a where a > b
+ code.pandn(tmp3, tmp1); // keep b elsewhere
+ code.por(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
@@ -2043,11 +2075,11 @@ void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2055,35 +2087,54 @@ void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // fetched once, used by all three paths below
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpmaxuq(xmm_a, xmm_a, xmm_b); // result replaces the scratch copy of args[0]
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp, y, xmm0);
code.vpsubq(xmm0, x, xmm0);
code.vpcmpgtq(xmm0, tmp, xmm0);
code.pblendvb(x, y);
-
ctx.reg_alloc.DefineValue(code, inst, x);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
- });
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); // a; overwritten with the result
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); // b; only read
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); // bit 31 of every dword
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2); // tmp3 = b with bit 31 of each dword flipped
+ code.pxor(tmp2, tmp0); // tmp2 = a with bit 31 of each dword flipped
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3); // per-dword: a == b
+ code.pcmpgtd(tmp4, tmp3); // per-dword signed compare of the flipped values
+ code.pshufd(tmp2, tmp2, 245); // copy the high-dword "equal" flag across each 64-bit lane
+ code.pshufd(tmp5, tmp4, 160); // copy the low-dword "greater" flag across each 64-bit lane
+ code.pshufd(tmp3, tmp4, 245); // copy the high-dword "greater" flag across each 64-bit lane
+ code.pand(tmp2, tmp5); // high equal AND low greater
+ code.por(tmp3, tmp2); // tmp3 = 64-bit a > b mask
+ code.pand(tmp0, tmp3); // keep a where a > b
+ code.pandn(tmp3, tmp1); // keep b elsewhere
+ code.por(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
@@ -2091,11 +2142,16 @@ void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, b);
+ code.pcmpgtb(c, a);
+ code.pand(a, c);
+ code.pandn(c, b);
+ code.por(a, c);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2107,31 +2163,51 @@ void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) {
+ // Per 64-bit lane signed minimum of args[0] and args[1]; one of three code paths is emitted
+ // depending on the host features.
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpminsq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.vpcmpgtq(xmm0, y, x);
code.pblendvb(y, x);
-
ctx.reg_alloc.DefineValue(code, inst, y);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
- });
+ // SSE2-only path. tmp0 (a) is only ever read, while tmp1 (b) is overwritten by the pand
+ // below, so b is the operand that must be acquired as scratch (same split as the AVX path).
+ auto const tmp0 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ // flip bit 31 of the low dword of each 64-bit lane in both operands before the dword compares
+ code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000));
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp0);
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3);
+ code.pcmpgtd(tmp4, tmp3);
+ // tmp2 = (high dwords a > b) OR (high dwords equal AND low dwords a > b), per 64-bit lane
+ code.pshufd(tmp3, tmp2, 245);
+ code.pshufd(tmp5, tmp4, 160);
+ code.pshufd(tmp2, tmp4, 245);
+ code.pand(tmp3, tmp5);
+ code.por(tmp2, tmp3);
+ // select b where a > b, a elsewhere
+ code.pand(tmp1, tmp2);
+ code.pandn(tmp2, tmp0);
+ code.por(tmp2, tmp1);
+ // the blended result already lives in tmp2, so it is defined directly without a copy
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
}
}
@@ -2143,11 +2219,11 @@ void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2155,57 +2231,93 @@ void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
+ // Per 64-bit lane unsigned minimum of args[0] and args[1]; one of three code paths is emitted
+ // depending on the host features.
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpminuq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp, y, xmm0);
code.vpsubq(xmm0, x, xmm0);
code.vpcmpgtq(xmm0, tmp, xmm0);
code.pblendvb(y, x);
-
ctx.reg_alloc.DefineValue(code, inst, y);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
- });
+ // SSE2-only path. tmp0 (a) is only ever read, while tmp1 (b) is overwritten by the pand
+ // below, so b is the operand that must be acquired as scratch (same split as the AVX path).
+ auto const tmp0 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ // flip bit 31 of every dword in both operands so the signed dword compares order them as unsigned
+ code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp0);
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3);
+ code.pcmpgtd(tmp4, tmp3);
+ // tmp2 = (high dwords a > b) OR (high dwords equal AND low dwords a > b), per 64-bit lane
+ code.pshufd(tmp3, tmp2, 245);
+ code.pshufd(tmp5, tmp4, 160);
+ code.pshufd(tmp2, tmp4, 245);
+ code.pand(tmp3, tmp5);
+ code.por(tmp2, tmp3);
+ // select b where a > b, a elsewhere
+ code.pand(tmp1, tmp2);
+ code.pandn(tmp2, tmp0);
+ code.por(tmp2, tmp1);
+ // the blended result already lives in tmp2, so it is defined directly without a copy
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
}
}
void EmitX64::EmitVectorMultiply8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(code);
-
- // TODO: Optimize
- code.movdqa(tmp_a, a);
- code.movdqa(tmp_b, b);
- code.pmullw(a, b);
- code.psrlw(tmp_a, 8);
- code.psrlw(tmp_b, 8);
- code.pmullw(tmp_a, tmp_b);
- code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
- code.psllw(tmp_a, 8);
- code.por(a, tmp_a);
-
- ctx.reg_alloc.DefineValue(code, inst, a);
+ // Byte-wise multiply keeping the low 8 bits of every product. x86 has no 8-bit vector multiply,
+ // so both paths are assembled from 16-bit multiplies.
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ // tmp1 is only ever a source operand in this path, so a read-only use is sufficient
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.vbroadcastss(tmp3, code.Const(dword, 0x00ff'00ff));
+ // even bytes: the low byte of each 16-bit product only depends on the low bytes of its inputs
+ code.vpmullw(tmp2, tmp1, tmp0);
+ // odd bytes: clear the even bytes of tmp0 so every pair sum of vpmaddubsw is one odd-byte product
+ code.vpandn(tmp0, tmp3, tmp0);
+ code.vpand(tmp2, tmp2, tmp3);
+ code.vpmaddubsw(tmp0, tmp1, tmp0);
+ code.vpsllw(tmp0, tmp0, 8);
+ code.vpor(tmp0, tmp2, tmp0);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, tmp0);
+ code.movdqa(tmp3, tmp1);
+ code.movdqa(tmp4, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ // spread the high and low eight bytes of each operand into 16-bit lanes
+ code.punpckhbw(tmp2, tmp2);
+ code.punpckhbw(tmp3, tmp3);
+ code.punpcklbw(tmp0, tmp0);
+ code.punpcklbw(tmp1, tmp1);
+ code.pmullw(tmp3, tmp2);
+ code.pmullw(tmp0, tmp1);
+ // keep the low byte of every product so packuswb repacks without saturating
+ code.pand(tmp3, tmp4);
+ code.pand(tmp0, tmp4);
+ code.packuswb(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ }
}
void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) {
@@ -2216,31 +2328,32 @@ void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
- code.movdqa(tmp, a);
- code.psrlq(a, 32);
- code.pmuludq(tmp, b);
- code.psrlq(b, 32);
- code.pmuludq(a, b);
- code.pshufd(tmp, tmp, 0b00001000);
- code.pshufd(b, a, 0b00001000);
- code.punpckldq(tmp, b);
-
- ctx.reg_alloc.DefineValue(code, inst, tmp);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp, a);
+ code.psrlq(a, 32);
+ code.pmuludq(tmp, b);
+ code.psrlq(b, 32);
+ code.pmuludq(a, b);
+ code.pshufd(tmp, tmp, 0b00001000);
+ code.pshufd(b, a, 0b00001000);
+ code.punpckldq(tmp, b);
+ ctx.reg_alloc.DefineValue(code, inst, tmp);
}
}
void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
- if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq);
- } else if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpmullq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
+ } else if (code.HasHostFeature(HostFeature::SSE41)) {
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(code);
@@ -2255,29 +2368,28 @@ void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
- code.movdqa(tmp1, a);
- code.movdqa(tmp2, a);
- code.movdqa(tmp3, b);
+ code.movdqa(tmp1, a);
+ code.movdqa(tmp2, a);
+ code.movdqa(tmp3, b);
- code.psrlq(tmp1, 32);
- code.psrlq(tmp3, 32);
+ code.psrlq(tmp1, 32);
+ code.psrlq(tmp3, 32);
- code.pmuludq(tmp2, b);
- code.pmuludq(tmp3, a);
- code.pmuludq(b, tmp1);
+ code.pmuludq(tmp2, b);
+ code.pmuludq(tmp3, a);
+ code.pmuludq(b, tmp1);
- code.paddq(b, tmp3);
- code.psllq(b, 32);
- code.paddq(tmp2, b);
+ code.paddq(b, tmp3);
+ code.psllq(b, 32);
+ code.paddq(tmp2, b);
- ctx.reg_alloc.DefineValue(code, inst, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
}
}
@@ -2309,15 +2421,15 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmovwb(result, a);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
@@ -2330,13 +2442,13 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmovdw(result, a);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pblendw(a, zeros, 0b10101010);
@@ -2354,15 +2466,15 @@ void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmovqd(result, a);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.shufps(a, zeros, 0b00001000);
@@ -2375,13 +2487,13 @@ void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const operand = ctx.reg_alloc.UseXmm(code, args[0]);
code.vpternlogq(result, operand, operand, u8(~Tern::c));
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqw(xmm_b, xmm_b);
code.pxor(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
@@ -2395,9 +2507,9 @@ void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpcklqdq(xmm_a, xmm_b);
code.movdqa(tmp, xmm_a);
@@ -2413,9 +2525,9 @@ void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpcklqdq(xmm_a, xmm_b);
if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -2436,9 +2548,9 @@ void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpcklqdq(xmm_a, xmm_b);
if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -2458,10 +2570,10 @@ void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.movdqa(d, b);
@@ -2480,17 +2592,17 @@ void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
code.phaddw(a, b);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.movdqa(d, b);
@@ -2510,17 +2622,17 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
code.phaddd(a, b);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.movdqa(d, b);
@@ -2537,9 +2649,9 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.punpcklqdq(a, b);
@@ -2552,8 +2664,8 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllw(a, 8);
@@ -2567,8 +2679,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.pslld(a, 16);
@@ -2582,18 +2694,18 @@ void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.vpsraq(c, a, 32);
code.vpsllq(a, a, 32);
code.vpsraq(a, a, 32);
code.vpaddq(a, a, c);
} else {
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllq(a, 32);
@@ -2615,8 +2727,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllw(a, 8);
@@ -2630,8 +2742,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.pslld(a, 16);
@@ -2645,8 +2757,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* ins
void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllq(a, 32);
@@ -2660,14 +2772,10 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* ins
template
static void PairedOperation(VectorArray& result, const VectorArray& x, const VectorArray& y, Function fn) {
const size_t range = x.size() / 2;
-
- for (size_t i = 0; i < range; i++) {
+ for (size_t i = 0; i < range; i++) // first half of result: fn applied to each adjacent pair of x
result[i] = fn(x[2 * i], x[2 * i + 1]);
- }
-
- for (size_t i = 0; i < range; i++) {
+ for (size_t i = 0; i < range; i++) // second half of result: fn applied to each adjacent pair of y
result[range + i] = fn(y[2 * i], y[2 * i + 1]);
- }
}
template
@@ -2688,11 +2796,6 @@ static void PairedMax(VectorArray& result, const VectorArray& x, const Vec
PairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
}
-template
-static void PairedMin(VectorArray& result, const VectorArray& x, const VectorArray& y) {
- PairedOperation(result, x, y, [](auto a, auto b) { return (std::min)(a, b); });
-}
-
template
static void LowerPairedMax(VectorArray& result, const VectorArray& x, const VectorArray& y) {
LowerPairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
@@ -2707,19 +2810,16 @@ template
static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
code.pshufb(x, tmp);
code.pshufb(y, tmp);
-
code.movaps(tmp, x);
code.shufps(tmp, y, 0b01'00'01'00);
-
code.shufps(x, y, 0b11'10'11'10);
-
if constexpr (std::is_member_function_pointer_v) {
(code.*fn)(x, tmp);
} else {
@@ -2732,21 +2832,17 @@ static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Ins
template
static void EmitVectorPairedMinMaxLower8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]); // first operand; after the pshufb shuffle its low qword holds the even-indexed bytes, and it is defined as the result
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]); // second operand; clobbered, its low qword receives the odd-indexed bytes via movhlps before fn(x, y) combines them
code.punpcklqdq(x, y);
code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
code.movhlps(y, x);
code.movq(x, x);
-
if constexpr (std::is_member_function_pointer_v) {
(code.*fn)(x, y);
} else {
fn(x, y);
}
-
ctx.reg_alloc.DefineValue(code, inst, x);
}
@@ -2754,9 +2850,9 @@ template
static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
// swap idxs 1 and 2 within 64-bit lanes so that both registers contain [even, odd, even, odd]-indexed pairs of elements
code.pshuflw(x, x, 0b11'01'10'00);
@@ -2782,63 +2878,31 @@ static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::In
ctx.reg_alloc.DefineValue(code, inst, x);
}
-template
-static void EmitVectorPairedMinMaxLower16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
- // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
- code.pshuflw(x, x, 0b11'01'10'00);
- code.pshuflw(y, y, 0b11'01'10'00);
-
- // move pairs of even/odd-indexed elements into one register each
-
- // tmp = x[0, 2], y[0, 2], 0s...
- code.movaps(tmp, y);
- code.insertps(tmp, x, 0b01001100);
- // x = x[1, 3], y[1, 3], 0s...
- code.insertps(x, y, 0b00011100);
-
- (code.*fn)(x, tmp);
-
- ctx.reg_alloc.DefineValue(code, inst, x);
-}
-
-static void EmitVectorPairedMinMaxLower32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
- // tmp = x[1], y[1], 0, 0
- code.movaps(tmp, y);
- code.insertps(tmp, x, 0b01001100);
- // x = x[0], y[0], 0, 0
- code.insertps(x, y, 0b00011100);
-
- (code.*fn)(x, tmp);
-
- ctx.reg_alloc.DefineValue(code, inst, x);
-}
void EmitX64::EmitVectorPairedMaxS8(EmitContext& ctx, IR::Inst* inst) {
+ // Pairwise signed 8-bit maximum: adjacent byte pairs of args[0] produce the low half of
+ // the result and adjacent byte pairs of args[1] produce the high half.
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // Deinterleave so that tmp = [even bytes of x | even bytes of y] and
+ // x = [odd bytes of x | odd bytes of y].
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+ code.pshufb(x, tmp);
+ code.pshufb(y, tmp);
+ code.movaps(tmp, x);
+ code.shufps(tmp, y, 0b01'00'01'00);
+ code.shufps(x, y, 0b11'10'11'10);
+ } else {
+ // pshufb is an SSSE3 instruction, so SSE2-only hosts split the bytes with
+ // mask/shift/pack instead (same scheme as the EmitVectorPairedMaxU8 fallback).
+ // Every word is in 0..255 before packing, so packuswb never saturates.
+ auto const y_even = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.movdqa(y_even, y);
+ code.pand(y_even, tmp);
+ code.pand(tmp, x);
+ code.packuswb(tmp, y_even);
+ code.psrlw(y, 8);
+ code.psrlw(x, 8);
+ code.packuswb(x, y);
+ }
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
- return;
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max);
- });
- return;
+ code.pmaxsb(x, tmp);
+ } else {
+ // pmaxsb needs SSE4.1; blend with a signed compare mask instead.
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(mask, x);
+ code.pcmpgtb(mask, tmp); // mask = (x > tmp) per byte
+ code.pand(x, mask); // keep x where it is the larger value
+ code.pandn(mask, tmp); // keep tmp everywhere else
+ code.por(x, mask);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- PairedMax(result, a, b);
- });
+ ctx.reg_alloc.DefineValue(code, inst, x);
}
void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) {
@@ -2848,9 +2912,9 @@ void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2868,12 +2932,24 @@ void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMaxU8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub);
- return;
+ } else { // SSE2-only path: separate even/odd bytes with mask, shift and pack, then take the unsigned max
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); // first operand; ends up holding the result
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); // second operand; clobbered
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); // keeps the low byte of every 16-bit lane
+ code.movdqa(tmp2, constant_00ff);
+ code.movdqa(tmp3, tmp1);
+ code.pand(tmp3, tmp2); // tmp3 = even-indexed bytes of the second operand, one per word
+ code.pand(tmp2, tmp0); // tmp2 = even-indexed bytes of the first operand, one per word
+ code.packuswb(tmp2, tmp3); // tmp2 = [even bytes of first | even bytes of second]
+ code.psrlw(tmp1, 8); // bring the odd-indexed bytes down into the low byte of each word
+ code.psrlw(tmp0, 8);
+ code.packuswb(tmp0, tmp1); // tmp0 = [odd bytes of first | odd bytes of second]
+ code.pmaxub(tmp0, tmp2); // max of each even/odd neighbour pair
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- PairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) {
@@ -2889,9 +2965,9 @@ void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2909,14 +2985,15 @@ void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min);
- });
+ } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+ // The shared helper deinterleaves with pshufb, so it is only usable from SSSE3 upwards.
+ // pminsb needs SSE4.1; blend with a signed compare mask instead.
+ EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& a, const auto& b) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, b);
+ code.pcmpgtb(c, a); // c = (b > a) per byte
+ code.pand(a, c); // keep a where it is the smaller value
+ code.pandn(c, b); // keep b everywhere else
+ code.por(a, c);
+ });
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- PairedMin(result, a, b);
- });
+ // SSE2-only host: split even/odd bytes with mask/shift/pack (no pshufb available).
+ // Every word is in 0..255 before packing, so packuswb never saturates.
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const evens = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(evens, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.movdqa(tmp, y);
+ code.pand(tmp, evens);
+ code.pand(evens, x);
+ code.packuswb(evens, tmp); // evens = [even bytes of x | even bytes of y]
+ code.psrlw(y, 8);
+ code.psrlw(x, 8);
+ code.packuswb(x, y); // x = [odd bytes of x | odd bytes of y]
+ // Signed minimum of each even/odd pair; tmp is free again and serves as the mask.
+ code.movdqa(tmp, evens);
+ code.pcmpgtb(tmp, x); // tmp = (evens > x) per byte
+ code.pand(x, tmp);
+ code.pandn(tmp, evens);
+ code.por(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
}
}
@@ -2927,9 +3004,9 @@ void EmitX64::EmitVectorPairedMinS16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2945,12 +3022,25 @@ void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinU8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminub);
- return;
+ } else { // SSE2-only path: separate even/odd bytes with shift, mask and pack, then take the unsigned min
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); // first operand; ends up holding the result
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]); // second operand; clobbered
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); // keeps the low byte of every 16-bit lane
+ code.movdqa(tmp2, tmp1);
+ code.psrlw(tmp2, 8); // tmp2 = odd-indexed bytes of the second operand, one per word
+ code.movdqa(tmp3, tmp0);
+ code.psrlw(tmp3, 8); // tmp3 = odd-indexed bytes of the first operand, one per word
+ code.packuswb(tmp3, tmp2); // tmp3 = [odd bytes of first | odd bytes of second]
+ code.movdqa(tmp2, constant_00ff);
+ code.pand(tmp1, tmp2); // even-indexed bytes of the second operand
+ code.pand(tmp0, tmp2); // even-indexed bytes of the first operand
+ code.packuswb(tmp0, tmp1); // tmp0 = [even bytes of first | even bytes of second]
+ code.pminub(tmp0, tmp3); // min of each even/odd neighbour pair
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- PairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) {
@@ -2966,9 +3056,9 @@ void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2984,41 +3074,88 @@ void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorPairedMaxLowerS8(EmitContext& ctx, IR::Inst* inst) {
+ // Pairwise signed 8-bit maximum over the low 64 bits of both operands.
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ // x = [low qword of x | low qword of y]
+ code.punpcklqdq(x, y);
+ // Deinterleave: low qword of x = even-indexed bytes, low qword of y = odd-indexed bytes.
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+ code.movhlps(y, x);
+ code.movq(x, x);
+ } else {
+ // pshufb is an SSSE3 instruction, so SSE2-only hosts split the bytes with
+ // mask/shift/pack. Every word is in 0..255 before packing, so packuswb never saturates.
+ code.movdqa(y, x);
+ code.pand(x, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.packuswb(x, x);
+ code.psrlw(y, 8);
+ code.packuswb(y, y);
+ // packuswb duplicated the packed bytes into the upper qword; clear both copies.
+ code.movq(x, x);
+ code.movq(y, y);
+ }
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
- return;
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max);
- });
- return;
+ code.pmaxsb(x, y);
+ } else {
+ // pmaxsb needs SSE4.1; blend with a signed compare mask instead.
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, x);
+ code.pcmpgtb(c, y); // c = (x > y) per byte
+ code.pand(x, c); // keep x where it is the larger value
+ code.pandn(c, y); // keep y everywhere else
+ code.por(x, c);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMax(result, a, b);
- });
+ ctx.reg_alloc.DefineValue(code, inst, x);
}
void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // pairwise signed 16-bit max over the low four words of each operand
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap word idxs 1 and 2 so that dword 0 holds the even-indexed pair [0, 2] and dword 1 the odd-indexed pair [1, 3]
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // move pairs of even/odd-indexed elements into one register each
+ // tmp = x[1, 3], y[1, 3], 0s... (insertps copies dword 1 of x into dword 0 of the y copy and zeroes the upper half)
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0, 2], y[0, 2], 0s... (insertps copies dword 0 of y into dword 1 of x and zeroes the upper half)
+ code.insertps(x, y, 0b00011100);
+ code.pmaxsw(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else { // no SSE4.1 (insertps unavailable): SSE2 shuffles only
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ code.punpcklwd(tmp0, tmp1); // tmp0 = x0,y0,x1,y1,x2,y2,x3,y3
+ code.pshufd(tmp1, tmp0, 232); // 0b11'10'10'00: low qword = x0,y0,x2,y2
+ code.pshuflw(tmp1, tmp1, 216); // 0b11'01'10'00: low qword = x0,x2,y0,y2 (even-indexed elements)
+ code.pshufd(tmp0, tmp0, 231); // 0b11'10'01'11: low qword = x3,y3,x1,y1
+ code.pshuflw(tmp0, tmp0, 114); // 0b01'11'00'10: low qword = x1,x3,y1,y3 (odd-indexed elements)
+ code.pmaxsw(tmp0, tmp1);
+ code.movq(tmp0, tmp0); // clear the upper 64 bits of the result
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // pairwise signed 32-bit max over the low two dwords of each operand
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pmaxsd(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else { // no SSE4.1 (pmaxsd/insertps unavailable): compare-and-blend with SSE2
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1); // tmp0 = x0,y0,x1,y1
+ code.pshufd(tmp1, tmp0, 238); // 0b11'10'11'10: tmp1 = x1,y1,x1,y1
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1); // tmp2 = (tmp0 > tmp1) per signed dword
+ code.pand(tmp0, tmp2); // keep tmp0 where it is the larger value
+ code.pandn(tmp2, tmp1); // keep tmp1 everywhere else
+ code.por(tmp2, tmp0);
+ code.movq(tmp0, tmp2); // copy the low 64 bits of the blend and clear the upper 64 bits
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) {
@@ -3033,63 +3170,143 @@ void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorPairedMaxLowerU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // pairwise unsigned 16-bit max over the low four words of each operand
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap word idxs 1 and 2 so that dword 0 holds the even-indexed pair [0, 2] and dword 1 the odd-indexed pair [1, 3]
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // move pairs of even/odd-indexed elements into one register each
+ // tmp = x[1, 3], y[1, 3], 0s... (insertps copies dword 1 of x into dword 0 of the y copy and zeroes the upper half)
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0, 2], y[0, 2], 0s... (insertps copies dword 0 of y into dword 1 of x and zeroes the upper half)
+ code.insertps(x, y, 0b00011100);
+ code.pmaxuw(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else { // no SSE4.1 (pmaxuw/insertps unavailable): SSE2 shuffles plus saturating arithmetic
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ code.punpcklwd(tmp0, tmp1); // tmp0 = x0,y0,x1,y1,x2,y2,x3,y3
+ code.pshufd(tmp1, tmp0, 232); // 0b11'10'10'00: low qword = x0,y0,x2,y2
+ code.pshuflw(tmp1, tmp1, 216); // 0b11'01'10'00: low qword = x0,x2,y0,y2 (even-indexed elements)
+ code.pshufd(tmp0, tmp0, 231); // 0b11'10'01'11: low qword = x3,y3,x1,y1
+ code.pshuflw(tmp0, tmp0, 114); // 0b01'11'00'10: low qword = x1,x3,y1,y3 (odd-indexed elements)
+ code.psubusw(tmp0, tmp1); // saturating subtract: tmp0 - tmp1 where tmp0 is larger, otherwise 0
+ code.paddw(tmp0, tmp1); // adding tmp1 back yields the unsigned max of the two
+ code.movq(tmp0, tmp0); // clear the upper 64 bits of the result
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxLowerU32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // pairwise unsigned 32-bit max over the low two dwords of each operand
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pmaxud(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else { // no SSE4.1 (pmaxud/insertps unavailable): sign-biased compare-and-blend with SSE2
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1); // tmp0 = x0,y0,x1,y1
+ code.pshufd(tmp1, tmp0, 238); // 0b11'10'11'10: tmp1 = x1,y1,x1,y1
+ code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000)); // sign bit of every 32-bit lane
+ code.movdqa(tmp3, tmp0);
+ code.pxor(tmp3, tmp2); // flip sign bits so the signed compare below orders values as unsigned
+ code.pxor(tmp2, tmp1);
+ code.pcmpgtd(tmp3, tmp2); // tmp3 = (tmp0 > tmp1) as unsigned dwords
+ code.pand(tmp0, tmp3); // keep tmp0 where it is the larger value
+ code.pandn(tmp3, tmp1); // keep tmp1 everywhere else
+ code.por(tmp3, tmp0);
+ code.movq(tmp0, tmp3); // copy the low 64 bits of the blend and clear the upper 64 bits
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
- return;
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min);
+ } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+ // The shared helper deinterleaves with pshufb, so it is only usable from SSSE3 upwards.
+ // pminsb needs SSE4.1; blend with a signed compare mask instead.
+ EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& a, const auto& b) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, b);
+ code.pcmpgtb(c, a); // c = (b > a) per byte
+ code.pand(a, c); // keep a where it is the smaller value
+ code.pandn(c, b); // keep b everywhere else
+ code.por(a, c);
});
- return;
+ } else {
+ // SSE2-only host: split even/odd bytes with mask/shift/pack (no pshufb available).
+ // Every word is in 0..255 before packing, so packuswb never saturates.
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
+ code.punpcklqdq(x, y); // x = [low qword of x | low qword of y]
+ code.movdqa(y, x);
+ code.pand(x, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.packuswb(x, x); // low qword of x = even-indexed bytes
+ code.psrlw(y, 8);
+ code.packuswb(y, y); // low qword of y = odd-indexed bytes
+ code.movdqa(mask, y);
+ code.pcmpgtb(mask, x); // mask = (y > x) per byte
+ code.pand(x, mask);
+ code.pandn(mask, y);
+ code.por(x, mask);
+ code.movq(x, x); // packuswb duplicated the data into the upper qword; clear it
+ ctx.reg_alloc.DefineValue(code, inst, x);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap 16-bit elements 1 and 2 so that the low dword holds the even-indexed pair [0, 2] and the next dword holds the odd-indexed pair [1, 3]
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // gather the odd-indexed pairs in tmp and the even-indexed pairs in x
+ // tmp = x[1, 3], y[1, 3], 0s...
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0, 2], y[0, 2], 0s...
+ code.insertps(x, y, 0b00011100);
+ code.pminsw(x, tmp); // x = min(x0,x1), min(x2,x3), min(y0,y1), min(y2,y3), 0s...
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ code.punpcklwd(tmp0, tmp1); // with x = args[0], y = args[1]: tmp0 = x0, y0, x1, y1, x2, y2, x3, y3
+ code.pshufd(tmp1, tmp0, 231); // tmp1 low half = x3, y3, x1, y1
+ code.pshuflw(tmp1, tmp1, 114); // tmp1 low half = x1, x3, y1, y3
+ code.pshufd(tmp0, tmp0, 232); // tmp0 low half = x0, y0, x2, y2
+ code.pshuflw(tmp0, tmp0, 216); // tmp0 low half = x0, x2, y0, y2
+ code.pminsw(tmp0, tmp1); // pairwise signed min of the even/odd halves
+ code.movq(tmp0, tmp0); // clear the upper 64 bits of the result
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerS32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pminsd(x, tmp); // x = min(x0,x1), min(y0,y1), 0, 0
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1); // with x = args[0], y = args[1]: tmp0 = x0, y0, x1, y1
+ code.pshufd(tmp1, tmp0, 238); // tmp1 = x1, y1, x1, y1
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1); // tmp2 = mask where tmp0 > tmp1 (signed)
+ code.pand(tmp1, tmp2); // keep tmp1 where it is the smaller value
+ code.pandn(tmp2, tmp0); // keep tmp0 where tmp0 <= tmp1
+ code.por(tmp2, tmp1); // tmp2 = per-lane signed min
+ code.movq(tmp0, tmp2); // copy the low 64 bits and clear the upper 64 bits
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) {
@@ -3104,50 +3321,91 @@ void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorPairedMinLowerU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap 16-bit elements 1 and 2 so that the low dword holds the even-indexed pair [0, 2] and the next dword holds the odd-indexed pair [1, 3]
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // gather the odd-indexed pairs in tmp and the even-indexed pairs in x
+ // tmp = x[1, 3], y[1, 3], 0s...
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0, 2], y[0, 2], 0s...
+ code.insertps(x, y, 0b00011100);
+ code.pminuw(x, tmp); // x = min(x0,x1), min(x2,x3), min(y0,y1), min(y2,y3), 0s...
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpcklwd(tmp0, tmp1); // with x = args[0], y = args[1]: tmp0 = x0, y0, x1, y1, x2, y2, x3, y3
+ code.pshufd(tmp1, tmp0, 231); // tmp1 low half = x3, y3, x1, y1
+ code.pshuflw(tmp1, tmp1, 114); // tmp1 low half = x1, x3, y1, y3
+ code.pshufd(tmp0, tmp0, 232); // tmp0 low half = x0, y0, x2, y2
+ code.pshuflw(tmp0, tmp0, 216); // tmp0 low half = x0, x2, y0, y2
+ code.movdqa(tmp2, tmp1);
+ code.psubusw(tmp2, tmp0); // tmp2 = saturating (tmp1 - tmp0), i.e. 0 where tmp1 <= tmp0
+ code.psubw(tmp1, tmp2); // tmp1 - max(tmp1 - tmp0, 0) = unsigned min(tmp0, tmp1)
+ code.movq(tmp0, tmp1); // copy the low 64 bits and clear the upper 64 bits
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerU32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pminud(x, tmp); // x = min(x0,x1), min(y0,y1), 0, 0
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1); // with x = args[0], y = args[1]: tmp0 = x0, y0, x1, y1
+ code.pshufd(tmp1, tmp0, 238); // tmp1 = x1, y1, x1, y1
+ code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000)); // sign bit of every 32-bit lane
+ code.movdqa(tmp3, tmp0);
+ code.pxor(tmp3, tmp2); // flip sign bits so the signed compare below orders values as unsigned
+ code.pxor(tmp2, tmp1);
+ code.pcmpgtd(tmp3, tmp2); // tmp3 = mask where tmp0 > tmp1 (unsigned)
+ code.pand(tmp1, tmp3); // keep tmp1 where it is the smaller value
+ code.pandn(tmp3, tmp0); // keep tmp0 where tmp0 <= tmp1
+ code.por(tmp3, tmp1); // tmp3 = per-lane unsigned min
+ code.movq(tmp0, tmp3); // copy the low 64 bits and clear the upper 64 bits
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- LowerPairedMin(result, a, b);
- });
}
template
static D PolynomialMultiply(T lhs, T rhs) {
constexpr size_t bit_size = mcl::bitsizeof;
const std::bitset operand(lhs);
-
D res = 0;
- for (size_t i = 0; i < bit_size; i++) {
- if (operand[i]) {
+ for (size_t i = 0; i < bit_size; i++) // walk every bit position of lhs
+ if (operand[i]) // bit i of lhs is set: XOR in rhs shifted left by i (addition without carries)
res ^= rhs << i;
- }
- }
-
return res;
}
void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const alternate = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32();
Xbyak::Label loop;
@@ -3185,11 +3443,11 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const alternate = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32();
Xbyak::Label loop;
@@ -3231,8 +3489,8 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::PCLMULQDQ)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
code.pclmulqdq(xmm_a, xmm_b, 0x00);
@@ -3262,7 +3520,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* ins
void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::AVX512BITALG)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpopcntb(data, data);
@@ -3273,10 +3531,10 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm high_a = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const high_a = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(high_a, low_a);
code.psrlw(high_a, 4);
@@ -3305,12 +3563,12 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::GFNI)) {
code.gf2p8affineqb(data, code.Const(xword, 0x8040201008040201, 0x8040201008040201), 0);
} else {
- const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm(code);
+ auto const high_nibble_reg = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(high_nibble_reg, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
code.pand(high_nibble_reg, data);
code.pxor(data, high_nibble_reg);
@@ -3318,7 +3576,7 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
// High lookup
- const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm(code);
+ auto const high_reversed_reg = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(high_reversed_reg, code.Const(xword, 0xE060A020C0408000, 0xF070B030D0509010));
code.pshufb(high_reversed_reg, data);
@@ -3352,8 +3610,8 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, data);
code.psllw(tmp, 8);
@@ -3365,13 +3623,13 @@ void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpshufb(data, data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b));
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pshufb(data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b));
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, data);
code.psllw(tmp, 8);
code.psrlw(data, 8);
@@ -3384,7 +3642,7 @@ void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(data, data, 0b10110001);
code.pshufhw(data, data, 0b10110001);
ctx.reg_alloc.DefineValue(code, inst, data);
@@ -3392,13 +3650,13 @@ void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst
void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpshufb(data, data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f));
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pshufb(data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f));
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, data);
code.psllw(tmp, 8);
code.psrlw(data, 8);
@@ -3412,7 +3670,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(data, data, 0b00011011);
code.pshufhw(data, data, 0b00011011);
@@ -3423,7 +3681,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst
void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(data, data, 0b01001110);
code.pshufhw(data, data, 0b01001110);
@@ -3434,8 +3692,8 @@ void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst
void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
// Add upper elements to lower elements
code.pshufd(temp, data, 0b01'00'11'10);
@@ -3455,8 +3713,8 @@ void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pxor(temp, temp);
@@ -3486,8 +3744,8 @@ void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
// Add upper elements to lower elements(reversed)
code.pshufd(temp, data, 0b00'01'10'11);
@@ -3510,8 +3768,8 @@ void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
// Add upper elements to lower elements
code.pshufd(temp, data, 0b01'00'11'10);
@@ -3526,8 +3784,8 @@ void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const operand = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
const u8 shift_amount = args[1].GetImmediateU8();
ASSERT(shift_amount % 32 == 0);
const u8 shuffle_imm = std::rotr(0b11100100, shift_amount / 32 * 2);
@@ -3540,12 +3798,12 @@ void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst)
static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
switch (esize) {
case 8: {
- const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm(code);
+ auto const vec_128 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(vec_128, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
code.paddb(a, vec_128);
@@ -3555,7 +3813,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
break;
}
case 16: {
- const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm(code);
+ auto const vec_32768 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(vec_32768, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
code.paddw(a, vec_32768);
@@ -3565,7 +3823,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
break;
}
case 32: {
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp1, a);
code.por(a, b);
@@ -3605,9 +3863,9 @@ static void EmitVectorRoundingHalvingAddUnsigned(size_t esize, EmitContext& ctx,
case 32: {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp1, a);
@@ -3671,18 +3929,18 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
static_assert(esize == 32 || esize == 64);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
// positive values of b are left shifts, while negative values are (positive) rounding right shifts
// only the lowest byte of each element is read as the shift amount
// conveniently, the behavior of bit shifts greater than element width is the same in NEON and SSE/AVX - filled with zeros
- const Xbyak::Xmm shift_amount = ctx.reg_alloc.ScratchXmm(code);
+ auto const shift_amount = ctx.reg_alloc.ScratchXmm(code);
code.vpabsb(shift_amount, b);
code.vpand(shift_amount, shift_amount, code.BConst(xword, 0xFF));
// if b is positive, do a normal left shift
- const Xbyak::Xmm left_shift = ctx.reg_alloc.ScratchXmm(code);
+ auto const left_shift = ctx.reg_alloc.ScratchXmm(code);
ICODE(vpsllv)(left_shift, a, shift_amount);
// if b is negative, compute the rounding right shift
@@ -3693,7 +3951,7 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
// tmp = (a >> (b - 1)) & 1
// res = (a >> b) + tmp
// to add the value of the last bit to be shifted off to the result of the right shift
- const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
+ auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.BConst(xword, 1));
// find value of last bit to be shifted off
@@ -3777,12 +4035,12 @@ void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pmovsxbw(a, a);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.pxor(result, result);
code.punpcklbw(result, a);
code.psraw(result, 8);
@@ -3793,12 +4051,12 @@ void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pmovsxwd(a, a);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.pxor(result, result);
code.punpcklwd(result, a);
code.psrad(result, 16);
@@ -3808,12 +4066,12 @@ void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovsxdq(a, a);
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movaps(tmp, a);
code.psrad(tmp, 31);
@@ -3826,7 +4084,7 @@ void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Reg64 gpr_tmp = ctx.reg_alloc.ScratchGpr(code);
code.movq(gpr_tmp, data);
@@ -3835,7 +4093,7 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pinsrq(data, gpr_tmp, 1);
} else {
- const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
code.movq(xmm_tmp, gpr_tmp);
code.punpcklqdq(data, xmm_tmp);
@@ -3846,9 +4104,9 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorSignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
// only signed 16-bit min/max are available below SSE4.1
if (code.HasHostFeature(HostFeature::SSE41) || esize == 16) {
@@ -3914,11 +4172,11 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
if (upper_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhw(result, x, y);
} else {
@@ -3930,7 +4188,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
}
if (lower_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmullw(result, x, y);
} else {
@@ -3948,9 +4206,9 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(result, x, y);
@@ -3959,16 +4217,16 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (lower_inst) {
- const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(lower_result, x, y);
ctx.reg_alloc.DefineValue(code, lower_inst, lower_result);
}
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmuldq(result, x, y);
code.vpsrlq(x, x, 32);
@@ -3980,12 +4238,12 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
return;
}
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const sign_correction = ctx.reg_alloc.ScratchXmm(code);
+ auto const upper_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
// calculate sign correction
code.movdqa(tmp, x);
@@ -4028,7 +4286,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32();
// SSE absolute value functions return an unsigned result
@@ -4040,21 +4298,34 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
// or shift in sign bits to create a mask of (msb == 1 ? -1 : 0), then add to the result vector
switch (esize) {
case 8: {
- VectorAbs8(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsb(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pxor(temp, temp);
+ code.psubb(temp, data);
+ code.pminub(data, temp);
+ }
code.pmovmskb(bit, data);
-
code.pminub(data, code.BConst<8>(xword, 0x7F));
break;
}
case 16: {
- VectorAbs16(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsw(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pxor(temp, temp);
+ code.psubw(temp, data);
+ code.pmaxsw(data, temp);
+ }
code.pmovmskb(bit, data);
code.and_(bit, 0xAAAA); // toggle mask bits that aren't the msb of an int16 to 0
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pminuw(data, code.BConst<16>(xword, 0x7FFF));
} else {
- const Xbyak::Xmm tmp = xmm0;
+ auto const tmp = xmm0;
code.movdqa(tmp, data);
code.psraw(data, 15);
code.paddw(data, tmp);
@@ -4062,13 +4333,21 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
break;
}
case 32: {
- VectorAbs32(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsd(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(temp, data);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubd(data, temp);
+ }
code.movmskps(bit, data);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pminud(data, code.BConst<32>(xword, 0x7FFFFFFF));
} else {
- const Xbyak::Xmm tmp = xmm0;
+ auto const tmp = xmm0;
code.movdqa(tmp, data);
code.psrad(data, 31);
code.paddd(data, tmp);
@@ -4076,10 +4355,18 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
break;
}
case 64: {
- VectorAbs64(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ code.vpabsq(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pshufd(temp, data, 0b11110101);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubq(data, temp);
+ }
code.movmskpd(bit, data);
- const Xbyak::Xmm tmp = xmm0;
+ auto const tmp = xmm0;
if (code.HasHostFeature(HostFeature::SSE42)) {
// create a -1 mask if msb is set
code.pxor(tmp, tmp);
@@ -4121,13 +4408,13 @@ template
static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
code.movdqa(xmm0, y);
ctx.reg_alloc.Release(y);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
switch (bit_width) {
case 8:
@@ -4184,7 +4471,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
switch (bit_width) {
case 8:
if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqb(tmp2, tmp2);
code.pxor(tmp, tmp);
code.vpblendvb(xmm0, tmp, tmp2, xmm0);
@@ -4264,10 +4551,10 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR
template
static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const upper_tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_tmp = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhw(upper_tmp, x, y);
@@ -4286,7 +4573,7 @@ static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitC
ctx.reg_alloc.Release(x);
ctx.reg_alloc.Release(y);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
if constexpr (is_rounding) {
@@ -4336,10 +4623,10 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const odds = ctx.reg_alloc.ScratchXmm(code);
+ auto const even = ctx.reg_alloc.ScratchXmm(code);
code.vpmuldq(odds, x, y);
code.vpsrlq(x, x, 32);
@@ -4352,7 +4639,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
code.vpaddq(odds, odds, odds);
code.vpaddq(even, even, even);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if constexpr (is_rounding) {
code.vmovdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000));
@@ -4363,7 +4650,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
code.vpsrlq(result, odds, 32);
code.vblendps(result, result, even, 0b1010);
- const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
@@ -4378,11 +4665,11 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
return;
}
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const sign_correction = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
// calculate sign correction
code.movdqa(tmp, x);
@@ -4441,8 +4728,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding32(EmitContex
void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.punpcklwd(x, x);
code.punpcklwd(y, y);
@@ -4467,8 +4754,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx,
void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmovsxdq(x, x);
@@ -4519,10 +4806,10 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm(code);
+ auto const src = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const dest = ctx.reg_alloc.ScratchXmm(code);
+ auto const reconstructed = ctx.reg_alloc.ScratchXmm(code);
+ auto const sign = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(dest, src);
code.pxor(xmm0, xmm0);
@@ -4579,9 +4866,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::In
static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code);
+ auto const src = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const dest = ctx.reg_alloc.ScratchXmm(code);
+ auto const reconstructed = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(dest, src);
code.pxor(xmm0, xmm0);
@@ -4649,9 +4936,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::
static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const data = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const zero = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Address mask = [esize, &code] {
switch (esize) {
case 8:
@@ -4667,7 +4954,7 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo
}
}();
- const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) {
+ const auto vector_equality = [esize, &code](auto const& x, const auto& y) {
switch (esize) {
case 8:
code.pcmpeqb(x, y);
@@ -4812,33 +5099,23 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* i
EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft);
}
-template>
+template<typename T> // Host-side fallback: per-lane signed -> unsigned saturating left shift; returns true if any lane saturated.
static bool VectorSignedSaturatedShiftLeftUnsigned(VectorArray& dst, const VectorArray& data, u8 shift_amount) {
+ using U = std::make_unsigned_t<T>; // unsigned type of the same width as T
static_assert(std::is_signed_v, "T must be signed.");
-
bool qc_flag = false;
for (size_t i = 0; i < dst.size(); i++) {
- const T element = data[i];
- const T shift = static_cast(shift_amount);
-
- if (element == 0) {
- dst[i] = 0;
- } else if (element < 0) {
- dst[i] = 0;
- qc_flag = true;
- } else {
- const U shifted = static_cast(element) << static_cast(shift);
- const U shifted_test = shifted >> static_cast(shift);
-
- if (shifted_test != static_cast(element)) {
- dst[i] = static_cast((std::numeric_limits::max)());
- qc_flag = true;
- } else {
- dst[i] = shifted;
- }
- }
+ auto const element = data[i];
+ U const shifted = U(U(element) << U(T(shift_amount))); // must be truncated to U: for 8/16-bit lanes the shift is evaluated as int, which would keep the overflowed bits (e.g. 0x40 << 2 == 0x100) and hide the overflow
+ U const shifted_test = U(shifted >> U(T(shift_amount))); // shifting back recovers the element only when no bits were lost
+ T result = 0; // lane-width type (a plain int would truncate 64-bit lanes); stays 0 for zero and negative inputs
+ if (element > 0 && shifted_test != U(element))
+ result = T((std::numeric_limits<U>::max)()); // positive overflow: saturate to the all-ones bit pattern
+ if (element > 0 && shifted_test == U(element))
+ result = T(shifted); // in range: keep the shifted bit pattern
+ qc_flag |= element < 0 || (element > 0 && shifted_test != U(element)); // saturation happened: negative input clamped to 0, or positive overflow
+ dst[i] = result;
}
-
return qc_flag;
}
@@ -4851,7 +5128,97 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned16(EmitContext& ctx, IR:
}
void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned32(EmitContext& ctx, IR::Inst* inst) {
- EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // Emits inline code for four 32-bit lanes: signed input, left shift by an immediate, unsigned saturation; ORs 1 into the fpsr_qc byte if any lane saturates.
+ auto const imm8 = args[1].GetImmediateU8(); // shift amount, known at emit time
+ if (code.HasHostFeature(HostFeature::AVX2)) { // VEX-encoded (three-operand) path
+ auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code); // ends up holding 0 or 1: "some lane saturated"
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); // input vector x; overwritten with the result
+ if (imm8 == 0) { // no shift: result is max(x, 0) and a lane saturates iff it is negative
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpshufd(tmp1, tmp0, 85); // 85 = 0b01010101: lane 1 copied to every lane
+ code.vpshufd(tmp2, tmp0, 238); // 238 = 0b11101110: lanes (2,3,2,3), so lane 2 lands in lane 0
+ code.vpor(tmp1, tmp1, tmp2);
+ code.vpshufd(tmp2, tmp0, 255); // 255 = 0b11111111: lane 3 copied to every lane
+ code.vpor(tmp2, tmp2, tmp0);
+ code.vpor(tmp1, tmp1, tmp2); // lane 0 of tmp1 = OR of all four input lanes
+ code.vmovd(tmp_flag.cvt32(), tmp1);
+ code.shr(tmp_flag.cvt32(), 31); // sign bit of the OR: 1 iff at least one lane is negative
+ code.vpxor(tmp1, tmp1, tmp1); // tmp1 = 0
+ code.vpmaxsd(tmp0, tmp0, tmp1); // signed max with 0 clamps negative lanes to 0
+ } else {
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const cmp_value = u32(1ULL << 31) >> (imm8 - 1); // 2^(32 - imm8): smallest x for which x << imm8 no longer fits in 32 bits. NOTE(review): assumes 1 <= imm8 <= 31 -- confirm the IR guarantees this
+ code.vpshufd(tmp1, tmp0, 238); // bring lanes 2,3 down onto lanes 0,1
+ code.vpor(tmp1, tmp1, tmp0);
+ code.vpshufd(tmp2, tmp1, 85); // broadcast lane 1 of the partial OR
+ code.vpor(tmp1, tmp1, tmp2); // lane 0 of tmp1 = OR of all four input lanes
+ code.vmovd(tmp_flag.cvt32(), tmp1);
+ code.cmp(tmp_flag.cvt32(), cmp_value); // flags are read by setae below; only vector instructions are emitted in between (NOTE(review): presumes code.Const() emits no flag-writing code -- confirm)
+ code.vpslld(tmp1, tmp0, imm8); // tmp1 = x << imm8 (candidate result)
+ code.vpbroadcastd(tmp2, code.Const(dword, cmp_value - 2));
+ code.vpbroadcastd(tmp3, code.Const(dword, cmp_value - 1));
+ code.vpcmpgtd(tmp3, tmp0, tmp3); // tmp3 = all-ones where x > cmp_value - 1 (signed), else 0: the value used for lanes that are not in range
+ code.vpcmpeqd(tmp4, tmp4, tmp4); // tmp4 = all-ones (-1 per lane)
+ code.vpaddd(tmp0, tmp0, tmp4); // tmp0 = x - 1
+ code.vpminud(tmp2, tmp0, tmp2); // unsigned min(x - 1, cmp_value - 2)
+ code.vpcmpeqd(tmp0, tmp0, tmp2); // mask: (x - 1) <= cmp_value - 2 unsigned, i.e. 1 <= x < cmp_value
+ code.vblendvps(tmp0, tmp3, tmp1, tmp0); // in-range lanes take the shifted value, all other lanes take tmp3
+ code.setae(tmp_flag.cvt8()); // 1 iff OR of lanes >= cmp_value (unsigned): some lane is negative or overflows
+ }
+ code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8()); // accumulate into the QC byte of the JIT state (never clears it)
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else { // non-VEX path: same computation with two-operand SSE instructions
+ auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code); // ends up holding 0 or 1: "some lane saturated"
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]); // input vector x; overwritten with the result
+ if (imm8 == 0) { // no shift: result is max(x, 0) and a lane saturates iff it is negative
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.pshufd(tmp1, tmp0, 85); // lane 1 copied to every lane
+ code.pshufd(tmp2, tmp0, 238); // lanes (2,3,2,3)
+ code.por(tmp2, tmp1);
+ code.pshufd(tmp1, tmp0, 255); // lane 3 copied to every lane
+ code.por(tmp1, tmp0);
+ code.por(tmp1, tmp2); // lane 0 of tmp1 = OR of all four input lanes
+ code.movd(tmp_flag.cvt32(), tmp1);
+ code.shr(tmp_flag.cvt32(), 31); // sign bit of the OR: 1 iff at least one lane is negative
+ code.pxor(tmp1, tmp1); // tmp1 = 0
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1); // mask of lanes with x > 0
+ code.pand(tmp0, tmp2); // keep positive lanes, zero the rest
+ } else {
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ u64 const cmp_value = u64(1ULL << 31) >> (imm8 - 1); // 2^(32 - imm8): smallest x for which x << imm8 no longer fits in 32 bits. NOTE(review): assumes 1 <= imm8 <= 31 -- confirm
+ u64 const cmp_one = cmp_value - 1; // largest x that still fits after the shift
+ u64 const cmp_add = (cmp_value - 2) + 0x80000000; // (cmp_value - 2) with the sign bit flipped, so a signed compare acts as an unsigned one
+ code.pshufd(tmp1, tmp0, 238); // bring lanes 2,3 down onto lanes 0,1
+ code.por(tmp1, tmp0);
+ code.pshufd(tmp2, tmp1, 85); // broadcast lane 1 of the partial OR
+ code.por(tmp2, tmp1); // lane 0 of tmp2 = OR of all four input lanes
+ code.movd(tmp_flag.cvt32(), tmp2);
+ code.cmp(tmp_flag.cvt32(), cmp_value); // flags are read by setae below; only vector instructions are emitted in between (NOTE(review): presumes code.Const() emits no flag-writing code -- confirm)
+ code.movdqa(tmp1, tmp0);
+ code.pslld(tmp1, imm8); // tmp1 = x << imm8 (candidate result)
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, code.Const(xword, cmp_one | (cmp_one << 32), cmp_one | (cmp_one << 32))); // tmp2 = all-ones where x > cmp_one (signed): positive overflow, which is also the saturated value
+ code.pcmpeqd(tmp3, tmp3); // tmp3 = all-ones (-1 per lane)
+ code.paddd(tmp0, tmp3); // tmp0 = x - 1
+ code.pxor(tmp0, code.Const(xword, 0x80000000'80000000, 0x80000000'80000000)); // flip sign bits to match the bias in cmp_add
+ code.pcmpgtd(tmp0, code.Const(xword, cmp_add | (cmp_add << 32), cmp_add | (cmp_add << 32))); // mask: (x - 1) > cmp_value - 2 unsigned, i.e. x is 0, negative, or overflows
+ code.pand(tmp2, tmp0); // saturated value only in out-of-range lanes
+ code.pandn(tmp0, tmp1); // shifted value only in in-range lanes
+ code.por(tmp0, tmp2); // merge both selections into the result
+ code.setae(tmp_flag.cvt8()); // 1 iff OR of lanes >= cmp_value (unsigned): some lane is negative or overflows
+ }
+ code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8()); // accumulate into the QC byte of the JIT state (never clears it)
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+// Former host-call fallback, kept for reference: EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned);
+ }
}
void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned64(EmitContext& ctx, IR::Inst* inst) {
@@ -4889,7 +5256,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
const bool is_defaults_zero = inst->GetArg(0).IsZero();
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI)) {
- const Xbyak::Xmm indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]);
const u64 index_count = mcl::bit::replicate_element(static_cast(table_size * 8));
@@ -4897,43 +5264,43 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
switch (table_size) {
case 1: {
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
if (is_defaults_zero) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpermb(result | k1 | T_z, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermb(result | k1, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
}
break;
}
case 2: {
- const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
if (is_defaults_zero) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpermb(result | k1 | T_z, indicies, xmm0);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermb(result | k1, indicies, xmm0);
ctx.reg_alloc.DefineValue(code, inst, result);
}
break;
}
case 3: {
- const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]);
+ auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]);
code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
if (is_defaults_zero) {
code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermi2b(indicies, xmm0, xmm_table1);
code.vmovdqu8(result | k1, indicies);
ctx.reg_alloc.DefineValue(code, inst, result);
@@ -4941,17 +5308,17 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
break;
}
case 4: {
- const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
- const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+ auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
code.vpunpcklqdq(xmm_table1, xmm_table1, xmm_table1_upper);
if (is_defaults_zero) {
code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermi2b(indicies, xmm0, xmm_table1);
code.vmovdqu8(result | k1, indicies);
ctx.reg_alloc.DefineValue(code, inst, result);
@@ -4974,9 +5341,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
};
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.xorps(result, result);
code.movsd(result, xmm_table0);
@@ -4988,9 +5355,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
@@ -5001,12 +5368,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSE41) && table_size <= 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
if (table_size == 2) {
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
@@ -5025,12 +5392,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
{
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
@@ -5039,7 +5406,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
code.punpcklqdq(xmm_table1, xmm0);
} else {
ASSERT(table_size == 4);
- const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
code.punpcklqdq(xmm_table1, xmm_table1_upper);
ctx.reg_alloc.Release(xmm_table1_upper);
}
@@ -5060,18 +5427,18 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
{
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
if (table_size == 4) {
- const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
code.punpcklqdq(xmm_table1, xmm_table1_upper);
ctx.reg_alloc.Release(xmm_table1_upper);
}
@@ -5100,37 +5467,31 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
const u32 stack_space = static_cast(6 * 8);
ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
+ auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]);
code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value);
ctx.reg_alloc.Release(table_value);
}
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
-
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]);
code.mov(code.ABI_PARAM4.cvt32(), table_size);
code.movq(qword[code.ABI_PARAM2], defaults);
code.movq(qword[code.ABI_PARAM3], indicies);
-
- code.CallLambda(
- [](const HalfVectorArray* table, HalfVectorArray& result, const HalfVectorArray& indicies, size_t table_size) {
- for (size_t i = 0; i < result.size(); ++i) {
- const size_t index = indicies[i] / table[0].size();
- const size_t elem = indicies[i] % table[0].size();
- if (index < table_size) {
- result[i] = table[index][elem];
- }
- }
- });
-
+ code.CallLambda([](const HalfVectorArray* table, HalfVectorArray& result, const HalfVectorArray& indicies, size_t table_size) {
+ for (size_t i = 0; i < result.size(); ++i) {
+ const size_t index = indicies[i] / table[0].size();
+ const size_t elem = indicies[i] % table[0].size();
+ if (index < table_size)
+ result[i] = table[index][elem];
+ }
+ });
code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]);
ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
-
ctx.reg_alloc.DefineValue(code, inst, result);
}
@@ -5144,14 +5505,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector;
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
code.vpcmpub(k2, indicies, code.BConst<8>(xword, 4 * 16), CmpInt::LessThan);
// Handle vector-table 0,1
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
@@ -5159,8 +5520,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(xmm_table1);
// Handle vector-table 2,3
- const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
- const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
+ auto const xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]);
code.kandnw(k1, k1, k2);
code.vpermi2b(indicies | k1, xmm_table2, xmm_table3);
@@ -5169,19 +5530,19 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vmovdqu8(indicies | k2 | T_z, indicies);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vmovdqu8(defaults | k2, indicies);
ctx.reg_alloc.DefineValue(code, inst, defaults);
}
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
code.vpcmpub(k2, indicies, code.BConst<8>(xword, 3 * 16), CmpInt::LessThan);
// Handle vector-table 0,1
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
@@ -5189,7 +5550,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(xmm_table1);
// Handle vector-table 2
- const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
+ auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
code.kandnw(k1, k1, k2);
code.vpermb(indicies | k1, indicies, xmm_table2);
@@ -5198,14 +5559,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vmovdqu8(indicies | k2 | T_z, indicies);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vmovdqu8(defaults | k2, indicies);
ctx.reg_alloc.DefineValue(code, inst, defaults);
}
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
@@ -5213,36 +5574,36 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vpermi2b(indicies | k1 | T_z, xmm_table0, xmm_table1);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermi2b(indicies, xmm_table0, xmm_table1);
code.vmovdqu8(result | k1, indicies);
ctx.reg_alloc.DefineValue(code, inst, result);
}
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan);
if (is_defaults_zero) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpermb(result | k1 | T_z, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermb(result | k1, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
}
} else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
code.pshufb(xmm_table0, indicies);
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
} else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
@@ -5255,9 +5616,9 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
} else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
@@ -5273,14 +5634,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
return;
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const masked = ctx.reg_alloc.ScratchXmm(code);
code.vpandd(masked, indicies, code.Const(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
+ auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
const Xbyak::Opmask table_mask = k1;
const u64 table_index = mcl::bit::replicate_element(i * 16);
@@ -5297,15 +5658,15 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, result);
} else if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const masked = ctx.reg_alloc.ScratchXmm(code);
code.movaps(masked, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
code.pand(masked, indicies);
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
+ auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
const u64 table_index = mcl::bit::replicate_element(i * 16);
@@ -5329,13 +5690,13 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
const u32 stack_space = static_cast((table_size + 2) * 16);
ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
+ auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]);
code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
ctx.reg_alloc.Release(table_value);
}
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
@@ -5362,8 +5723,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
if (!part) {
@@ -5381,8 +5742,8 @@ void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
if (!part) {
@@ -5400,8 +5761,8 @@ void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
code.shufps(lower, upper, !part ? 0b10001000 : 0b11011101);
@@ -5413,8 +5774,8 @@ void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
code.shufpd(lower, upper, !part ? 0b00 : 0b11);
@@ -5422,89 +5783,87 @@ void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, lower);
}
-static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+
+void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
-
- switch (esize) {
- case 8: {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpminub(tmp2, tmp0, tmp1);
+ code.vpmaxub(tmp0, tmp0, tmp1);
+ code.vpsubb(tmp0, tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.movdqa(temp, x);
code.psubusb(temp, y);
code.psubusb(y, x);
code.por(temp, y);
- break;
+ ctx.reg_alloc.DefineValue(code, inst, temp);
}
- case 16: {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+}
+void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpminuw(tmp2, tmp0, tmp1);
+ code.vpmaxuw(tmp0, tmp0, tmp1);
+ code.vpsubw(tmp0, tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.movdqa(temp, x);
code.psubusw(temp, y);
code.psubusw(y, x);
code.por(temp, y);
- break;
+ ctx.reg_alloc.DefineValue(code, inst, temp);
}
- case 32:
- // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267
- if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-
- code.movdqa(temp, x);
- code.pminud(x, y);
- code.pmaxud(temp, y);
- code.psubd(temp, x);
- } else {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- if (ctx.HasOptimization(OptimizationFlag::CodeSpeed)) {
- // About 45 bytes
- const Xbyak::Xmm temp_x = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm temp_y = ctx.reg_alloc.ScratchXmm(code);
- code.pcmpeqd(temp, temp);
- code.pslld(temp, 31);
- code.movdqa(temp_x, x);
- code.movdqa(temp_y, y);
- code.paddd(temp_x, x);
- code.paddd(temp_y, y);
- code.pcmpgtd(temp_y, temp_x);
- code.psubd(x, y);
- code.pandn(temp, temp_y);
- code.pxor(x, y);
- code.psubd(x, y);
- } else {
- // Smaller code size - about 36 bytes
- code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
- code.pxor(x, temp);
- code.pxor(y, temp);
- code.movdqa(temp, x);
- code.psubd(temp, y);
- code.pcmpgtd(y, x);
- code.psrld(y, 1);
- code.pxor(temp, y);
- code.psubd(temp, y);
- }
- }
- break;
- }
-
- ctx.reg_alloc.DefineValue(code, inst, temp);
-}
-
-void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
- EmitVectorUnsignedAbsoluteDifference(8, ctx, inst, code);
-}
-
-void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {
- EmitVectorUnsignedAbsoluteDifference(16, ctx, inst, code);
}
void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) {
- EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst); // per-lane |args[0] - args[1]| on unsigned 32-bit lanes
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpminud(tmp2, tmp0, tmp1); // tmp2 = min(x, y)
+ code.vpmaxud(tmp0, tmp0, tmp1); // tmp0 = max(x, y)
+ code.vpsubd(tmp0, tmp0, tmp2); // tmp0 = max - min
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else if (code.HasHostFeature(HostFeature::SSE41)) {
+ // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.movdqa(temp, x);
+ code.pminud(x, y);
+ code.pmaxud(temp, y);
+ code.psubd(temp, x);
+ ctx.reg_alloc.DefineValue(code, inst, temp);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]); // only read below, so no scratch copy is needed
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000'0000'8000'0000, 0x8000'0000'8000'0000)); // sign bit of every 32-bit lane
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2); // tmp3 = y with the sign bit flipped
+ code.pxor(tmp2, tmp0); // tmp2 = x with the sign bit flipped
+ code.pcmpgtd(tmp2, tmp3); // signed compare of the flipped values: mask = all-ones where x > y (unsigned)
+ code.psubd(tmp0, tmp1); // tmp0 = x - y
+ code.pxor(tmp0, tmp2); // tmp0 = (x - y) ^ mask
+ code.psubd(tmp2, tmp0); // tmp2 = mask - ((x - y) ^ mask)
+ // mask == ~0 gives x - y, mask == 0 gives y - x; the result already lives in tmp2, so no extra move is emitted.
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
+ }
}
void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
@@ -5512,11 +5871,11 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
if (upper_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhuw(result, x, y);
} else {
@@ -5528,7 +5887,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
}
if (lower_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmullw(result, x, y);
} else {
@@ -5546,24 +5905,24 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(result, x, y);
ctx.reg_alloc.DefineValue(code, lower_inst, result);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (lower_inst) {
- const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(lower_result, x, y);
ctx.reg_alloc.DefineValue(code, lower_inst, lower_result);
}
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmuludq(result, x, y);
code.vpsrlq(x, x, 32);
@@ -5573,11 +5932,11 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, upper_inst, result);
} else {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
- const Xbyak::Xmm lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
+ auto const lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
// calculate unsigned multiply
code.movdqa(tmp, x);
@@ -5794,11 +6153,11 @@ void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxbw(a, a);
} else {
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpcklbw(a, zeros);
}
@@ -5807,11 +6166,11 @@ void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxwd(a, a);
} else {
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpcklwd(a, zeros);
}
@@ -5820,11 +6179,11 @@ void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxdq(a, a);
} else {
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpckldq(a, zeros);
}
@@ -5833,8 +6192,8 @@ void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpcklqdq(a, zeros);
ctx.reg_alloc.DefineValue(code, inst, a);
@@ -5842,7 +6201,7 @@ void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.movq(a, a); // TODO: !IsLastUse
@@ -5850,7 +6209,7 @@ void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitZeroVector(EmitContext& ctx, IR::Inst* inst) {
- const Xbyak::Xmm a = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.ScratchXmm(code);
code.pxor(a, a);
ctx.reg_alloc.DefineValue(code, inst, a);
}
diff --git a/src/dynarmic/tests/A64/a64.cpp b/src/dynarmic/tests/A64/a64.cpp
index d331c5e8a1..d4fa3544ae 100644
--- a/src/dynarmic/tests/A64/a64.cpp
+++ b/src/dynarmic/tests/A64/a64.cpp
@@ -415,6 +415,105 @@ TEST_CASE("A64: URSHL", "[a64]") {
CHECK(jit.GetVector(9) == Vector{0x0000000000000002, 0x12db8b8280e0ba});
}
+TEST_CASE("A64: SQSHLU", "[a64]") {
+ A64TestEnv env;
+ A64::UserConfig jit_user_config{};
+ jit_user_config.callbacks = &env;
+ A64::Jit jit{jit_user_config};
+
+ oaknut::VectorCodeGenerator code{env.code_mem, nullptr};
+ code.SQSHLU(V8.B16(), V0.B16(), 1);
+ code.SQSHLU(V9.H8(), V1.H8(), 2);
+ code.SQSHLU(V10.S4(), V2.S4(), 28);
+ code.SQSHLU(V11.D2(), V3.D2(), 4);
+ code.SQSHLU(V12.S4(), V0.S4(), 1);
+ code.SQSHLU(V13.S4(), V1.S4(), 3);
+ code.SQSHLU(V14.S4(), V2.S4(), 0);
+ code.SQSHLU(V15.S4(), V3.S4(), 0);
+
+ jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});
+ jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
+ jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f});
+ jit.SetVector(3, Vector{0xffffffffffffffff, 0x96dc5c140705cd04});
+
+ env.ticks_left = env.code_mem.size();
+ CheckedRun([&]() { jit.Run(); });
+
+ CHECK(jit.GetVector(8) == Vector{0x3000d4d4, 0xfe0000000076009e});
+ CHECK(jit.GetVector(9) == Vector{0x2c0000003c, 0});
+ CHECK(jit.GetVector(10) == Vector{0x10000000'ffffffff, 0xffffffff'ffffffff});
+ CHECK(jit.GetVector(11) == Vector{0, 0});
+ CHECK(jit.GetVector(12) == Vector{0x3174d4d4, 0xfffffffe00000000});
+ CHECK(jit.GetVector(13) == Vector{0x5800000078, 0});
+ CHECK(jit.GetVector(14) == Vector{0x1000000ff, 0x100000007f});
+ CHECK(jit.GetVector(15) == Vector{0, 0x705cd04});
+}
+
+TEST_CASE("A64: SMIN", "[a64]") {
+ A64TestEnv env;
+ A64::UserConfig jit_user_config{};
+ jit_user_config.callbacks = &env;
+ A64::Jit jit{jit_user_config};
+
+ oaknut::VectorCodeGenerator code{env.code_mem, nullptr};
+ code.SMIN(V8.B16(), V0.B16(), V3.B16());
+ code.SMIN(V9.H8(), V1.H8(), V2.H8());
+ code.SMIN(V10.S4(), V2.S4(), V3.S4());
+ code.SMIN(V11.S4(), V3.S4(), V3.S4());
+ code.SMIN(V12.S4(), V0.S4(), V3.S4());
+ code.SMIN(V13.S4(), V1.S4(), V2.S4());
+ code.SMIN(V14.S4(), V2.S4(), V1.S4());
+ code.SMIN(V15.S4(), V3.S4(), V0.S4());
+
+ jit.SetPC(0);
+ jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});
+ jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
+ jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f});
+ jit.SetVector(3, Vector{0xffffffff'ffffffff, 0x96dc5c14'0705cd04});
+
+ env.ticks_left = env.code_mem.size(); // sized from the emitted code, same expression as the SQSHLU test
+ CheckedRun([&]() { jit.Run(); });
+
+ REQUIRE(jit.GetVector(8) == Vector{0xffffffffffbaffff, 0x96dcffff94059504});
+ REQUIRE(jit.GetVector(9) == Vector{0x10000000f, 0xffffffffffffffff});
+ REQUIRE(jit.GetVector(10) == Vector{0xffffffffffffffff, 0x96dc5c140000007f});
+}
+
+TEST_CASE("A64: SMINP", "[a64]") {
+ A64TestEnv env;
+ A64::UserConfig jit_user_config{};
+ jit_user_config.callbacks = &env;
+ A64::Jit jit{jit_user_config};
+
+ oaknut::VectorCodeGenerator code{env.code_mem, nullptr};
+ code.SMINP(V8.B16(), V0.B16(), V3.B16());
+ code.SMINP(V9.H8(), V1.H8(), V2.H8());
+ code.SMINP(V10.S4(), V2.S4(), V1.S4());
+ code.SMINP(V11.S4(), V3.S4(), V3.S4());
+ code.SMINP(V12.S4(), V0.S4(), V3.S4());
+ code.SMINP(V13.S4(), V1.S4(), V2.S4());
+ code.SMINP(V14.S4(), V2.S4(), V1.S4());
+ code.SMINP(V15.S4(), V3.S4(), V0.S4());
+
+ jit.SetPC(0);
+ jit.SetVector(0, Vector{0xffffffff'18ba6a6a, 0x7fffffff'943b954f});
+ jit.SetVector(1, Vector{0x0000000b'0000000f, 0xffffffff'ffffffff});
+ jit.SetVector(2, Vector{0x00000001'000000ff, 0x00000010'0000007f});
+ jit.SetVector(3, Vector{0xffffffff'ffffffff, 0x96dc5c14'0705cd04});
+
+ env.ticks_left = env.code_mem.size(); // sized from the emitted code; all eight results V8-V15 are asserted below
+ CheckedRun([&]() { jit.Run(); });
+
+ REQUIRE(jit.GetVector(8) == Vector{0xffff9495ffffba6a, 0x961405cdffffffff});
+ REQUIRE(jit.GetVector(9) == Vector{0xffffffff00000000, 0});
+ REQUIRE(jit.GetVector(10) == Vector{0x1000000001, 0xffffffff0000000b});
+ REQUIRE(jit.GetVector(11) == Vector{0x96dc5c14ffffffff, 0x96dc5c14ffffffff});
+ REQUIRE(jit.GetVector(12) == Vector{0x943b954fffffffff, 0x96dc5c14ffffffff});
+ REQUIRE(jit.GetVector(13) == Vector{0xffffffff0000000b, 0x1000000001});
+ REQUIRE(jit.GetVector(14) == Vector{0x1000000001, 0xffffffff0000000b});
+ REQUIRE(jit.GetVector(15) == Vector{0x96dc5c14ffffffff, 0x943b954fffffffff});
+}
+
TEST_CASE("A64: XTN", "[a64]") {
A64TestEnv env;
A64::UserConfig jit_user_config{};
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index e48f294a5a..6d9ebd6296 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -315,15 +315,10 @@ void Maxwell3D::ConsumeSinkImpl() {
}
void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
- if (regs.reg_array[method] != argument) {
- regs.reg_array[method] = argument;
- auto const& table0 = dirty.tables[0];
- auto const& table1 = dirty.tables[1];
- u8 const flag0 = table0[method];
- u8 const flag1 = table1[method];
- dirty.flags[flag0] = true;
- if (flag1 != flag0)
- dirty.flags[flag1] = true;
+ regs.reg_array[method] = argument;
+
+ for (const auto& table : dirty.tables) {
+ dirty.flags[table[method]] = true;
}
}
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 947de6a80e..226619d8d6 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -270,8 +270,8 @@ u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_se
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
};
- upload_cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
- VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, 0, WRITE_BARRIER);
+ upload_cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
+ VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, WRITE_BARRIER);
upload_cmdbuf.End();
cmdbuf.End();
@@ -373,8 +373,8 @@ void Scheduler::EndRenderPass()
}
cmdbuf.EndRenderPass();
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
- VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
- VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, 0, nullptr, nullptr, vk::Span(barriers.data(), num_images));
+ VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+ 0, nullptr, nullptr, vk::Span(barriers.data(), num_images));
});
state.renderpass = VkRenderPass{};