diff --git a/.github/ISSUE_TEMPLATE/blank_issue_template.yml b/.forgejo/ISSUE_TEMPLATE/blank_issue_template.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/blank_issue_template.yml
rename to .forgejo/ISSUE_TEMPLATE/blank_issue_template.yml
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.forgejo/ISSUE_TEMPLATE/bug_report.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/bug_report.yml
rename to .forgejo/ISSUE_TEMPLATE/bug_report.yml
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.forgejo/ISSUE_TEMPLATE/config.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/config.yml
rename to .forgejo/ISSUE_TEMPLATE/config.yml
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.forgejo/ISSUE_TEMPLATE/feature_request.yml
similarity index 100%
rename from .github/ISSUE_TEMPLATE/feature_request.yml
rename to .forgejo/ISSUE_TEMPLATE/feature_request.yml
diff --git a/.github/workflows/license-header.yml b/.forgejo/workflows/license-header.yml
similarity index 100%
rename from .github/workflows/license-header.yml
rename to .forgejo/workflows/license-header.yml
diff --git a/.github/workflows/sources.yml b/.forgejo/workflows/sources.yml
similarity index 100%
rename from .github/workflows/sources.yml
rename to .forgejo/workflows/sources.yml
diff --git a/.github/workflows/strings.yml b/.forgejo/workflows/strings.yml
similarity index 100%
rename from .github/workflows/strings.yml
rename to .forgejo/workflows/strings.yml
diff --git a/.github/workflows/translations.yml b/.forgejo/workflows/translations.yml
similarity index 97%
rename from .github/workflows/translations.yml
rename to .forgejo/workflows/translations.yml
index 92bb1fdf5d..16ce4f1808 100644
--- a/.github/workflows/translations.yml
+++ b/.forgejo/workflows/translations.yml
@@ -3,8 +3,7 @@ name: tx-pull
on:
# monday, wednesday, saturday at 2pm
schedule:
- cron:
- - '0 14 * * 1,3,6'
+ cron: '0 14 * * 1,3,6'
workflow_dispatch:
jobs:
@@ -59,4 +58,3 @@ jobs:
-H 'Authorization: Bearer ${{ secrets.CI_FJ_TOKEN }}' \
-H 'Content-Type: application/json' \
-d "@data.json" --fail
-
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index 5b0adad8dd..947a4963ee 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -37,10 +37,10 @@ set(GIT_DESC ${BUILD_VERSION})
# Auto-updater metadata! Must somewhat mirror GitHub API endpoint
if (NIGHTLY_BUILD)
- set(BUILD_AUTO_UPDATE_WEBSITE "https://github.com")
- set(BUILD_AUTO_UPDATE_API "api.github.com")
- set(BUILD_AUTO_UPDATE_API_PATH "/repos/")
- set(BUILD_AUTO_UPDATE_REPO "Eden-CI/Nightly")
+ set(BUILD_AUTO_UPDATE_WEBSITE "https://git.eden-emu.dev")
+ set(BUILD_AUTO_UPDATE_API "git.eden-emu.dev")
+ set(BUILD_AUTO_UPDATE_API_PATH "/api/v1/repos/")
+ set(BUILD_AUTO_UPDATE_REPO "eden-ci/nightly")
set(REPO_NAME "Eden Nightly")
else()
set(BUILD_AUTO_UPDATE_WEBSITE "https://git.eden-emu.dev")
diff --git a/dist/dev.eden_emu.eden.svg b/dist/dev.eden_emu.eden.svg
index f88b52f625..7711945aa4 100644
--- a/dist/dev.eden_emu.eden.svg
+++ b/dist/dev.eden_emu.eden.svg
@@ -1,203 +1,21 @@
+
+
diff --git a/dist/eden.bmp b/dist/eden.bmp
index 888138ccf7..cffc04b308 100644
Binary files a/dist/eden.bmp and b/dist/eden.bmp differ
diff --git a/dist/eden.ico b/dist/eden.ico
index 45120ef312..106742c9ba 100644
Binary files a/dist/eden.ico and b/dist/eden.ico differ
diff --git a/dist/icon_variations/aprilfools2026.svg b/dist/icon_variations/aprilfools2026.svg
new file mode 100644
index 0000000000..7711945aa4
--- /dev/null
+++ b/dist/icon_variations/aprilfools2026.svg
@@ -0,0 +1,89 @@
+
+
+
+
diff --git a/dist/icon_variations/aprilfools2026_bgcolor b/dist/icon_variations/aprilfools2026_bgcolor
new file mode 100644
index 0000000000..fabebfa717
--- /dev/null
+++ b/dist/icon_variations/aprilfools2026_bgcolor
@@ -0,0 +1 @@
+#43fcfcff
diff --git a/dist/qt_themes/default/icons/256x256/eden.png b/dist/qt_themes/default/icons/256x256/eden.png
index 3c4bd566a1..d7286ac4c6 100644
Binary files a/dist/qt_themes/default/icons/256x256/eden.png and b/dist/qt_themes/default/icons/256x256/eden.png differ
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 00bdf10a4f..ba0545b7a7 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -62,6 +62,12 @@ endif()
# unordered_dense
AddJsonPackage(unordered-dense)
+# httplib
+if (IOS)
+ set(HTTPLIB_USE_BROTLI_IF_AVAILABLE OFF)
+endif()
+AddJsonPackage(httplib)
+
if (YUZU_STATIC_ROOM)
return()
endif()
@@ -227,9 +233,6 @@ if (VulkanMemoryAllocator_ADDED)
endif()
endif()
-# httplib
-AddJsonPackage(httplib)
-
# cpp-jwt
if (ENABLE_WEB_SERVICE OR ENABLE_UPDATE_CHECKER)
AddJsonPackage(cpp-jwt)
diff --git a/externals/cpmfile.json b/externals/cpmfile.json
index f849426a4d..03303a5896 100644
--- a/externals/cpmfile.json
+++ b/externals/cpmfile.json
@@ -36,7 +36,8 @@
"0002-fix-zstd.patch"
],
"options": [
- "HTTPLIB_REQUIRE_OPENSSL ON"
+ "HTTPLIB_REQUIRE_OPENSSL ON",
+ "HTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES ON"
]
},
"cpp-jwt": {
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt
index 2764d7eac6..44290fd4b6 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt
@@ -25,6 +25,11 @@ import android.hardware.SensorEventListener
import android.hardware.SensorManager
import android.os.Build
import android.os.Bundle
+import android.os.Handler
+import android.os.Looper
+import androidx.navigation.NavOptions
+import org.yuzu.yuzu_emu.fragments.EmulationFragment
+import org.yuzu.yuzu_emu.utils.CustomSettingsHandler
import android.util.Rational
import android.view.InputDevice
import android.view.KeyEvent
@@ -87,6 +92,28 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener, InputManager
private val emulationViewModel: EmulationViewModel by viewModels()
private var foregroundService: Intent? = null
+ private val mainHandler = Handler(Looper.getMainLooper())
+ private var pendingRomSwapIntent: Intent? = null
+ private var isWaitingForRomSwapStop = false
+ private var romSwapNativeStopped = false
+ private var romSwapThreadStopped = false
+ private var romSwapGeneration = 0
+ private var hasEmulationSession = processHasEmulationSession
+ private val romSwapStopTimeoutRunnable = Runnable { onRomSwapStopTimeout() }
+
+ private fun onRomSwapStopTimeout() {
+ if (!isWaitingForRomSwapStop) {
+ return
+ }
+ Log.warning("[EmulationActivity] ROM swap stop timed out; retrying native stop and continuing to wait")
+ NativeLibrary.stopEmulation()
+ scheduleRomSwapStopTimeout()
+ }
+
+ private fun scheduleRomSwapStopTimeout() {
+ mainHandler.removeCallbacks(romSwapStopTimeoutRunnable)
+ mainHandler.postDelayed(romSwapStopTimeoutRunnable, ROM_SWAP_STOP_TIMEOUT_MS)
+ }
override fun attachBaseContext(base: Context) {
super.attachBaseContext(YuzuApplication.applyLanguage(base))
@@ -128,9 +155,29 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener, InputManager
binding = ActivityEmulationBinding.inflate(layoutInflater)
setContentView(binding.root)
+ val launchIntent = Intent(intent)
+ val shouldDeferLaunchForSwap = hasEmulationSession && isSwapIntent(launchIntent)
+ if (shouldDeferLaunchForSwap) {
+ Log.info("[EmulationActivity] onCreate detected existing session; deferring new game setup for swap")
+ emulationViewModel.setIsEmulationStopping(true)
+ emulationViewModel.setEmulationStopped(false)
+ }
+
val navHostFragment =
supportFragmentManager.findFragmentById(R.id.fragment_container) as NavHostFragment
- navHostFragment.navController.setGraph(R.navigation.emulation_navigation, intent.extras)
+ val initialArgs = if (shouldDeferLaunchForSwap) {
+ Bundle(intent.extras ?: Bundle()).apply {
+ processSessionGame?.let { putParcelable("game", it) }
+ }
+ } else {
+ intent.extras
+ }
+ navHostFragment.navController.setGraph(R.navigation.emulation_navigation, initialArgs)
+ if (shouldDeferLaunchForSwap) {
+ mainHandler.post {
+ handleSwapIntent(launchIntent)
+ }
+ }
isActivityRecreated = savedInstanceState != null
@@ -210,6 +257,7 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener, InputManager
}
override fun onDestroy() {
+ mainHandler.removeCallbacks(romSwapStopTimeoutRunnable)
super.onDestroy()
inputManager.unregisterInputDeviceListener(this)
stopForegroundService(this)
@@ -228,17 +276,123 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener, InputManager
override fun onNewIntent(intent: Intent) {
super.onNewIntent(intent)
- setIntent(intent)
-
- // Reset navigation graph with new intent data to recreate EmulationFragment
- val navHostFragment =
- supportFragmentManager.findFragmentById(R.id.fragment_container) as NavHostFragment
- navHostFragment.navController.setGraph(R.navigation.emulation_navigation, intent.extras)
-
+ handleSwapIntent(intent)
nfcReader.onNewIntent(intent)
InputHandler.updateControllerData()
}
+ private fun isSwapIntent(intent: Intent): Boolean {
+ return when {
+ intent.getBooleanExtra(EXTRA_OVERLAY_GAMELESS_EDIT_MODE, false) -> false
+ intent.action == CustomSettingsHandler.CUSTOM_CONFIG_ACTION -> true
+ intent.data != null -> true
+ else -> {
+ val extras = intent.extras
+ extras != null &&
+ BundleCompat.getParcelable(extras, EXTRA_SELECTED_GAME, Game::class.java) != null
+ }
+ }
+ }
+
+ private fun handleSwapIntent(intent: Intent) {
+ if (!isSwapIntent(intent)) {
+ return
+ }
+
+ pendingRomSwapIntent = Intent(intent)
+
+ if (!isWaitingForRomSwapStop) {
+ Log.info("[EmulationActivity] Begin ROM swap: data=${intent.data}")
+ isWaitingForRomSwapStop = true
+ romSwapNativeStopped = false
+ romSwapThreadStopped = false
+ romSwapGeneration += 1
+ val thisSwapGeneration = romSwapGeneration
+ emulationViewModel.setIsEmulationStopping(true)
+ emulationViewModel.setEmulationStopped(false)
+ val navHostFragment =
+ supportFragmentManager.findFragmentById(R.id.fragment_container) as? NavHostFragment
+ val childFragmentManager = navHostFragment?.childFragmentManager
+ val stoppingFragmentForSwap =
+ (childFragmentManager?.primaryNavigationFragment as? EmulationFragment) ?:
+ childFragmentManager
+ ?.fragments
+ ?.asReversed()
+ ?.firstOrNull {
+ it is EmulationFragment &&
+ it.isAdded &&
+ it.view != null &&
+ !it.isRemoving
+ } as? EmulationFragment
+
+ val hasSessionForSwap = hasEmulationSession || stoppingFragmentForSwap != null
+
+ if (!hasSessionForSwap) {
+ romSwapNativeStopped = true
+ romSwapThreadStopped = true
+ } else {
+ if (stoppingFragmentForSwap != null) {
+ stoppingFragmentForSwap.stopForRomSwap()
+ stoppingFragmentForSwap.notifyWhenEmulationThreadStops {
+ if (!isWaitingForRomSwapStop || romSwapGeneration != thisSwapGeneration) {
+ return@notifyWhenEmulationThreadStops
+ }
+ romSwapThreadStopped = true
+ Log.info("[EmulationActivity] ROM swap thread stop acknowledged")
+ launchPendingRomSwap(force = false)
+ }
+ } else {
+ Log.warning("[EmulationActivity] ROM swap stop target fragment not found; requesting native stop")
+ romSwapThreadStopped = true
+ NativeLibrary.stopEmulation()
+ }
+
+ scheduleRomSwapStopTimeout()
+ }
+ }
+
+ launchPendingRomSwap(force = false)
+ }
+
+ private fun launchPendingRomSwap(force: Boolean) {
+ if (!isWaitingForRomSwapStop) {
+ return
+ }
+ if (!force && (!romSwapNativeStopped || !romSwapThreadStopped)) {
+ return
+ }
+ val swapIntent = pendingRomSwapIntent ?: return
+ Log.info("[EmulationActivity] Launching pending ROM swap: data=${swapIntent.data}")
+ pendingRomSwapIntent = null
+ isWaitingForRomSwapStop = false
+ romSwapNativeStopped = false
+ romSwapThreadStopped = false
+ mainHandler.removeCallbacks(romSwapStopTimeoutRunnable)
+ applyGameLaunchIntent(swapIntent)
+ }
+
+ private fun applyGameLaunchIntent(intent: Intent) {
+ hasEmulationSession = true
+ processHasEmulationSession = true
+ emulationViewModel.setIsEmulationStopping(false)
+ emulationViewModel.setEmulationStopped(false)
+ setIntent(Intent(intent))
+ val navHostFragment =
+ supportFragmentManager.findFragmentById(R.id.fragment_container) as NavHostFragment
+ val navController = navHostFragment.navController
+ val startArgs = intent.extras?.let { Bundle(it) } ?: Bundle()
+ val navOptions = NavOptions.Builder()
+ .setPopUpTo(R.id.emulationFragment, true)
+ .build()
+
+ runCatching {
+ navController.navigate(R.id.emulationFragment, startArgs, navOptions)
+ }.onFailure {
+ Log.warning("[EmulationActivity] ROM swap navigate fallback to setGraph: ${it.message}")
+ navController.setGraph(R.navigation.emulation_navigation, startArgs)
+ }
+ }
+
override fun dispatchKeyEvent(event: KeyEvent): Boolean {
if (event.keyCode == KeyEvent.KEYCODE_VOLUME_UP ||
@@ -608,19 +762,48 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener, InputManager
}
fun onEmulationStarted() {
+ if (Looper.myLooper() != Looper.getMainLooper()) {
+ mainHandler.post { onEmulationStarted() }
+ return
+ }
+ hasEmulationSession = true
+ processHasEmulationSession = true
emulationViewModel.setEmulationStarted(true)
+ emulationViewModel.setIsEmulationStopping(false)
+ emulationViewModel.setEmulationStopped(false)
NativeLibrary.playTimeManagerStart()
}
fun onEmulationStopped(status: Int) {
- if (status == 0 && emulationViewModel.programChanged.value == -1) {
+ if (Looper.myLooper() != Looper.getMainLooper()) {
+ mainHandler.post { onEmulationStopped(status) }
+ return
+ }
+ hasEmulationSession = false
+ processHasEmulationSession = false
+ if (isWaitingForRomSwapStop) {
+ romSwapNativeStopped = true
+ Log.info("[EmulationActivity] ROM swap native stop acknowledged")
+ launchPendingRomSwap(force = false)
+ } else if (status == 0 && emulationViewModel.programChanged.value == -1) {
+ processSessionGame = null
finish()
+ } else if (!isWaitingForRomSwapStop) {
+ processSessionGame = null
}
emulationViewModel.setEmulationStopped(true)
}
+ fun updateSessionGame(game: Game?) {
+ processSessionGame = game
+ }
+
fun onProgramChanged(programIndex: Int) {
+ if (Looper.myLooper() != Looper.getMainLooper()) {
+ mainHandler.post { onProgramChanged(programIndex) }
+ return
+ }
emulationViewModel.setProgramChanged(programIndex)
}
@@ -644,6 +827,11 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener, InputManager
companion object {
const val EXTRA_SELECTED_GAME = "SelectedGame"
const val EXTRA_OVERLAY_GAMELESS_EDIT_MODE = "overlayGamelessEditMode"
+ private const val ROM_SWAP_STOP_TIMEOUT_MS = 5000L
+ @Volatile
+ private var processHasEmulationSession = false
+ @Volatile
+ private var processSessionGame: Game? = null
fun stopForegroundService(activity: Activity) {
val startIntent = Intent(activity, ForegroundService::class.java)
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
index 435fe5fe2c..b67bc6a9cc 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt
@@ -50,6 +50,7 @@ import androidx.fragment.app.Fragment
import androidx.fragment.app.activityViewModels
import androidx.lifecycle.lifecycleScope
import androidx.navigation.findNavController
+import androidx.navigation.fragment.NavHostFragment
import androidx.navigation.fragment.navArgs
import androidx.window.layout.FoldingFeature
import androidx.window.layout.WindowInfoTracker
@@ -135,6 +136,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
private var intentGame: Game? = null
private var isCustomSettingsIntent = false
+ private var isStoppingForRomSwap = false
+ private var deferGameSetupUntilStopCompletes = false
private var perfStatsRunnable: Runnable? = null
private var socRunnable: Runnable? = null
@@ -238,6 +241,14 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
}
}
+ if (emulationViewModel.isEmulationStopping.value) {
+ deferGameSetupUntilStopCompletes = true
+ if (game == null) {
+ game = args.game ?: intentGame
+ }
+ return
+ }
+
finishGameSetup()
}
@@ -260,6 +271,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
}
game = gameToUse
+ emulationActivity?.updateSessionGame(gameToUse)
} catch (e: Exception) {
Log.error("[EmulationFragment] Error during game setup: ${e.message}")
Toast.makeText(
@@ -334,7 +346,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
}
emulationState = EmulationState(game!!.path) {
- return@EmulationState driverViewModel.isInteractionAllowed.value
+ return@EmulationState driverViewModel.isInteractionAllowed.value &&
+ !isStoppingForRomSwap
}
}
@@ -890,8 +903,12 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
}
)
- GameIconUtils.loadGameIcon(game!!, binding.loadingImage)
- binding.loadingTitle.text = game!!.title
+ game?.let {
+ GameIconUtils.loadGameIcon(it, binding.loadingImage)
+ binding.loadingTitle.text = it.title
+ } ?: run {
+ binding.loadingTitle.text = ""
+ }
binding.loadingTitle.isSelected = true
binding.loadingText.isSelected = true
@@ -959,6 +976,12 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
ViewUtils.showView(binding.loadingIndicator)
ViewUtils.hideView(binding.inputContainer)
ViewUtils.hideView(binding.showStatsOverlayText)
+ } else if (deferGameSetupUntilStopCompletes) {
+ if (!isAdded) {
+ return@collect
+ }
+ deferGameSetupUntilStopCompletes = false
+ finishGameSetup()
}
}
emulationViewModel.drawerOpen.collect(viewLifecycleOwner) {
@@ -995,26 +1018,24 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
}
driverViewModel.isInteractionAllowed.collect(viewLifecycleOwner) {
- if (it && !NativeLibrary.isRunning() && !NativeLibrary.isPaused()) {
- startEmulation()
+ if (it &&
+ !isStoppingForRomSwap &&
+ !NativeLibrary.isRunning() &&
+ !NativeLibrary.isPaused()
+ ) {
+ if (!DirectoryInitialization.areDirectoriesReady) {
+ DirectoryInitialization.start()
+ }
+
+ updateScreenLayout()
+
+ emulationState.run(emulationActivity!!.isActivityRecreated)
}
}
driverViewModel.onLaunchGame()
}
- private fun startEmulation(programIndex: Int = 0) {
- if (!NativeLibrary.isRunning() && !NativeLibrary.isPaused()) {
- if (!DirectoryInitialization.areDirectoriesReady) {
- DirectoryInitialization.start()
- }
-
- updateScreenLayout()
-
- emulationState.run(emulationActivity!!.isActivityRecreated, programIndex)
- }
- }
-
override fun onConfigurationChanged(newConfig: Configuration) {
super.onConfigurationChanged(newConfig)
val b = _binding ?: return
@@ -1375,6 +1396,9 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
super.onDestroyView()
amiiboLoadJob?.cancel()
amiiboLoadJob = null
+ perfStatsRunnable?.let { perfStatsUpdateHandler.removeCallbacks(it) }
+ socRunnable?.let { socUpdateHandler.removeCallbacks(it) }
+ handler.removeCallbacksAndMessages(null)
clearPausedFrame()
_binding?.surfaceInputOverlay?.touchEventListener = null
_binding = null
@@ -1382,7 +1406,9 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
}
override fun onDetach() {
- NativeLibrary.clearEmulationActivity()
+ if (!hasNewerEmulationFragment()) {
+ NativeLibrary.clearEmulationActivity()
+ }
super.onDetach()
}
@@ -1840,10 +1866,74 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
}
override fun surfaceDestroyed(holder: SurfaceHolder) {
- emulationState.clearSurface()
+ if (this::emulationState.isInitialized && !hasNewerEmulationFragment()) {
+ emulationState.clearSurface()
+ }
emulationStarted = false
}
+ private fun hasNewerEmulationFragment(): Boolean {
+ val activity = emulationActivity ?: return false
+ return try {
+ val navHostFragment =
+ activity.supportFragmentManager.findFragmentById(R.id.fragment_container) as? NavHostFragment
+ ?: return false
+ val currentFragment = navHostFragment.childFragmentManager.fragments
+ .filterIsInstance<EmulationFragment>()
+ .firstOrNull()
+ currentFragment != null && currentFragment !== this
+ } catch (_: Exception) {
+ false
+ }
+ }
+
+ // xbzk: called from EmulationActivity when a new game is loaded while this fragment is still active,
+ // to wait for the emulation thread to stop before allowing the ROM swap to proceed
+ fun notifyWhenEmulationThreadStops(onStopped: () -> Unit) {
+ if (!this::emulationState.isInitialized) {
+ onStopped()
+ return
+ }
+ val emuThread = runCatching { emulationState.emulationThread }.getOrNull()
+ if (emuThread == null || !emuThread.isAlive) {
+ onStopped()
+ return
+ }
+ Thread({
+ runCatching { emuThread.join() }
+ Handler(Looper.getMainLooper()).post {
+ onStopped()
+ }
+ }, "RomSwapWait").start()
+ }
+
+ // xbzk: called from EmulationActivity when a new game is loaded while this
+ // fragment is still active, to stop the current emulation before swapping the ROM
+ fun stopForRomSwap() {
+ if (isStoppingForRomSwap) {
+ return
+ }
+ isStoppingForRomSwap = true
+ clearPausedFrame()
+ emulationViewModel.setIsEmulationStopping(true)
+ _binding?.let {
+ binding.loadingText.setText(R.string.shutting_down)
+ ViewUtils.showView(binding.loadingIndicator)
+ ViewUtils.hideView(binding.inputContainer)
+ ViewUtils.hideView(binding.showStatsOverlayText)
+ }
+ if (this::emulationState.isInitialized) {
+ emulationState.stop()
+ if (NativeLibrary.isRunning() || NativeLibrary.isPaused()) {
+ Log.warning("[EmulationFragment] ROM swap stop fallback: forcing native stop request.")
+ NativeLibrary.stopEmulation()
+ }
+ } else {
+ NativeLibrary.stopEmulation()
+ }
+ NativeConfig.reloadGlobalConfig()
+ }
+
private fun showOverlayOptions() {
val anchor = binding.inGameMenu.findViewById(R.id.menu_overlay_controls)
val popup = PopupMenu(requireContext(), anchor)
@@ -2134,6 +2224,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback {
state = State.STOPPED
} else {
Log.warning("[EmulationFragment] Stop called while already stopped.")
+ NativeLibrary.stopEmulation()
}
}
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt
index faa35bc3eb..c3dea79bae 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt
@@ -36,6 +36,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentGamePropertiesBinding
import org.yuzu.yuzu_emu.features.DocumentProvider
import org.yuzu.yuzu_emu.features.settings.model.Settings
import org.yuzu.yuzu_emu.features.settings.ui.SettingsSubscreen
+import org.yuzu.yuzu_emu.model.AddonViewModel
import org.yuzu.yuzu_emu.model.DriverViewModel
import org.yuzu.yuzu_emu.model.GameProperty
import org.yuzu.yuzu_emu.model.GamesViewModel
@@ -46,6 +47,7 @@ import org.yuzu.yuzu_emu.model.SubmenuProperty
import org.yuzu.yuzu_emu.model.TaskState
import org.yuzu.yuzu_emu.utils.DirectoryInitialization
import org.yuzu.yuzu_emu.utils.FileUtil
+import org.yuzu.yuzu_emu.utils.GameHelper
import org.yuzu.yuzu_emu.utils.GameIconUtils
import org.yuzu.yuzu_emu.utils.GpuDriverHelper
import org.yuzu.yuzu_emu.utils.MemoryUtil
@@ -61,6 +63,7 @@ class GamePropertiesFragment : Fragment() {
private val homeViewModel: HomeViewModel by activityViewModels()
private val gamesViewModel: GamesViewModel by activityViewModels()
+ private val addonViewModel: AddonViewModel by activityViewModels()
private val driverViewModel: DriverViewModel by activityViewModels()
private val args by navArgs()
@@ -118,6 +121,20 @@ class GamePropertiesFragment : Fragment() {
.show(childFragmentManager, LaunchGameDialogFragment.TAG)
}
+ if (GameHelper.cachedGameList.isEmpty()) {
+ binding.buttonStart.isEnabled = false
+ viewLifecycleOwner.lifecycleScope.launch {
+ withContext(Dispatchers.IO) {
+ GameHelper.restoreContentForGame(args.game)
+ }
+ if (_binding == null) {
+ return@launch
+ }
+ addonViewModel.onAddonsViewStarted(args.game)
+ binding.buttonStart.isEnabled = true
+ }
+ }
+
reloadList()
homeViewModel.openImportSaves.collect(
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/GamesViewModel.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/GamesViewModel.kt
index 39ff038034..1a63a3ad82 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/GamesViewModel.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/model/GamesViewModel.kt
@@ -100,42 +100,45 @@ class GamesViewModel : ViewModel() {
viewModelScope.launch {
withContext(Dispatchers.IO) {
- if (firstStartup) {
- // Retrieve list of cached games
- val storedGames =
- PreferenceManager.getDefaultSharedPreferences(YuzuApplication.appContext)
- .getStringSet(GameHelper.KEY_GAMES, emptySet())
- if (storedGames!!.isNotEmpty()) {
- val deserializedGames = mutableSetOf<Game>()
- storedGames.forEach {
- val game: Game
- try {
- game = Json.decodeFromString(it)
- } catch (e: Exception) {
- // We don't care about any errors related to parsing the game cache
- return@forEach
- }
+ try {
+ if (firstStartup) {
+ // Retrieve list of cached games
+ val storedGames =
+ PreferenceManager.getDefaultSharedPreferences(YuzuApplication.appContext)
+ .getStringSet(GameHelper.KEY_GAMES, emptySet())
+ if (storedGames!!.isNotEmpty()) {
+ val deserializedGames = mutableSetOf<Game>()
+ storedGames.forEach {
+ val game: Game
+ try {
+ game = Json.decodeFromString(it)
+ } catch (e: Exception) {
+ // We don't care about any errors related to parsing the game cache
+ return@forEach
+ }
- val gameExists =
- DocumentFile.fromSingleUri(
- YuzuApplication.appContext,
- Uri.parse(game.path)
- )?.exists()
- if (gameExists == true) {
- deserializedGames.add(game)
+ val gameExists =
+ DocumentFile.fromSingleUri(
+ YuzuApplication.appContext,
+ Uri.parse(game.path)
+ )?.exists()
+ if (gameExists == true) {
+ deserializedGames.add(game)
+ }
}
+ setGames(deserializedGames.toList())
}
- setGames(deserializedGames.toList())
}
- }
- setGames(GameHelper.getGames())
- reloading.set(false)
- _isReloading.value = false
- _shouldScrollAfterReload.value = true
+ setGames(GameHelper.getGames())
+ _shouldScrollAfterReload.value = true
- if (directoriesChanged) {
- setShouldSwapData(true)
+ if (directoriesChanged) {
+ setShouldSwapData(true)
+ }
+ } finally {
+ reloading.set(false)
+ _isReloading.value = false
}
}
}
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DirectoryInitialization.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DirectoryInitialization.kt
index f47c60491b..f961c5e984 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DirectoryInitialization.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/DirectoryInitialization.kt
@@ -23,8 +23,8 @@ object DirectoryInitialization {
fun start() {
if (!areDirectoriesReady) {
initializeInternalStorage()
- NativeLibrary.initializeSystem(false)
NativeConfig.initializeGlobalConfig()
+ NativeLibrary.initializeSystem(false)
NativeLibrary.reloadProfiles()
migrateSettings()
areDirectoriesReady = true
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/GameHelper.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/GameHelper.kt
index 4a3cf61daa..64e035afbe 100644
--- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/GameHelper.kt
+++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/GameHelper.kt
@@ -8,9 +8,11 @@ package org.yuzu.yuzu_emu.utils
import android.content.SharedPreferences
import android.net.Uri
+import android.provider.DocumentsContract
import androidx.preference.PreferenceManager
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json
+import java.io.File
import org.yuzu.yuzu_emu.NativeLibrary
import org.yuzu.yuzu_emu.YuzuApplication
import org.yuzu.yuzu_emu.model.Game
@@ -49,29 +51,8 @@ object GameHelper {
// Remove previous filesystem provider information so we can get up to date version info
NativeLibrary.clearFilesystemProvider()
- // Scan External Content directories and register all NSP/XCI files
- val externalContentDirs = NativeConfig.getExternalContentDirs()
- val uniqueExternalContentDirs = linkedSetOf<String>()
- externalContentDirs.forEach { externalDir ->
- if (externalDir.isNotEmpty()) {
- uniqueExternalContentDirs.add(externalDir)
- }
- }
-
val mountedContainerUris = mutableSetOf<String>()
- for (externalDir in uniqueExternalContentDirs) {
- if (externalDir.isNotEmpty()) {
- val externalDirUri = externalDir.toUri()
- if (FileUtil.isTreeUriValid(externalDirUri)) {
- scanContentContainersRecursive(FileUtil.listFiles(externalDirUri), 3) {
- val containerUri = it.uri.toString()
- if (mountedContainerUris.add(containerUri)) {
- NativeLibrary.addFileToFilesystemProvider(containerUri)
- }
- }
- }
- }
- }
+ mountExternalContentDirectories(mountedContainerUris)
val badDirs = mutableListOf()
gameDirs.forEachIndexed { index: Int, gameDir: GameDir ->
@@ -115,6 +96,15 @@ object GameHelper {
return games.toList()
}
+ fun restoreContentForGame(game: Game) {
+ NativeLibrary.reloadKeys()
+
+ val mountedContainerUris = mutableSetOf<String>()
+ mountExternalContentDirectories(mountedContainerUris)
+ mountGameFolderContent(Uri.parse(game.path), mountedContainerUris)
+ NativeLibrary.addFileToFilesystemProvider(game.path)
+ }
+
// File extensions considered as external content, buuut should
// be done better imo.
private val externalContentExtensions = setOf("nsp", "xci")
@@ -181,6 +171,71 @@ object GameHelper {
}
}
+ private fun mountExternalContentDirectories(mountedContainerUris: MutableSet<String>) {
+ val uniqueExternalContentDirs = linkedSetOf<String>()
+ NativeConfig.getExternalContentDirs().forEach { externalDir ->
+ if (externalDir.isNotEmpty()) {
+ uniqueExternalContentDirs.add(externalDir)
+ }
+ }
+
+ for (externalDir in uniqueExternalContentDirs) {
+ val externalDirUri = externalDir.toUri()
+ if (FileUtil.isTreeUriValid(externalDirUri)) {
+ scanContentContainersRecursive(FileUtil.listFiles(externalDirUri), 3) {
+ val containerUri = it.uri.toString()
+ if (mountedContainerUris.add(containerUri)) {
+ NativeLibrary.addFileToFilesystemProvider(containerUri)
+ }
+ }
+ }
+ }
+ }
+
+ private fun mountGameFolderContent(gameUri: Uri, mountedContainerUris: MutableSet<String>) {
+ if (gameUri.scheme == "content") {
+ val parentUri = getParentDocumentUri(gameUri) ?: return
+ scanContentContainersRecursive(FileUtil.listFiles(parentUri), 1) {
+ val containerUri = it.uri.toString()
+ if (mountedContainerUris.add(containerUri)) {
+ NativeLibrary.addGameFolderFileToFilesystemProvider(containerUri)
+ }
+ }
+ return
+ }
+
+ val gameFile = File(gameUri.path ?: gameUri.toString())
+ val parentDir = gameFile.parentFile ?: return
+ parentDir.listFiles()?.forEach { sibling ->
+ if (!sibling.isFile) {
+ return@forEach
+ }
+
+ val extension = sibling.extension.lowercase()
+ if (externalContentExtensions.contains(extension)) {
+ val containerUri = Uri.fromFile(sibling).toString()
+ if (mountedContainerUris.add(containerUri)) {
+ NativeLibrary.addGameFolderFileToFilesystemProvider(containerUri)
+ }
+ }
+ }
+ }
+
+ private fun getParentDocumentUri(uri: Uri): Uri? {
+ return try {
+ val documentId = DocumentsContract.getDocumentId(uri)
+ val separatorIndex = documentId.lastIndexOf('/')
+ if (separatorIndex == -1) {
+ null
+ } else {
+ val parentDocumentId = documentId.substring(0, separatorIndex)
+ DocumentsContract.buildDocumentUriUsingTree(uri, parentDocumentId)
+ }
+ } catch (_: Exception) {
+ null
+ }
+ }
+
fun getGame(
uri: Uri,
addedToLibrary: Boolean,
diff --git a/src/android/app/src/main/res/drawable/ic_launcher_foreground.png b/src/android/app/src/main/res/drawable/ic_launcher_foreground.png
index 53f1cace9b..8b970cd4cc 100644
Binary files a/src/android/app/src/main/res/drawable/ic_launcher_foreground.png and b/src/android/app/src/main/res/drawable/ic_launcher_foreground.png differ
diff --git a/src/android/app/src/main/res/drawable/ic_yuzu.png b/src/android/app/src/main/res/drawable/ic_yuzu.png
index fce02afa1f..7e2461ba24 100644
Binary files a/src/android/app/src/main/res/drawable/ic_yuzu.png and b/src/android/app/src/main/res/drawable/ic_yuzu.png differ
diff --git a/src/android/app/src/main/res/drawable/ic_yuzu_splash.png b/src/android/app/src/main/res/drawable/ic_yuzu_splash.png
index 0e43cb9374..c9404d9937 100644
Binary files a/src/android/app/src/main/res/drawable/ic_yuzu_splash.png and b/src/android/app/src/main/res/drawable/ic_yuzu_splash.png differ
diff --git a/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png
index 23bc2897c3..74c6677dd9 100644
Binary files a/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-hdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png
index f630e793e3..31a01461b4 100644
Binary files a/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-mdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png
index 1daa3c624f..3f0023f573 100644
Binary files a/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png
index 7fc64e1393..6e28b3d598 100644
Binary files a/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png b/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png
index 53ed9b9914..39f583b630 100644
Binary files a/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png and b/src/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png differ
diff --git a/src/android/app/src/main/res/values/colors.xml b/src/android/app/src/main/res/values/colors.xml
index 472567b323..ad3412ed27 100644
--- a/src/android/app/src/main/res/values/colors.xml
+++ b/src/android/app/src/main/res/values/colors.xml
@@ -1 +1 @@
-#1F143C
+#43fcfcff
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 1ee4794272..2846058df9 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -134,6 +134,8 @@ add_library(
typed_address.h
uint128.h
unique_function.h
+ random.cpp
+ random.h
uuid.cpp
uuid.h
vector_math.h
@@ -144,7 +146,8 @@ add_library(
zstd_compression.cpp
zstd_compression.h
fs/ryujinx_compat.h fs/ryujinx_compat.cpp
- fs/symlink.h fs/symlink.cpp)
+ fs/symlink.h fs/symlink.cpp
+ httplib.h)
if(WIN32)
target_sources(common PRIVATE windows/timer_resolution.cpp
@@ -242,7 +245,7 @@ else()
target_link_libraries(common PUBLIC Boost::headers)
endif()
-target_link_libraries(common PUBLIC Boost::filesystem Boost::context)
+target_link_libraries(common PUBLIC Boost::filesystem Boost::context httplib::httplib)
if (lz4_ADDED)
target_include_directories(common PRIVATE ${lz4_SOURCE_DIR}/lib)
diff --git a/src/common/httplib.h b/src/common/httplib.h
new file mode 100644
index 0000000000..57bc4eeb93
--- /dev/null
+++ b/src/common/httplib.h
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#define CPPHTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES 1
+#define CPPHTTPLIB_OPENSSL_SUPPORT 1
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#endif
+#include
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/common/random.cpp b/src/common/random.cpp
new file mode 100644
index 0000000000..d951881cd2
--- /dev/null
+++ b/src/common/random.cpp
@@ -0,0 +1,22 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include
+#include
+#include "common/random.h"
+
+namespace Common::Random {
+ [[nodiscard]] static std::random_device& GetGlobalRandomDevice() noexcept {
+ static std::random_device g_random_device{};
+ return g_random_device;
+ }
+ [[nodiscard]] u32 Random32(u32 seed) noexcept {
+ return GetGlobalRandomDevice()();
+ }
+ [[nodiscard]] u64 Random64(u64 seed) noexcept {
+ return GetGlobalRandomDevice()();
+ }
+ [[nodiscard]] std::mt19937 GetMT19937() noexcept {
+ return std::mt19937(GetGlobalRandomDevice()());
+ }
+}
diff --git a/src/common/random.h b/src/common/random.h
new file mode 100644
index 0000000000..83210f6dc2
--- /dev/null
+++ b/src/common/random.h
@@ -0,0 +1,13 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include
+#include "common/common_types.h"
+
+namespace Common::Random {
+ [[nodiscard]] u32 Random32(u32 seed) noexcept;
+ [[nodiscard]] u64 Random64(u64 seed) noexcept;
+ [[nodiscard]] std::mt19937 GetMT19937() noexcept;
+}
diff --git a/src/common/tiny_mt.h b/src/common/tiny_mt.h
index c9f9ed4a5d..4b556a33eb 100644
--- a/src/common/tiny_mt.h
+++ b/src/common/tiny_mt.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
@@ -218,12 +218,6 @@ public:
return t0;
}
- u64 GenerateRandomU64() {
- const u32 lo = this->GenerateRandomU32();
- const u32 hi = this->GenerateRandomU32();
- return (u64{hi} << 32) | u64{lo};
- }
-
float GenerateRandomF32() {
// Floats have 24 bits of mantissa.
constexpr u32 MantissaBits = 24;
diff --git a/src/common/uuid.cpp b/src/common/uuid.cpp
index 8f0dba452c..d4a5733c26 100644
--- a/src/common/uuid.cpp
+++ b/src/common/uuid.cpp
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@@ -10,6 +13,7 @@
#include "common/assert.h"
#include "common/tiny_mt.h"
#include "common/uuid.h"
+#include "common/random.h"
namespace Common {
@@ -175,21 +179,16 @@ u128 UUID::AsU128() const {
}
UUID UUID::MakeRandom() {
- std::random_device device;
-
- return MakeRandomWithSeed(device());
+ return MakeRandomWithSeed(Common::Random::Random32(0));
}
UUID UUID::MakeRandomWithSeed(u32 seed) {
// Create and initialize our RNG.
TinyMT rng;
rng.Initialize(seed);
-
UUID uuid;
-
// Populate the UUID with random bytes.
rng.GenerateRandomBytes(uuid.uuid.data(), sizeof(UUID));
-
return uuid;
}
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 08a2d0e2db..6dfc23229a 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -1269,7 +1269,6 @@ endif()
target_sources(core PRIVATE hle/service/ssl/ssl_backend_openssl.cpp)
target_link_libraries(core PRIVATE OpenSSL::SSL OpenSSL::Crypto)
-target_compile_definitions(core PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
# TODO
diff --git a/src/core/file_sys/registered_cache.cpp b/src/core/file_sys/registered_cache.cpp
index 61671ea333..af41820a36 100644
--- a/src/core/file_sys/registered_cache.cpp
+++ b/src/core/file_sys/registered_cache.cpp
@@ -12,6 +12,7 @@
#include "common/fs/path_util.h"
#include "common/hex_util.h"
#include "common/logging.h"
+#include "common/random.h"
#include "common/string_util.h"
#include "core/crypto/key_manager.h"
#include "core/file_sys/card_image.h"
@@ -490,17 +491,13 @@ std::vector PlaceholderCache::List() const {
}
NcaID PlaceholderCache::Generate() {
- std::random_device device;
- std::mt19937 gen(device());
+ auto gen = Common::Random::GetMT19937();
std::uniform_int_distribution distribution(1, (std::numeric_limits::max)());
-
NcaID out{};
-
const auto v1 = distribution(gen);
const auto v2 = distribution(gen);
std::memcpy(out.data(), &v1, sizeof(u64));
std::memcpy(out.data() + sizeof(u64), &v2, sizeof(u64));
-
return out;
}
diff --git a/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp b/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp
index 1446653916..3343d1d282 100644
--- a/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp
+++ b/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
@@ -7,6 +7,7 @@
#include
#include "common/literals.h"
+#include "common/random.h"
#include "common/settings.h"
#include "core/hle/kernel/board/nintendo/nx/k_system_control.h"
@@ -201,15 +202,8 @@ u64 GenerateUniformRange(u64 min, u64 max, F f) {
} // Anonymous namespace
-u64 KSystemControl::GenerateRandomU64() {
- std::random_device device;
- std::mt19937 gen(device());
- std::uniform_int_distribution distribution(1, (std::numeric_limits::max)());
- return distribution(gen);
-}
-
u64 KSystemControl::GenerateRandomRange(u64 min, u64 max) {
- return GenerateUniformRange(min, max, GenerateRandomU64);
+ return GenerateUniformRange(min, max, Common::Random::GetMT19937());
}
size_t KSystemControl::CalculateRequiredSecureMemorySize(size_t size, u32 pool) {
diff --git a/src/core/hle/kernel/board/nintendo/nx/k_system_control.h b/src/core/hle/kernel/board/nintendo/nx/k_system_control.h
index 60c5e58b73..41a25ba1c8 100644
--- a/src/core/hle/kernel/board/nintendo/nx/k_system_control.h
+++ b/src/core/hle/kernel/board/nintendo/nx/k_system_control.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@@ -33,7 +36,6 @@ public:
// Randomness.
static u64 GenerateRandomRange(u64 min, u64 max);
- static u64 GenerateRandomU64();
// Secure Memory.
static size_t CalculateRequiredSecureMemorySize(size_t size, u32 pool);
diff --git a/src/core/hle/kernel/k_page_bitmap.h b/src/core/hle/kernel/k_page_bitmap.h
index fc21b81574..27bd682c5c 100644
--- a/src/core/hle/kernel/k_page_bitmap.h
+++ b/src/core/hle/kernel/k_page_bitmap.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
@@ -14,6 +14,7 @@
#include "common/bit_util.h"
#include "common/common_types.h"
#include "common/tiny_mt.h"
+#include "common/random.h"
#include "core/hle/kernel/k_system_control.h"
namespace Kernel {
@@ -23,7 +24,7 @@ public:
class RandomBitGenerator {
public:
RandomBitGenerator() {
- m_rng.Initialize(static_cast(KSystemControl::GenerateRandomU64()));
+ m_rng.Initialize(u32(Common::Random::Random64(0)));
}
u64 SelectRandomBit(u64 bitmap) {
diff --git a/src/core/hle/kernel/k_thread.cpp b/src/core/hle/kernel/k_thread.cpp
index 17bdb7b6fa..ea9b7eb114 100644
--- a/src/core/hle/kernel/k_thread.cpp
+++ b/src/core/hle/kernel/k_thread.cpp
@@ -20,6 +20,7 @@
#include "common/fiber.h"
#include "common/logging.h"
#include "common/settings.h"
+#include "common/random.h"
#include "core/core.h"
#include "core/cpu_manager.h"
#include "core/hardware_properties.h"
@@ -45,8 +46,7 @@ namespace {
constexpr inline s32 TerminatingThreadPriority = Kernel::Svc::SystemThreadPriorityHighest - 1;
-static void ResetThreadContext32(Kernel::Svc::ThreadContext& ctx, u64 stack_top, u64 entry_point,
- u64 arg) {
+static void ResetThreadContext32(Kernel::Svc::ThreadContext& ctx, u64 stack_top, u64 entry_point, u64 arg) {
ctx = {};
ctx.r[0] = arg;
ctx.r[15] = entry_point;
@@ -55,11 +55,10 @@ static void ResetThreadContext32(Kernel::Svc::ThreadContext& ctx, u64 stack_top,
ctx.fpsr = 0;
}
-static void ResetThreadContext64(Kernel::Svc::ThreadContext& ctx, u64 stack_top, u64 entry_point,
- u64 arg) {
+static void ResetThreadContext64(Kernel::Svc::ThreadContext& ctx, u64 stack_top, u64 entry_point, u64 arg) {
ctx = {};
ctx.r[0] = arg;
- ctx.r[18] = Kernel::KSystemControl::GenerateRandomU64() | 1;
+ ctx.r[18] = Common::Random::Random64(0) | 1;
ctx.pc = entry_point;
ctx.sp = stack_top;
ctx.fpcr = 0;
diff --git a/src/core/hle/service/bcat/news/builtin_news.cpp b/src/core/hle/service/bcat/news/builtin_news.cpp
index ed001b056b..d24431cdbc 100644
--- a/src/core/hle/service/bcat/news/builtin_news.cpp
+++ b/src/core/hle/service/bcat/news/builtin_news.cpp
@@ -15,9 +15,7 @@
#include
#include
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-#include
-#endif
+#include "common/httplib.h"
#include
#include
@@ -103,8 +101,6 @@ std::vector TryLoadFromDisk(const std::filesystem::path& path) {
std::vector DownloadImage(const std::string& url_path, const std::filesystem::path& cache_path) {
LOG_INFO(Service_BCAT, "Downloading image: https://eden-emu.dev{}", url_path);
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
try {
httplib::Client cli("https://eden-emu.dev");
cli.set_follow_location(true);
@@ -128,8 +124,6 @@ std::vector DownloadImage(const std::string& url_path, const std::filesystem
} catch (...) {
LOG_WARNING(Service_BCAT, "Failed to download: {}", url_path);
}
-#endif
-
return {};
}
@@ -232,8 +226,6 @@ void WriteCachedJson(std::string_view json) {
}
std::optional DownloadReleasesJson() {
-
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
try {
httplib::SSLClient cli{"api.github.com", 443};
cli.set_connection_timeout(10);
@@ -255,7 +247,6 @@ std::optional DownloadReleasesJson() {
} catch (...) {
LOG_WARNING(Service_BCAT, " failed to download releases");
}
-#endif
return std::nullopt;
}
diff --git a/src/core/hle/service/cmif_serialization.h b/src/core/hle/service/cmif_serialization.h
index 4d32c6cd6b..75461cc6be 100644
--- a/src/core/hle/service/cmif_serialization.h
+++ b/src/core/hle/service/cmif_serialization.h
@@ -438,20 +438,20 @@ void WriteOutArgument(bool is_domain, CallArguments& args, u8* raw_data, HLERequ
template
void CmifReplyWrapImpl(HLERequestContext& ctx, T& t, Result (T::*f)(A...)) {
+ const auto mgr = ctx.GetManager().get();
// Verify domain state.
if constexpr (!Domain) {
- const auto _mgr = ctx.GetManager();
- const bool _is_domain = _mgr ? _mgr->IsDomain() : false;
- ASSERT_MSG(!_is_domain,
- "Non-domain reply used on domain session\n"
- "Service={} (TIPC={} CmdType={} Cmd=0x{:08X}\n"
- "HasDomainHeader={} DomainHandlers={}\nDesc={}",
- t.GetServiceName(), ctx.IsTipc(),
- static_cast(ctx.GetCommandType()), static_cast(ctx.GetCommand()),
- ctx.HasDomainMessageHeader(), _mgr ? static_cast(_mgr->DomainHandlerCount()) : 0u,
- ctx.Description());
+ const bool is_domain = mgr ? mgr->IsDomain() : false;
+ ASSERT_MSG(!is_domain,
+ "Non-domain reply used on domain session\n"
+ "Service={} (TIPC={} CmdType={} Cmd=0x{:08X}\n"
+ "HasDomainHeader={} DomainHandlers={}\nDesc={}",
+ t.GetServiceName(), ctx.IsTipc(),
+ u32(ctx.GetCommandType()), u32(ctx.GetCommand()),
+ ctx.HasDomainMessageHeader(), mgr ? u32(mgr->DomainHandlerCount()) : 0u,
+ ctx.Description());
}
- const bool is_domain = Domain ? ctx.GetManager()->IsDomain() : false;
+ const bool is_domain = Domain ? mgr->IsDomain() : false;
static_assert(ConstIfReference(), "Arguments taken by reference must be const");
using MethodArguments = std::tuple...>;
diff --git a/src/core/hle/service/ipc_helpers.h b/src/core/hle/service/ipc_helpers.h
index 4b02872fba..8aee17db8d 100644
--- a/src/core/hle/service/ipc_helpers.h
+++ b/src/core/hle/service/ipc_helpers.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
// SPDX-FileCopyrightText: 2016 Citra Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@@ -78,32 +81,29 @@ public:
memset(cmdbuf, 0, sizeof(u32) * IPC::COMMAND_BUFFER_LENGTH);
IPC::CommandHeader header{};
+ auto const mgr = ctx.GetManager().get();
// The entire size of the raw data section in u32 units, including the 16 bytes of mandatory
// padding.
- u32 raw_data_size = ctx.write_size =
- ctx.IsTipc() ? normal_params_size - 1 : normal_params_size;
+ u32 raw_data_size = ctx.write_size = ctx.IsTipc() ? normal_params_size - 1 : normal_params_size;
u32 num_handles_to_move{};
u32 num_domain_objects{};
- const bool always_move_handles{
- (static_cast(flags) & static_cast(Flags::AlwaysMoveHandles)) != 0};
- if (!ctx.GetManager()->IsDomain() || always_move_handles) {
+ const bool always_move_handles = (u32(flags) & u32(Flags::AlwaysMoveHandles)) != 0;
+ if (!mgr->IsDomain() || always_move_handles) {
num_handles_to_move = num_objects_to_move;
} else {
num_domain_objects = num_objects_to_move;
}
- if (ctx.GetManager()->IsDomain()) {
- raw_data_size +=
- static_cast(sizeof(DomainMessageHeader) / sizeof(u32) + num_domain_objects);
+ if (mgr->IsDomain()) {
+ raw_data_size += u32(sizeof(DomainMessageHeader) / sizeof(u32) + num_domain_objects);
ctx.write_size += num_domain_objects;
}
if (ctx.IsTipc()) {
header.type.Assign(ctx.GetCommandType());
} else {
- raw_data_size += static_cast(sizeof(IPC::DataPayloadHeader) / sizeof(u32) + 4 +
- normal_params_size);
+ raw_data_size += u32(sizeof(IPC::DataPayloadHeader) / sizeof(u32) + 4 + normal_params_size);
}
header.data_size.Assign(raw_data_size);
@@ -126,7 +126,7 @@ public:
if (!ctx.IsTipc()) {
AlignWithPadding();
- if (ctx.GetManager()->IsDomain() && ctx.HasDomainMessageHeader()) {
+ if (mgr->IsDomain() && ctx.HasDomainMessageHeader()) {
IPC::DomainMessageHeader domain_header{};
domain_header.num_objects = num_domain_objects;
PushRaw(domain_header);
diff --git a/src/core/hle/service/mii/mii_util.h b/src/core/hle/service/mii/mii_util.h
index 3534fa31d5..2ef006765c 100644
--- a/src/core/hle/service/mii/mii_util.h
+++ b/src/core/hle/service/mii/mii_util.h
@@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
@@ -7,6 +10,7 @@
#include
#include "common/common_types.h"
+#include "common/random.h"
#include "common/swap.h"
#include "common/uuid.h"
#include "core/hle/service/mii/mii_types.h"
@@ -65,11 +69,9 @@ public:
template
static T GetRandomValue(T min, T max) {
- std::random_device device;
- std::mt19937 gen(device());
- std::uniform_int_distribution distribution(static_cast(min),
- static_cast(max));
- return static_cast(distribution(gen));
+ std::uniform_int_distribution distribution{u64(min), u64(max)};
+ auto gen = Common::Random::GetMT19937();
+ return T(distribution(gen));
}
template
diff --git a/src/core/loader/deconstructed_rom_directory.cpp b/src/core/loader/deconstructed_rom_directory.cpp
index 4e0b119f21..2ea63e137e 100644
--- a/src/core/loader/deconstructed_rom_directory.cpp
+++ b/src/core/loader/deconstructed_rom_directory.cpp
@@ -6,6 +6,7 @@
#include
#include "common/logging.h"
+#include "common/random.h"
#include "common/settings.h"
#include "core/core.h"
#include "core/file_sys/content_archive.h"
@@ -229,7 +230,7 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
// TODO: this is bad form of ASLR, it sucks
size_t aslr_offset = ((::Settings::values.rng_seed_enabled.GetValue()
? ::Settings::values.rng_seed.GetValue()
- : std::rand()) * 0x734287f27) & 0xfff000;
+ : Common::Random::Random64(0)) * 0x734287f27) & 0xfff000;
// Setup the process code layout
if (process.LoadFromMetadata(metadata, code_size, fastmem_base, aslr_offset, is_hbl).IsError()) {
diff --git a/src/core/loader/kip.cpp b/src/core/loader/kip.cpp
index db6c98c5a3..81449ac8b8 100644
--- a/src/core/loader/kip.cpp
+++ b/src/core/loader/kip.cpp
@@ -6,6 +6,7 @@
#include
#include "common/settings.h"
+#include "common/random.h"
#include "core/file_sys/kernel_executable.h"
#include "core/file_sys/program_metadata.h"
#include "core/hle/kernel/code_set.h"
@@ -90,7 +91,7 @@ AppLoader::LoadResult AppLoader_KIP::Load(Kernel::KProcess& process,
// TODO: this is bad form of ASLR, it sucks
size_t aslr_offset = ((::Settings::values.rng_seed_enabled.GetValue()
? ::Settings::values.rng_seed.GetValue()
- : std::rand()) * 0x734287f27) & 0xfff000;
+ : Common::Random::Random64(0)) * 0x734287f27) & 0xfff000;
// Setup the process code layout
if (process.LoadFromMetadata(FileSys::ProgramMetadata::GetDefault(), codeset.memory.size(), 0, aslr_offset, false).IsError()) {
diff --git a/src/core/loader/nro.cpp b/src/core/loader/nro.cpp
index b429aa9e80..e7c5ac01b1 100644
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@@ -11,6 +11,7 @@
#include "common/common_types.h"
#include "common/logging.h"
#include "common/settings.h"
+#include "common/random.h"
#include "common/swap.h"
#include "core/core.h"
#include "core/file_sys/control_metadata.h"
@@ -243,7 +244,7 @@ static bool LoadNroImpl(Core::System& system, Kernel::KProcess& process,
// TODO: this is bad form of ASLR, it sucks
size_t aslr_offset = ((::Settings::values.rng_seed_enabled.GetValue()
? ::Settings::values.rng_seed.GetValue()
- : std::rand()) * 0x734287f27) & 0xfff000;
+ : Common::Random::Random64(0)) * 0x734287f27) & 0xfff000;
// Setup the process code layout
if (process
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 5c57df424c..3a9ea308a8 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -45,11 +45,7 @@ static inline bool AddressSpaceContains(const Common::PageTable& table, const Co
// from outside classes. This also allows modification to the internals of the memory
// subsystem without needing to rebuild all files that make use of the memory interface.
struct Memory::Impl {
- explicit Impl(Core::System& system_) : system{system_} {
- // Initialize thread count based on available cores for parallel memory operations
- const unsigned int hw_concurrency = std::thread::hardware_concurrency();
- thread_count = (std::max)(2u, (std::min)(hw_concurrency, 8u)); // Limit to 8 threads max
- }
+ explicit Impl(Core::System& system_) : system{system_} {}
void SetCurrentPageTable(Kernel::KProcess& process) {
current_page_table = &process.GetPageTable().GetImpl();
@@ -856,13 +852,7 @@ struct Memory::Impl {
Tegra::MaxwellDeviceMemoryManager* gpu_device_memory{};
Common::PageTable* current_page_table = nullptr;
- // Number of threads to use for parallel memory operations
- unsigned int thread_count = 2;
-
- // Minimum size in bytes for which parallel processing is beneficial
- //size_t PARALLEL_THRESHOLD = (L3 CACHE * NUM PHYSICAL CORES); // 64 KB
- std::array
- rasterizer_read_areas{};
+ std::array rasterizer_read_areas{};
std::array rasterizer_write_areas{};
std::array, Core::Hardware::NUM_CPU_CORES> scratch_buffers{};
std::span gpu_dirty_managers;
diff --git a/src/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp b/src/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp
index 47d83f2362..a92648cd44 100644
--- a/src/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp
+++ b/src/dynarmic/src/dynarmic/backend/arm64/reg_alloc.cpp
@@ -316,8 +316,8 @@ int RegAlloc::RealizeReadImpl(const IR::Value& value) {
return current_location->index;
}
- ASSERT(!ValueInfo(*current_location).realized);
- ASSERT(ValueInfo(*current_location).locked);
+ ASSERT(!bool(ValueInfo(*current_location).realized));
+ ASSERT(bool(ValueInfo(*current_location).locked));
if constexpr (required_kind == HostLoc::Kind::Gpr) {
const int new_location_index = AllocateRegister(gprs, gpr_order);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
index 80f0f9cc2f..dd9e9e4a66 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp
@@ -59,8 +59,10 @@ static Xbyak::Address MJitStateExtReg(A32::ExtReg reg) {
UNREACHABLE();
}
-A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
- : EmitContext(reg_alloc, block), conf(conf) {}
+A32EmitContext::A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels)
+ : EmitContext(reg_alloc, block, shared_labels)
+ , conf(conf)
+{}
A32::LocationDescriptor A32EmitContext::Location() const {
return A32::LocationDescriptor{block.Location()};
@@ -109,35 +111,59 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
gprs.reset(size_t(HostLoc::R14));
return gprs;
}(), any_xmm);
- A32EmitContext ctx{conf, reg_alloc, block};
+
+ A32EmitContext ctx{conf, reg_alloc, block, shared_labels};
// Start emitting.
code.align();
const u8* const entrypoint = code.getCurr();
+ code.mov(code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)], rbp);
+ code.lea(rbp, code.ptr[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer) - 8]);
EmitCondPrelude(ctx);
-
- for (auto iter = block.instructions.begin(); iter != block.instructions.end(); ++iter) [[likely]] {
- auto* inst = &*iter;
- // Call the relevant Emit* member function.
- switch (inst->GetOpcode()) {
-#define OPCODE(name, type, ...) \
- case IR::Opcode::name: \
- A32EmitX64::Emit##name(ctx, inst); \
- break;
-#define A32OPC(name, type, ...) \
- case IR::Opcode::A32##name: \
- A32EmitX64::EmitA32##name(ctx, inst);\
- break;
+ typedef void (EmitX64::*EmitHandlerFn)(EmitContext& context, IR::Inst* inst);
+ constexpr EmitHandlerFn opcode_handlers[] = {
+#define OPCODE(name, type, ...) &EmitX64::Emit##name,
+#define A32OPC(name, type, ...)
+#define A64OPC(name, type, ...)
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
+#undef A64OPC
+ };
+ typedef void (A32EmitX64::*A32EmitHandlerFn)(A32EmitContext& context, IR::Inst* inst);
+ constexpr A32EmitHandlerFn a32_handlers[] = {
+#define OPCODE(...)
+#define A32OPC(name, type, ...) &A32EmitX64::EmitA32##name,
#define A64OPC(...)
#include "dynarmic/ir/opcodes.inc"
#undef OPCODE
#undef A32OPC
+#undef A64OPC
+ };
+
+ for (auto& inst : block.instructions) {
+ auto const opcode = inst.GetOpcode();
+ // Call the relevant Emit* member function.
+ switch (opcode) {
+#define OPCODE(name, type, ...) case IR::Opcode::name: goto opcode_branch;
+#define A32OPC(name, type, ...) case IR::Opcode::A32##name: goto a32_branch;
+#define A64OPC(name, type, ...)
+#include "dynarmic/ir/opcodes.inc"
+#undef OPCODE
+#undef A32OPC
#undef A64OPC
default:
UNREACHABLE();
}
- reg_alloc.EndOfAllocScope();
+opcode_branch:
+ (this->*opcode_handlers[size_t(opcode)])(ctx, &inst);
+ goto finish_this_inst;
+a32_branch:
+ // Update with FIRST A32 instruction
+ (this->*a32_handlers[size_t(opcode) - size_t(IR::Opcode::A32SetCheckBit)])(ctx, &inst);
+finish_this_inst:
+ ctx.reg_alloc.EndOfAllocScope();
#ifndef NDEBUG
if (conf.very_verbose_debugging_output)
EmitVerboseDebuggingOutput(reg_alloc);
@@ -146,15 +172,14 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
reg_alloc.AssertNoMoreUses();
- if (conf.enable_cycle_counting) {
+ if (conf.enable_cycle_counting)
EmitAddCycles(block.CycleCount());
- }
+ code.mov(rbp, code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)]);
EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
code.int3();
- for (auto& deferred_emit : ctx.deferred_emits) {
+ for (auto& deferred_emit : ctx.deferred_emits)
deferred_emit();
- }
code.int3();
const size_t size = size_t(code.getCurr() - entrypoint);
@@ -167,6 +192,7 @@ A32EmitX64::BlockDescriptor A32EmitX64::Emit(IR::Block& block) {
auto const bdesc = RegisterBlock(descriptor, entrypoint, size);
code.DisableWriting();
+ shared_labels.clear();
return bdesc;
}
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
index 5ec78ff50e..8e97dc7737 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
@@ -29,7 +29,7 @@ namespace Dynarmic::Backend::X64 {
class RegAlloc;
struct A32EmitContext final : public EmitContext {
- A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
+ A32EmitContext(const A32::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels);
A32::LocationDescriptor Location() const;
A32::LocationDescriptor EndLocation() const;
@@ -130,6 +130,7 @@ public:
ankerl::unordered_dense::map, void (*)()> write_fallbacks;
ankerl::unordered_dense::map, void (*)()> exclusive_write_fallbacks;
ankerl::unordered_dense::set do_not_fastmem;
+ boost::container::stable_vector shared_labels;
void (*memory_read_128)() = nullptr; // Dummy
void (*memory_write_128)() = nullptr; // Dummy
const void* terminal_handler_pop_rsb_hint;
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
index 832cfdcce2..8edeb29aed 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp
@@ -37,8 +37,10 @@ namespace Dynarmic::Backend::X64 {
using namespace Xbyak::util;
-A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block)
- : EmitContext(reg_alloc, block), conf(conf) {}
+A64EmitContext::A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels)
+ : EmitContext(reg_alloc, block, shared_labels)
+ , conf(conf)
+{}
A64::LocationDescriptor A64EmitContext::Location() const {
return A64::LocationDescriptor{block.Location()};
@@ -83,11 +85,14 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) noexcept {
gprs.reset(size_t(HostLoc::R14));
return gprs;
}(), any_xmm};
- A64EmitContext ctx{conf, reg_alloc, block};
+
+ A64EmitContext ctx{conf, reg_alloc, block, shared_labels};
// Start emitting.
code.align();
const auto* const entrypoint = code.getCurr();
+ code.mov(code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)], rbp);
+ code.lea(rbp, code.ptr[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer) - 8]);
DEBUG_ASSERT(block.GetCondition() == IR::Cond::AL);
typedef void (EmitX64::*EmitHandlerFn)(EmitContext& context, IR::Inst* inst);
@@ -139,16 +144,13 @@ finish_this_inst:
}
reg_alloc.AssertNoMoreUses();
-
- if (conf.enable_cycle_counting) {
+ if (conf.enable_cycle_counting)
EmitAddCycles(block.CycleCount());
- }
+ code.mov(rbp, code.qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, abi_base_pointer)]);
EmitTerminal(block.GetTerminal(), ctx.Location().SetSingleStepping(false), ctx.IsSingleStep());
code.int3();
-
- for (auto& deferred_emit : ctx.deferred_emits) {
+ for (auto& deferred_emit : ctx.deferred_emits)
deferred_emit();
- }
code.int3();
const size_t size = size_t(code.getCurr() - entrypoint);
@@ -161,6 +163,7 @@ finish_this_inst:
auto bdesc = RegisterBlock(descriptor, entrypoint, size);
code.DisableWriting();
+ shared_labels.clear();
return bdesc;
}
diff --git a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
index dd556e36ce..d57b1d81b9 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
@@ -27,7 +27,7 @@
namespace Dynarmic::Backend::X64 {
struct A64EmitContext final : public EmitContext {
- A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block);
+ A64EmitContext(const A64::UserConfig& conf, RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels);
A64::LocationDescriptor Location() const;
bool IsSingleStep() const;
@@ -126,6 +126,7 @@ public:
ankerl::unordered_dense::map, void (*)()> write_fallbacks;
ankerl::unordered_dense::map, void (*)()> exclusive_write_fallbacks;
ankerl::unordered_dense::set do_not_fastmem;
+ boost::container::stable_vector shared_labels;
const void* terminal_handler_pop_rsb_hint = nullptr;
const void* terminal_handler_fast_dispatch_hint = nullptr;
FastDispatchEntry& (*fast_dispatch_table_lookup)(u64) = nullptr;
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
index 4e515fef2f..4ed198e09f 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.cpp
@@ -32,8 +32,11 @@ namespace Dynarmic::Backend::X64 {
using namespace Xbyak::util;
-EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block)
- : reg_alloc(reg_alloc), block(block) {}
+EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector& shared_labels)
+ : reg_alloc(reg_alloc)
+ , block(block)
+ , shared_labels(shared_labels)
+{}
EmitContext::~EmitContext() = default;
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h
index 301f4ffc89..619945e19a 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64.h
@@ -16,11 +16,12 @@
#include
#include
-#include "dynarmic/mcl/bit.hpp"
#include
-#include "dynarmic/backend/x64/xbyak.h"
+#include
#include
+#include "dynarmic/backend/x64/xbyak.h"
+#include "dynarmic/mcl/bit.hpp"
#include "dynarmic/backend/exception_handler.h"
#include "dynarmic/backend/x64/reg_alloc.h"
#include "dynarmic/common/fp/fpcr.h"
@@ -52,24 +53,23 @@ using VectorArray = std::array>
template
using HalfVectorArray = std::array / 2>;
+using SharedLabel = Xbyak::Label*;
struct EmitContext {
- EmitContext(RegAlloc& reg_alloc, IR::Block& block);
+ EmitContext(RegAlloc& reg_alloc, IR::Block& block, boost::container::stable_vector<Xbyak::Label>& shared_labels);
virtual ~EmitContext();
virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0;
virtual bool HasOptimization(OptimizationFlag flag) const = 0;
- RegAlloc& reg_alloc;
- IR::Block& block;
+ [[nodiscard]] inline Xbyak::Label* GenSharedLabel() noexcept {
+ return &shared_labels.emplace_back();
+ }
std::vector> deferred_emits;
+ RegAlloc& reg_alloc;
+ IR::Block& block;
+ boost::container::stable_vector<Xbyak::Label>& shared_labels;
};
-using SharedLabel = std::shared_ptr;
-
-inline SharedLabel GenSharedLabel() {
- return std::make_shared();
-}
-
class EmitX64 {
public:
struct BlockDescriptor {
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
index d073991fbe..6a3ab005f3 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@@ -136,7 +136,7 @@ void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) {
template
SharedLabel ProcessNaN(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm a) {
- SharedLabel nan = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel nan = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
FCODE(ucomis)(a, a);
code.jp(*nan, code.T_NEAR);
@@ -251,7 +251,7 @@ template
void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- SharedLabel end = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel();
Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
@@ -304,7 +304,7 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
- SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
code.movaps(result, op1);
if constexpr (std::is_member_function_pointer_v) {
@@ -413,7 +413,7 @@ static void EmitFPMinMax(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo
DenormalsAreZero(code, ctx, {result, operand});
- SharedLabel equal = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel equal = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
FCODE(ucomis)(result, operand);
code.jz(*equal, code.T_NEAR);
@@ -484,7 +484,7 @@ static inline void EmitFPMinMaxNumeric(BlockOfCode& code, EmitContext& ctx, IR::
}
};
- SharedLabel end = GenSharedLabel(), z = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), z = ctx.GenSharedLabel();
FCODE(ucomis)(op1, op2);
code.jz(*z, code.T_NEAR);
@@ -632,7 +632,7 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, bo
}
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
- SharedLabel fallback = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel fallback = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@@ -843,7 +843,7 @@ static void EmitFPMulX(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg64 tmp = do_default_nan ? INVALID_REG : ctx.reg_alloc.ScratchGpr(code);
- SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
if (code.HasHostFeature(HostFeature::AVX)) {
FCODE(vmuls)(result, op1, op2);
@@ -981,7 +981,7 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
}
if (code.HasHostFeature(HostFeature::FMA)) {
- SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@@ -1129,7 +1129,7 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code);
[[maybe_unused]] const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr(code).cvt32();
- SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
code.movaps(value, operand);
@@ -1296,7 +1296,7 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
}
if (code.HasHostFeature(HostFeature::FMA | HostFeature::AVX)) {
- SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
@@ -1641,7 +1641,7 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm scratch = ctx.reg_alloc.ScratchXmm(code);
if (!unsigned_) {
- SharedLabel saturate_max = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel saturate_max = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
ZeroIfNaN<64>(code, src, scratch);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
index 54fc595214..4fa14d504b 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.cpp.inc
@@ -86,7 +86,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
const auto wrapped_fn = read_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
- SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
if (fastmem_marker) {
// Use fastmem
@@ -108,7 +108,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
conf.recompile_on_fastmem_failure,
});
- EmitCheckMemoryAbort(ctx, inst, end.get());
+ EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR);
});
} else {
@@ -120,7 +120,7 @@ void AxxEmitX64::EmitMemoryRead(AxxEmitContext& ctx, IR::Inst* inst) {
ctx.deferred_emits.emplace_back([=, this, &ctx] {
code.L(*abort);
code.call(wrapped_fn);
- EmitCheckMemoryAbort(ctx, inst, end.get());
+ EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR);
});
}
@@ -173,7 +173,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
const auto wrapped_fn = write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value_idx)];
- SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
if (fastmem_marker) {
// Use fastmem
@@ -195,7 +195,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
conf.recompile_on_fastmem_failure,
});
- EmitCheckMemoryAbort(ctx, inst, end.get());
+ EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR);
});
} else {
@@ -207,7 +207,7 @@ void AxxEmitX64::EmitMemoryWrite(AxxEmitContext& ctx, IR::Inst* inst) {
ctx.deferred_emits.emplace_back([=, this, &ctx] {
code.L(*abort);
code.call(wrapped_fn);
- EmitCheckMemoryAbort(ctx, inst, end.get());
+ EmitCheckMemoryAbort(ctx, inst, end);
code.jmp(*end, code.T_NEAR);
});
}
@@ -352,7 +352,7 @@ void AxxEmitX64::EmitExclusiveReadMemoryInline(AxxEmitContext& ctx, IR::Inst* in
const auto fastmem_marker = ShouldFastmem(ctx, inst);
if (fastmem_marker) {
- SharedLabel abort = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel abort = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
bool require_abort_handling = false;
const auto src_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling);
@@ -427,7 +427,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
EmitExclusiveLock(code, conf, tmp, tmp2.cvt32());
- SharedLabel end = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel();
code.mov(status, u32(1));
code.movzx(tmp.cvt32(), code.byte[code.ABI_JIT_PTR + offsetof(AxxJitState, exclusive_state)]);
@@ -460,7 +460,7 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
const auto fastmem_marker = ShouldFastmem(ctx, inst);
if (fastmem_marker) {
- SharedLabel abort = GenSharedLabel();
+ SharedLabel abort = ctx.GenSharedLabel();
bool require_abort_handling = false;
const auto dest_ptr = EmitFastmemVAddr(code, ctx, *abort, vaddr, require_abort_handling, tmp);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
index b354efcb51..3ac078f1d7 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
@@ -54,7 +54,7 @@ void EmitDetectMisalignedVAddr(BlockOfCode& code, EmitContext& ctx, size_t bitsi
if (ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) {
const u32 page_align_mask = static_cast(page_table_const_size - 1) & ~align_mask;
- SharedLabel detect_boundary = GenSharedLabel(), resume = GenSharedLabel();
+ SharedLabel detect_boundary = ctx.GenSharedLabel(), resume = ctx.GenSharedLabel();
code.jnz(*detect_boundary, code.T_NEAR);
code.L(*resume);
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
index a0fd944041..6f53580997 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp
@@ -38,33 +38,21 @@ template
static void EmitVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
(code.*fn)(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
}
-template
-static void EmitAVXVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
-
- (code.*fn)(xmm_a, xmm_a, xmm_b);
-
- ctx.reg_alloc.DefineValue(code, inst, xmm_a);
-}
-
template
static void EmitOneArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 2 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -86,8 +74,8 @@ static void EmitOneArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 2 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -111,9 +99,9 @@ static void EmitTwoArgumentFallbackWithSaturation(BlockOfCode& code, EmitContext
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 3 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -139,9 +127,9 @@ static void EmitTwoArgumentFallbackWithSaturationAndImmediate(BlockOfCode& code,
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 2 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
const u8 arg2 = args[1].GetImmediateU8();
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -166,9 +154,9 @@ static void EmitTwoArgumentFallback(BlockOfCode& code, EmitContext& ctx, IR::Ins
const auto fn = static_cast*>(lambda);
constexpr u32 stack_space = 3 * 16;
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const arg1 = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const arg2 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
@@ -194,7 +182,7 @@ void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
// TODO: DefineValue directly on Argument for index == 0
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
if (code.HasHostFeature(HostFeature::SSE41)) {
@@ -218,7 +206,7 @@ void EmitX64::EmitVectorGetElement16(EmitContext& ctx, IR::Inst* inst) {
// TODO: DefineValue directly on Argument for index == 0
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.pextrw(dest, source, index);
ctx.reg_alloc.DefineValue(code, inst, dest);
@@ -234,10 +222,10 @@ void EmitX64::EmitVectorGetElement32(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg32 dest = ctx.reg_alloc.ScratchGpr(code).cvt32();
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
code.pextrd(dest, source, index);
} else {
- const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshufd(source, source, index);
code.movd(dest, source);
}
@@ -253,7 +241,7 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
if (index == 0) {
// TODO: DefineValue directly on Argument for index == 0
const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64();
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
code.movq(dest, source);
ctx.reg_alloc.DefineValue(code, inst, dest);
return;
@@ -262,10 +250,10 @@ void EmitX64::EmitVectorGetElement64(EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Reg64 dest = ctx.reg_alloc.ScratchGpr(code).cvt64();
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm source = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseXmm(code, args[0]);
code.pextrq(dest, source, 1);
} else {
- const Xbyak::Xmm source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.punpckhqdq(source, source);
code.movq(dest, source);
}
@@ -277,7 +265,7 @@ void EmitX64::EmitVectorSetElement8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg8 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt8();
@@ -310,7 +298,7 @@ void EmitX64::EmitVectorSetElement16(EmitContext& ctx, IR::Inst* inst) {
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Reg16 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt16();
code.pinsrw(source_vector, source_elem.cvt32(), index);
@@ -322,7 +310,7 @@ void EmitX64::EmitVectorSetElement32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg32 source_elem = ctx.reg_alloc.UseGpr(code, args[2]).cvt32();
@@ -345,7 +333,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
- const Xbyak::Xmm source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const source_vector = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]);
@@ -355,7 +343,7 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, source_vector);
} else {
const Xbyak::Reg64 source_elem = ctx.reg_alloc.UseGpr(code, args[2]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movq(tmp, source_elem);
@@ -369,72 +357,53 @@ void EmitX64::EmitVectorSetElement64(EmitContext& ctx, IR::Inst* inst) {
}
}
-static void VectorAbs8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::SSSE3)) {
- code.pabsb(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.pxor(temp, temp);
- code.psubb(temp, data);
- code.pminub(data, temp);
- }
-}
-
-static void VectorAbs16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::SSSE3)) {
- code.pabsw(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.pxor(temp, temp);
- code.psubw(temp, data);
- code.pmaxsw(data, temp);
- }
-}
-
-static void VectorAbs32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::SSSE3)) {
- code.pabsd(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.movdqa(temp, data);
- code.psrad(temp, 31);
- code.pxor(data, temp);
- code.psubd(data, temp);
- }
-}
-
-static void VectorAbs64(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& data) {
- if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- code.vpabsq(data, data);
- } else {
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
- code.pshufd(temp, data, 0b11110101);
- code.psrad(temp, 31);
- code.pxor(data, temp);
- code.psubq(data, temp);
- }
-}
-
static void EmitVectorAbs(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
switch (esize) {
case 8:
- VectorAbs8(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsb(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pxor(temp, temp);
+ code.psubb(temp, data);
+ code.pminub(data, temp);
+ }
break;
case 16:
- VectorAbs16(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsw(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pxor(temp, temp);
+ code.psubw(temp, data);
+ code.pmaxsw(data, temp);
+ }
break;
case 32:
- VectorAbs32(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsd(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(temp, data);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubd(data, temp);
+ }
break;
case 64:
- VectorAbs64(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ code.vpabsq(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pshufd(temp, data, 0b11110101);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubq(data, temp);
+ }
break;
}
-
ctx.reg_alloc.DefineValue(code, inst, data);
}
@@ -477,15 +446,15 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorAndNot(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.pandn(xmm_b, xmm_a);
ctx.reg_alloc.DefineValue(code, inst, xmm_b);
}
-static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
+static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, auto const& result, u8 shift_amount) {
if (code.HasHostFeature(HostFeature::GFNI)) {
const u64 shift_matrix = shift_amount < 8
? (0x0102040810204080 << (shift_amount * 8)) | (0x8080808080808080 >> (64 - shift_amount * 8))
@@ -494,7 +463,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const
return;
}
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpckhbw(tmp, result);
code.punpcklbw(result, result);
@@ -506,7 +475,7 @@ static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const
void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
ArithmeticShiftRightByte(ctx, code, result, shift_amount);
@@ -517,7 +486,7 @@ void EmitX64::EmitVectorArithmeticShiftRight8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psraw(result, shift_amount);
@@ -528,7 +497,7 @@ void EmitX64::EmitVectorArithmeticShiftRight16(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrad(result, shift_amount);
@@ -538,14 +507,14 @@ void EmitX64::EmitVectorArithmeticShiftRight32(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorArithmeticShiftRight64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = (std::min)(args[1].GetImmediateU8(), u8(63));
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
code.vpsraq(result, result, shift_amount);
} else {
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
const u64 sign_bit = 0x80000000'00000000u >> shift_amount;
@@ -660,12 +629,12 @@ void EmitX64::EmitVectorArithmeticVShift64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
code.vmovq(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
code.movq(a, a);
@@ -678,7 +647,7 @@ void EmitX64::EmitVectorBroadcastLower8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(a, a, 0);
@@ -687,7 +656,7 @@ void EmitX64::EmitVectorBroadcastLower16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(a, a, 0b01000100);
@@ -696,11 +665,11 @@ void EmitX64::EmitVectorBroadcastLower32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
} else {
@@ -713,7 +682,7 @@ void EmitX64::EmitVectorBroadcast8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastw(a, a);
} else {
@@ -725,7 +694,7 @@ void EmitX64::EmitVectorBroadcast16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastd(a, a);
} else {
@@ -736,7 +705,7 @@ void EmitX64::EmitVectorBroadcast32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastq(a, a);
} else {
@@ -747,7 +716,7 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 16);
@@ -758,7 +727,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
code.vpbroadcastb(a, a);
code.vmovq(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
code.movq(a, a);
@@ -771,7 +740,7 @@ void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 8);
@@ -784,7 +753,7 @@ void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 4);
@@ -800,7 +769,7 @@ void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 16);
@@ -810,7 +779,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
code.vpbroadcastb(a, a);
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pxor(tmp, tmp);
code.pshufb(a, tmp);
@@ -824,7 +793,7 @@ void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 8);
@@ -844,7 +813,7 @@ void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 4);
@@ -856,7 +825,7 @@ void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorBroadcastElement64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
ASSERT(args[1].IsImmediate());
const u8 index = args[1].GetImmediateU8();
ASSERT(index < 2);
@@ -1043,9 +1012,9 @@ void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
code.pand(lhs, tmp);
@@ -1057,11 +1026,11 @@ void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
+ auto const zero = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zero, zero);
code.pblendw(lhs, zero, 0b10101010);
@@ -1082,8 +1051,8 @@ void EmitX64::EmitVectorDeinterleaveEven16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufps(lhs, rhs, 0b10001000);
@@ -1092,8 +1061,8 @@ void EmitX64::EmitVectorDeinterleaveEven32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufpd(lhs, rhs, 0b00);
@@ -1102,16 +1071,16 @@ void EmitX64::EmitVectorDeinterleaveEven64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklbw(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0D'09'05'01'0C'08'04'00, 0x8080808080808080));
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.movdqa(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
code.pand(lhs, tmp);
@@ -1126,15 +1095,15 @@ void EmitX64::EmitVectorDeinterleaveEvenLower8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklwd(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0B0A'0302'0908'0100, 0x8080'8080'8080'8080));
} else {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.pslld(lhs, 16);
code.psrad(lhs, 16);
@@ -1152,8 +1121,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower16(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
if (code.HasHostFeature(HostFeature::SSE41)) {
// copy bytes 0:3 of rhs to lhs, zero out upper 8 bytes
@@ -1168,8 +1137,8 @@ void EmitX64::EmitVectorDeinterleaveEvenLower32(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psraw(lhs, 8);
code.psraw(rhs, 8);
@@ -1180,8 +1149,8 @@ void EmitX64::EmitVectorDeinterleaveOdd8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psrad(lhs, 16);
code.psrad(rhs, 16);
@@ -1192,8 +1161,8 @@ void EmitX64::EmitVectorDeinterleaveOdd16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufps(lhs, rhs, 0b11011101);
@@ -1202,8 +1171,8 @@ void EmitX64::EmitVectorDeinterleaveOdd32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.shufpd(lhs, rhs, 0b11);
@@ -1212,15 +1181,15 @@ void EmitX64::EmitVectorDeinterleaveOdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklbw(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0F'0B'07'03'0E'0A'06'02, 0x8080808080808080));
} else {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psraw(lhs, 8);
code.psraw(rhs, 8);
@@ -1234,15 +1203,15 @@ void EmitX64::EmitVectorDeinterleaveOddLower8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorDeinterleaveOddLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklwd(lhs, rhs);
code.pshufb(lhs, code.Const(xword, 0x0F0E'0706'0D0C'0504, 0x8080'8080'8080'8080));
} else {
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psrad(lhs, 16);
code.psrad(rhs, 16);
@@ -1258,17 +1227,17 @@ void EmitX64::EmitVectorDeinterleaveOddLower32(EmitContext& ctx, IR::Inst* inst)
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lhs = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseScratchXmm(code, args[1]);
// copy bytes 4:7 of lhs to bytes 0:3 of rhs, zero out upper 8 bytes
code.insertps(rhs, lhs, 0b01001100);
ctx.reg_alloc.DefineValue(code, inst, rhs);
} else {
- const Xbyak::Xmm lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm rhs = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
+ auto const lhs = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const rhs = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const zero = ctx.reg_alloc.ScratchXmm(code);
code.xorps(zero, zero);
code.unpcklps(lhs, rhs);
@@ -1302,9 +1271,9 @@ void EmitX64::EmitVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqd(xmm_a, xmm_b);
code.pshufd(tmp, xmm_a, 0b10110001);
@@ -1317,9 +1286,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqq(xmm_a, xmm_b);
code.pshufd(tmp, xmm_a, 0b01001110);
@@ -1327,9 +1296,9 @@ void EmitX64::EmitVectorEqual128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqd(xmm_a, xmm_b);
code.pshufd(tmp, xmm_a, 0b10110001);
@@ -1353,16 +1322,16 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.palignr(xmm_b, xmm_a, position / 8);
ctx.reg_alloc.DefineValue(code, inst, xmm_b);
return;
}
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.psrldq(xmm_a, position / 8);
code.pslldq(xmm_b, (128 - position) / 8);
@@ -1374,13 +1343,13 @@ void EmitX64::EmitVectorExtract(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorExtractLower(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 position = args[2].GetImmediateU8();
ASSERT(position % 8 == 0);
if (position != 0) {
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
code.punpcklqdq(xmm_a, xmm_b);
code.psrldq(xmm_a, position / 8);
@@ -1405,22 +1374,33 @@ void EmitX64::EmitVectorGreaterS32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorGreaterS64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE42)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pcmpgtq);
- return;
+ } else {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x80000000, 0x80000000));
+ code.pxor(tmp0, tmp2);
+ code.pxor(tmp1, tmp2);
+ code.movdqa(tmp2, tmp0);
+ code.pcmpeqd(tmp0, tmp1);
+ code.pcmpgtd(tmp2, tmp1);
+ code.pshufd(tmp1, tmp0, 245);
+ code.pshufd(tmp3, tmp2, 160);
+ code.pshufd(tmp0, tmp2, 245);
+ code.pand(tmp1, tmp3);
+ code.por(tmp0, tmp1);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- for (size_t i = 0; i < result.size(); ++i) {
- result[i] = (a[i] > b[i]) ? ~u64(0) : 0;
- }
- });
}
static void EmitVectorHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, b);
code.pand(tmp, a);
@@ -1459,9 +1439,9 @@ void EmitX64::EmitVectorHalvingAddS32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorHalvingAddUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, b);
@@ -1504,12 +1484,12 @@ void EmitX64::EmitVectorHalvingAddU32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
switch (esize) {
case 8: {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
code.pxor(a, tmp);
code.pxor(b, tmp);
@@ -1518,7 +1498,7 @@ static void EmitVectorHalvingSubSigned(size_t esize, EmitContext& ctx, IR::Inst*
break;
}
case 16: {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
code.pxor(a, tmp);
code.pxor(b, tmp);
@@ -1552,8 +1532,8 @@ void EmitX64::EmitVectorHalvingSubS32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorHalvingSubUnsigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
switch (esize) {
case 8:
@@ -1590,8 +1570,8 @@ void EmitX64::EmitVectorHalvingSubU32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorInterleaveLower(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
switch (size) {
case 8:
@@ -1630,8 +1610,8 @@ void EmitX64::EmitVectorInterleaveLower64(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorInterleaveUpper(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int size) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
switch (size) {
case 8:
@@ -1670,7 +1650,7 @@ void EmitX64::EmitVectorInterleaveUpper64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
if (shift_amount == 0) {
@@ -1696,7 +1676,7 @@ void EmitX64::EmitVectorLogicalShiftLeft8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psllw(result, shift_amount);
@@ -1707,7 +1687,7 @@ void EmitX64::EmitVectorLogicalShiftLeft16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.pslld(result, shift_amount);
@@ -1718,7 +1698,7 @@ void EmitX64::EmitVectorLogicalShiftLeft32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psllq(result, shift_amount);
@@ -1729,7 +1709,7 @@ void EmitX64::EmitVectorLogicalShiftLeft64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
if (shift_amount == 0) {
@@ -1753,7 +1733,7 @@ void EmitX64::EmitVectorLogicalShiftRight8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrlw(result, shift_amount);
@@ -1764,7 +1744,7 @@ void EmitX64::EmitVectorLogicalShiftRight16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrld(result, shift_amount);
@@ -1775,7 +1755,7 @@ void EmitX64::EmitVectorLogicalShiftRight32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const u8 shift_amount = args[1].GetImmediateU8();
code.psrlq(result, shift_amount);
@@ -1783,41 +1763,12 @@ void EmitX64::EmitVectorLogicalShiftRight64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, result);
}
-template
-static void EmitVectorLogicalVShiftAVX2(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
- static_assert(esize == 32 || esize == 64);
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
-
- // store sign bit of lowest byte of each element of b to select left/right shift later
- ICODE(vpsll)(xmm0, b, u8(esize - 8));
-
- // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
- code.vpabsb(b, b);
- code.vpand(b, b, code.BConst(xword, 0xFF));
-
- // calculate shifts
- ICODE(vpsllv)(result, a, b);
- ICODE(vpsrlv)(a, a, b);
-
- // implicit argument: xmm0 (sign of lowest byte of b)
- if (esize == 32) {
- code.blendvps(result, a);
- } else {
- code.blendvpd(result, a);
- }
- ctx.reg_alloc.DefineValue(code, inst, result);
-}
-
void EmitX64::EmitVectorLogicalVShift8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::GFNI)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Opmask negative_mask = k1;
code.pxor(tmp, tmp);
@@ -1862,10 +1813,10 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const left_shift = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa32(tmp, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
code.vpxord(right_shift, right_shift, right_shift);
@@ -1886,18 +1837,87 @@ void EmitX64::EmitVectorLogicalVShift16(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorLogicalVShift32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX2)) {
- EmitVectorLogicalVShiftAVX2<32>(code, ctx, inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
+ // store sign bit of lowest byte of each element of b to select left/right shift later
+ code.vpslld(mask, b, u8(32 - 8));
+ // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
+ code.vpabsb(b, b);
+ code.vpand(b, b, code.BConst<32>(xword, 0xFF));
+ // calculate shifts
+ code.vpsllvd(result, a, b);
+ code.vpsrlvd(a, a, b);
+ code.vblendvps(result, result, a, mask);
+ ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift);
- });
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp6 = ctx.reg_alloc.ScratchXmm(code);
+ code.pxor(tmp3, tmp3);
+ code.movdqa(tmp2, tmp0);
+ code.psubb(tmp3, tmp1);
+ code.movdqa(tmp4, tmp2);
+ code.movdqa(tmp6, tmp2);
+ code.pminub(tmp3, tmp1);
+ code.pslld(tmp1, 24);
+ code.pand(tmp3, code.Const(xword, 0x000000ff'000000ff, 0x000000ff'000000ff));
+ code.psrad(tmp1, 31);
+ code.pshuflw(tmp0, tmp3, 254);
+ code.pshuflw(tmp5, tmp3, 84);
+ code.psrld(tmp4, tmp0);
+ code.movdqa(tmp0, tmp2);
+ code.psrld(tmp0, tmp5);
+ code.punpcklqdq(tmp0, tmp4);
+ code.pshufd(tmp4, tmp3, 238);
+ code.pslld(tmp3, 23);
+ code.paddd(tmp3, code.Const(xword, 0x3F80'00003F80'0000, 0x3F80'00003F80'0000));
+ code.pshuflw(tmp5, tmp4, 254);
+ code.pshuflw(tmp4, tmp4, 84);
+ code.psrld(tmp6, tmp5);
+ code.movdqa(tmp5, tmp2);
+ code.psrld(tmp5, tmp4);
+ code.pshufd(tmp4, tmp2, 245);
+ code.punpckhqdq(tmp5, tmp6);
+ code.cvttps2dq(tmp3, tmp3);
+ code.shufps(tmp0, tmp5, 204);
+ code.pmuludq(tmp2, tmp3);
+ code.pshufd(tmp3, tmp3, 245);
+ code.andps(tmp0, tmp1);
+ code.pmuludq(tmp3, tmp4);
+ code.pshufd(tmp2, tmp2, 232);
+ code.pshufd(tmp3, tmp3, 232);
+ code.punpckldq(tmp2, tmp3);
+ code.pandn(tmp1, tmp2);
+ code.orps(tmp0, tmp1);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
void EmitX64::EmitVectorLogicalVShift64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX2)) {
- EmitVectorLogicalVShiftAVX2<64>(code, ctx, inst);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
+ // store sign bit of lowest byte of each element of b to select left/right shift later
+ code.vpsllq(mask, b, u8(64 - 8));
+ // sse/avx shifts are only positive, with dedicated left/right forms - shift by lowest byte of abs(b)
+ code.vpabsb(b, b);
+ code.vpand(b, b, code.BConst<64>(xword, 0xFF));
+ // calculate shifts
+ code.vpsllvq(result, a, b);
+ code.vpsrlvq(a, a, b);
+ code.vblendvpd(result, result, a, mask);
+ ctx.reg_alloc.DefineValue(code, inst, result);
} else {
EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
std::transform(a.begin(), a.end(), b.begin(), result.begin(), VShift);
@@ -1912,28 +1932,11 @@ enum class MinMaxOperation {
Max,
};
-// Compute the minimum/maximum of two vectors of signed 8-bit integers, using only SSE2 instructons.
-// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxS8(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- if(op == MinMaxOperation::Min) {
- code.movdqa(c, b);
- code.pcmpgtb(c, a);
- } else {
- code.movdqa(c, a);
- code.pcmpgtb(c, b);
- }
-
- code.pand(a, c);
- code.pandn(c, b);
- code.por(a, c);
-}
-
// Compute the minimum/maximum of two vectors of unsigned 16-bit integers, using only SSE2 instructons.
// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
+void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
if(op == MinMaxOperation::Min) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psubusw(c, b);
code.psubw(a, c);
@@ -1945,8 +1948,8 @@ void FallbackMinMaxU16(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a,
// Compute the minimum/maximum of two vectors of signed 32-bit integers, using only SSE2 instructons.
// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
if(op == MinMaxOperation::Min) {
code.movdqa(c, b);
code.pcmpgtd(c, a);
@@ -1962,12 +1965,12 @@ void FallbackMinMaxS32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a,
// Compute the minimum/maximum of two vectors of unsigned 32-bit integers, using only SSE2 instructons.
// The result of the operation is placed in operand a, while b is unmodified.
-void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, const Xbyak::Xmm& a, const Xbyak::Xmm& b, MinMaxOperation op) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+void FallbackMinMaxU32(BlockOfCode& code, EmitContext& ctx, auto const& a, auto const& b, MinMaxOperation op) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, code.BConst<32>(xword, 0x80000000));
// bias a and b by XORing their sign bits, then use the signed comparison function
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
if(op == MinMaxOperation::Min) {
code.movdqa(d, a);
code.pxor(d, c);
@@ -1989,11 +1992,16 @@ void EmitX64::EmitVectorMaxS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, a);
+ code.pcmpgtb(c, b);
+ code.pand(a, c);
+ code.pandn(c, b);
+ code.por(a, c);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2005,31 +2013,55 @@ void EmitX64::EmitVectorMaxS32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1);
+ code.pand(tmp0, tmp2);
+ code.pandn(tmp2, tmp1);
+ code.por(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
void EmitX64::EmitVectorMaxS64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxsq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpmaxsq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
code.vpcmpgtq(xmm0, y, x);
code.pblendvb(x, y);
-
ctx.reg_alloc.DefineValue(code, inst, x);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
- });
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000));
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp0);
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3);
+ code.pcmpgtd(tmp4, tmp3);
+ code.pshufd(tmp2, tmp2, 245);
+ code.pshufd(tmp5, tmp4, 160);
+ code.pshufd(tmp3, tmp4, 245);
+ code.pand(tmp2, tmp5);
+ code.por(tmp3, tmp2);
+ code.pand(tmp0, tmp3);
+ code.pandn(tmp3, tmp1);
+ code.por(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
@@ -2041,11 +2073,11 @@ void EmitX64::EmitVectorMaxU16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2053,35 +2085,54 @@ void EmitX64::EmitVectorMaxU32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Max);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
void EmitX64::EmitVectorMaxU64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmaxuq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpmaxuq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp, y, xmm0);
code.vpsubq(xmm0, x, xmm0);
code.vpcmpgtq(xmm0, tmp, xmm0);
code.pblendvb(x, y);
-
ctx.reg_alloc.DefineValue(code, inst, x);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::max)(x, y); });
- });
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp0);
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3);
+ code.pcmpgtd(tmp4, tmp3);
+ code.pshufd(tmp2, tmp2, 245);
+ code.pshufd(tmp5, tmp4, 160);
+ code.pshufd(tmp3, tmp4, 245);
+ code.pand(tmp2, tmp5);
+ code.por(tmp3, tmp2);
+ code.pand(tmp0, tmp3);
+ code.pandn(tmp3, tmp1);
+ code.por(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
}
@@ -2089,11 +2140,16 @@ void EmitX64::EmitVectorMinS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- FallbackMinMaxS8(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, b);
+ code.pcmpgtb(c, a);
+ code.pand(a, c);
+ code.pandn(c, b);
+ code.por(a, c);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2105,31 +2161,51 @@ void EmitX64::EmitVectorMinS32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxS32(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
void EmitX64::EmitVectorMinS64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminsq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpminsq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.vpcmpgtq(xmm0, y, x);
code.pblendvb(y, x);
-
ctx.reg_alloc.DefineValue(code, inst, y);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
- });
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000'0000, 0x8000'0000));
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp0);
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3);
+ code.pcmpgtd(tmp4, tmp3);
+ code.pshufd(tmp3, tmp2, 245);
+ code.pshufd(tmp5, tmp4, 160);
+ code.pshufd(tmp2, tmp4, 245);
+ code.pand(tmp3, tmp5);
+ code.por(tmp2, tmp3);
+ code.pand(tmp1, tmp2);
+ code.pandn(tmp2, tmp0);
+ code.por(tmp2, tmp1);
+ //code.movdqa(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
}
}
@@ -2141,11 +2217,11 @@ void EmitX64::EmitVectorMinU16(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU16(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
@@ -2153,57 +2229,93 @@ void EmitX64::EmitVectorMinU32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
FallbackMinMaxU32(code, ctx, a, b, MinMaxOperation::Min);
- ctx.reg_alloc.DefineValue(code, inst, a);
+ ctx.reg_alloc.DefineValue(code, inst, a);
}
}
void EmitX64::EmitVectorMinU64(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpminuq);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpminuq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.Const(xword, 0x8000000000000000, 0x8000000000000000));
code.vpsubq(tmp, y, xmm0);
code.vpsubq(xmm0, x, xmm0);
code.vpcmpgtq(xmm0, tmp, xmm0);
code.pblendvb(y, x);
-
ctx.reg_alloc.DefineValue(code, inst, y);
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray& result, const VectorArray& a, const VectorArray& b) {
- std::transform(a.begin(), a.end(), b.begin(), result.begin(), [](auto x, auto y) { return (std::min)(x, y); });
- });
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp5 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp0);
+ code.movdqa(tmp4, tmp2);
+ code.pcmpeqd(tmp2, tmp3);
+ code.pcmpgtd(tmp4, tmp3);
+ code.pshufd(tmp3, tmp2, 245);
+ code.pshufd(tmp5, tmp4, 160);
+ code.pshufd(tmp2, tmp4, 245);
+ code.pand(tmp3, tmp5);
+ code.por(tmp2, tmp3);
+ code.pand(tmp1, tmp2);
+ code.pandn(tmp2, tmp0);
+ code.por(tmp2, tmp1);
+ //code.movdqa(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
}
}
void EmitX64::EmitVectorMultiply8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp_a = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp_b = ctx.reg_alloc.ScratchXmm(code);
-
- // TODO: Optimize
- code.movdqa(tmp_a, a);
- code.movdqa(tmp_b, b);
- code.pmullw(a, b);
- code.psrlw(tmp_a, 8);
- code.psrlw(tmp_b, 8);
- code.pmullw(tmp_a, tmp_b);
- code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
- code.psllw(tmp_a, 8);
- code.por(a, tmp_a);
-
- ctx.reg_alloc.DefineValue(code, inst, a);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.vbroadcastss(tmp3, code.Const(dword, 0x00ff'00ff));
+ code.vpmullw(tmp2, tmp1, tmp0);
+ code.vpandn(tmp0, tmp3, tmp0);
+ code.vpand(tmp2, tmp2, tmp3);
+ code.vpmaddubsw(tmp0, tmp1, tmp0);
+ code.vpsllw(tmp0, tmp0, 8);
+ code.vpor(tmp0, tmp2, tmp0);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, tmp0);
+ code.movdqa(tmp3, tmp1);
+ code.movdqa(tmp4, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
+ code.punpckhbw(tmp2, tmp2);
+ code.punpckhbw(tmp3, tmp3);
+ code.punpcklbw(tmp0, tmp0);
+ code.punpcklbw(tmp1, tmp1);
+ code.pmullw(tmp3, tmp2);
+ code.pmullw(tmp0, tmp1);
+ code.pand(tmp3, tmp4);
+ code.pand(tmp0, tmp4);
+ code.packuswb(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ }
}
void EmitX64::EmitVectorMultiply16(EmitContext& ctx, IR::Inst* inst) {
@@ -2214,31 +2326,32 @@ void EmitX64::EmitVectorMultiply32(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pmulld);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
- code.movdqa(tmp, a);
- code.psrlq(a, 32);
- code.pmuludq(tmp, b);
- code.psrlq(b, 32);
- code.pmuludq(a, b);
- code.pshufd(tmp, tmp, 0b00001000);
- code.pshufd(b, a, 0b00001000);
- code.punpckldq(tmp, b);
-
- ctx.reg_alloc.DefineValue(code, inst, tmp);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp, a);
+ code.psrlq(a, 32);
+ code.pmuludq(tmp, b);
+ code.psrlq(b, 32);
+ code.pmuludq(a, b);
+ code.pshufd(tmp, tmp, 0b00001000);
+ code.pshufd(b, a, 0b00001000);
+ code.punpckldq(tmp, b);
+ ctx.reg_alloc.DefineValue(code, inst, tmp);
}
}
void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
- if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
- EmitAVXVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::vpmullq);
- } else if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512DQ)) {
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.vpmullq(xmm_a, xmm_a, xmm_b);
+ ctx.reg_alloc.DefineValue(code, inst, xmm_a);
+ } else if (code.HasHostFeature(HostFeature::SSE41)) {
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Reg64 tmp1 = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(code);
@@ -2253,29 +2366,28 @@ void EmitX64::EmitVectorMultiply64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
- code.movdqa(tmp1, a);
- code.movdqa(tmp2, a);
- code.movdqa(tmp3, b);
+ code.movdqa(tmp1, a);
+ code.movdqa(tmp2, a);
+ code.movdqa(tmp3, b);
- code.psrlq(tmp1, 32);
- code.psrlq(tmp3, 32);
+ code.psrlq(tmp1, 32);
+ code.psrlq(tmp3, 32);
- code.pmuludq(tmp2, b);
- code.pmuludq(tmp3, a);
- code.pmuludq(b, tmp1);
+ code.pmuludq(tmp2, b);
+ code.pmuludq(tmp3, a);
+ code.pmuludq(b, tmp1);
- code.paddq(b, tmp3);
- code.psllq(b, 32);
- code.paddq(tmp2, b);
+ code.paddq(b, tmp3);
+ code.psllq(b, 32);
+ code.paddq(tmp2, b);
- ctx.reg_alloc.DefineValue(code, inst, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
}
}
@@ -2307,15 +2419,15 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmovwb(result, a);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.pand(a, code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF));
@@ -2328,13 +2440,13 @@ void EmitX64::EmitVectorNarrow16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorNarrow32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmovdw(result, a);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pblendw(a, zeros, 0b10101010);
@@ -2352,15 +2464,15 @@ void EmitX64::EmitVectorNarrow64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmovqd(result, a);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.shufps(a, zeros, 0b00001000);
@@ -2373,13 +2485,13 @@ void EmitX64::EmitVectorNot(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const operand = ctx.reg_alloc.UseXmm(code, args[0]);
code.vpternlogq(result, operand, operand, u8(~Tern::c));
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqw(xmm_b, xmm_b);
code.pxor(xmm_a, xmm_b);
ctx.reg_alloc.DefineValue(code, inst, xmm_a);
@@ -2393,9 +2505,9 @@ void EmitX64::EmitVectorOr(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpcklqdq(xmm_a, xmm_b);
code.movdqa(tmp, xmm_a);
@@ -2411,9 +2523,9 @@ void EmitX64::EmitVectorPairedAddLower8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpcklqdq(xmm_a, xmm_b);
if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -2434,9 +2546,9 @@ void EmitX64::EmitVectorPairedAddLower16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.punpcklqdq(xmm_a, xmm_b);
if (code.HasHostFeature(HostFeature::SSSE3)) {
@@ -2456,10 +2568,10 @@ void EmitX64::EmitVectorPairedAddLower32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.movdqa(d, b);
@@ -2478,17 +2590,17 @@ void EmitX64::EmitVectorPairedAdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
code.phaddw(a, b);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.movdqa(d, b);
@@ -2508,17 +2620,17 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSSE3)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
code.phaddd(a, b);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm d = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ auto const d = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.movdqa(d, b);
@@ -2535,9 +2647,9 @@ void EmitX64::EmitVectorPairedAdd32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.punpcklqdq(a, b);
@@ -2550,8 +2662,8 @@ void EmitX64::EmitVectorPairedAdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllw(a, 8);
@@ -2565,8 +2677,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden8(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.pslld(a, 16);
@@ -2580,18 +2692,18 @@ void EmitX64::EmitVectorPairedAddSignedWiden16(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.vpsraq(c, a, 32);
code.vpsllq(a, a, 32);
code.vpsraq(a, a, 32);
code.vpaddq(a, a, c);
} else {
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllq(a, 32);
@@ -2613,8 +2725,8 @@ void EmitX64::EmitVectorPairedAddSignedWiden32(EmitContext& ctx, IR::Inst* inst)
void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllw(a, 8);
@@ -2628,8 +2740,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden8(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.pslld(a, 16);
@@ -2643,8 +2755,8 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden16(EmitContext& ctx, IR::Inst* ins
void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm c = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(c, a);
code.psllq(a, 32);
@@ -2658,14 +2770,10 @@ void EmitX64::EmitVectorPairedAddUnsignedWiden32(EmitContext& ctx, IR::Inst* ins
template
static void PairedOperation(VectorArray& result, const VectorArray& x, const VectorArray& y, Function fn) {
const size_t range = x.size() / 2;
-
- for (size_t i = 0; i < range; i++) {
+ for (size_t i = 0; i < range; i++)
result[i] = fn(x[2 * i], x[2 * i + 1]);
- }
-
- for (size_t i = 0; i < range; i++) {
+ for (size_t i = 0; i < range; i++)
result[range + i] = fn(y[2 * i], y[2 * i + 1]);
- }
}
template
@@ -2686,11 +2794,6 @@ static void PairedMax(VectorArray& result, const VectorArray& x, const Vec
PairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
}
-template<typename T>
-static void PairedMin(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
- PairedOperation(result, x, y, [](auto a, auto b) { return (std::min)(a, b); });
-}
-
template<typename T>
static void LowerPairedMax(VectorArray<T>& result, const VectorArray<T>& x, const VectorArray<T>& y) {
LowerPairedOperation(result, x, y, [](auto a, auto b) { return (std::max)(a, b); });
@@ -2705,19 +2808,16 @@ template
static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
code.pshufb(x, tmp);
code.pshufb(y, tmp);
-
code.movaps(tmp, x);
code.shufps(tmp, y, 0b01'00'01'00);
-
code.shufps(x, y, 0b11'10'11'10);
-
if constexpr (std::is_member_function_pointer_v<Function>) {
(code.*fn)(x, tmp);
} else {
@@ -2730,21 +2830,17 @@ static void EmitVectorPairedMinMax8(BlockOfCode& code, EmitContext& ctx, IR::Ins
template<typename Function>
static void EmitVectorPairedMinMaxLower8(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.punpcklqdq(x, y);
code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
code.movhlps(y, x);
code.movq(x, x);
-
if constexpr (std::is_member_function_pointer_v<Function>) {
(code.*fn)(x, y);
} else {
fn(x, y);
}
-
ctx.reg_alloc.DefineValue(code, inst, x);
}
@@ -2752,9 +2848,9 @@ template
static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
// swap idxs 1 and 2 within 64-bit lanes so that both registers contain [even, odd, even, odd]-indexed pairs of elements
code.pshuflw(x, x, 0b11'01'10'00);
@@ -2780,63 +2876,31 @@ static void EmitVectorPairedMinMax16(BlockOfCode& code, EmitContext& ctx, IR::In
ctx.reg_alloc.DefineValue(code, inst, x);
}
-template<typename Function>
-static void EmitVectorPairedMinMaxLower16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
- // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
- code.pshuflw(x, x, 0b11'01'10'00);
- code.pshuflw(y, y, 0b11'01'10'00);
-
- // move pairs of even/odd-indexed elements into one register each
-
- // tmp = x[0, 2], y[0, 2], 0s...
- code.movaps(tmp, y);
- code.insertps(tmp, x, 0b01001100);
- // x = x[1, 3], y[1, 3], 0s...
- code.insertps(x, y, 0b00011100);
-
- (code.*fn)(x, tmp);
-
- ctx.reg_alloc.DefineValue(code, inst, x);
-}
-
-static void EmitVectorPairedMinMaxLower32(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Xmm&, const Xbyak::Operand&)) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
-
- // tmp = x[1], y[1], 0, 0
- code.movaps(tmp, y);
- code.insertps(tmp, x, 0b01001100);
- // x = x[0], y[0], 0, 0
- code.insertps(x, y, 0b00011100);
-
- (code.*fn)(x, tmp);
-
- ctx.reg_alloc.DefineValue(code, inst, x);
-}
void EmitX64::EmitVectorPairedMaxS8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+ code.pshufb(x, tmp);
+ code.pshufb(y, tmp);
+ code.movaps(tmp, x);
+ code.shufps(tmp, y, 0b01'00'01'00);
+ code.shufps(x, y, 0b11'10'11'10);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
- return;
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max);
- });
- return;
+ code.pmaxsb(x, tmp);
+ } else {
+ auto const a = x;
+ auto const b = tmp;
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, a);
+ code.pcmpgtb(c, b);
+ code.pand(a, c);
+ code.pandn(c, b);
+ code.por(a, c);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
- PairedMax(result, a, b);
- });
+ ctx.reg_alloc.DefineValue(code, inst, x);
}
void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) {
@@ -2846,9 +2910,9 @@ void EmitX64::EmitVectorPairedMaxS16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2866,12 +2930,24 @@ void EmitX64::EmitVectorPairedMaxS32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMaxU8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxub);
- return;
+ } else {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);
+ code.movdqa(tmp2, constant_00ff);
+ code.movdqa(tmp3, tmp1);
+ code.pand(tmp3, tmp2);
+ code.pand(tmp2, tmp0);
+ code.packuswb(tmp2, tmp3);
+ code.psrlw(tmp1, 8);
+ code.psrlw(tmp0, 8);
+ code.packuswb(tmp0, tmp1);
+ code.pmaxub(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
- PairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) {
@@ -2887,9 +2963,9 @@ void EmitX64::EmitVectorPairedMaxU16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2907,14 +2983,15 @@ void EmitX64::EmitVectorPairedMaxU32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min);
- });
} else {
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
- PairedMin(result, a, b);
- });
+ EmitVectorPairedMinMax8(code, ctx, inst, [&](const auto& a, const auto& b) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, b);
+ code.pcmpgtb(c, a);
+ code.pand(a, c);
+ code.pandn(c, b);
+ code.por(a, c);
+ });
}
}
@@ -2925,9 +3002,9 @@ void EmitX64::EmitVectorPairedMinS16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2943,12 +3020,25 @@ void EmitX64::EmitVectorPairedMinS32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinU8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
EmitVectorPairedMinMax8(code, ctx, inst, &Xbyak::CodeGenerator::pminub);
- return;
+ } else {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const constant_00ff = code.Const(xword, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);
+ code.movdqa(tmp2, tmp1);
+ code.psrlw(tmp2, 8);
+ code.movdqa(tmp3, tmp0);
+ code.psrlw(tmp3, 8);
+ code.packuswb(tmp3, tmp2);
+ code.movdqa(tmp2, constant_00ff);
+ code.pand(tmp1, tmp2);
+ code.pand(tmp0, tmp2);
+ code.packuswb(tmp0, tmp1);
+ code.pminub(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u8>& result, const VectorArray<u8>& a, const VectorArray<u8>& b) {
- PairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) {
@@ -2964,9 +3054,9 @@ void EmitX64::EmitVectorPairedMinU16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, x);
code.shufps(tmp, y, 0b10001000);
@@ -2982,41 +3072,88 @@ void EmitX64::EmitVectorPairedMinU32(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorPairedMaxLowerS8(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsb);
- return;
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Max);
- });
- return;
+ code.punpcklqdq(x, y);
+ code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+ code.movhlps(y, x);
+ code.movq(x, x);
+ code.pmaxsb(x, y);
+ } else {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.punpcklqdq(x, y);
+ code.pshufb(x, code.Const(xword, 0x0E'0C'0A'08'06'04'02'00, 0x0F'0D'0B'09'07'05'03'01));
+ code.movhlps(y, x);
+ code.movq(x, x);
+ code.movdqa(c, x);
+ code.pcmpgtb(c, y);
+ code.pand(x, c);
+ code.pandn(c, y);
+ code.por(x, c);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
- LowerPairedMax(result, a, b);
- });
+ ctx.reg_alloc.DefineValue(code, inst, x);
}
void EmitX64::EmitVectorPairedMaxLowerS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // move pairs of even/odd-indexed elements into one register each
+ // tmp = x[0, 2], y[0, 2], 0s...
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[1, 3], y[1, 3], 0s...
+ code.insertps(x, y, 0b00011100);
+ code.pmaxsw(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ code.punpcklwd(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 232);
+ code.pshuflw(tmp1, tmp1, 216);
+ code.pshufd(tmp0, tmp0, 231);
+ code.pshuflw(tmp0, tmp0, 114);
+ code.pmaxsw(tmp0, tmp1);
+ code.movq(tmp0, tmp0);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxLowerS32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxsd);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pmaxsd(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 238);
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1);
+ code.pand(tmp0, tmp2);
+ code.pandn(tmp2, tmp1);
+ code.por(tmp2, tmp0);
+ code.movq(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) {
@@ -3031,63 +3168,143 @@ void EmitX64::EmitVectorPairedMaxLowerU8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorPairedMaxLowerU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pmaxuw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // move pairs of even/odd-indexed elements into one register each
+ // tmp = x[0, 2], y[0, 2], 0s...
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[1, 3], y[1, 3], 0s...
+ code.insertps(x, y, 0b00011100);
+ code.pmaxuw(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ code.punpcklwd(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 232);
+ code.pshuflw(tmp1, tmp1, 216);
+ code.pshufd(tmp0, tmp0, 231);
+ code.pshuflw(tmp0, tmp0, 114);
+ code.psubusw(tmp0, tmp1);
+ code.paddw(tmp0, tmp1);
+ code.movq(tmp0, tmp0);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMaxLowerU32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pmaxud);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pmaxud(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 238);
+ code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000));
+ code.movdqa(tmp3, tmp0);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp1);
+ code.pcmpgtd(tmp3, tmp2);
+ code.pand(tmp0, tmp3);
+ code.pandn(tmp3, tmp1);
+ code.por(tmp3, tmp0);
+ code.movq(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
- LowerPairedMax(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerS8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
EmitVectorPairedMinMaxLower8(code, ctx, inst, &Xbyak::CodeGenerator::pminsb);
- return;
- } else if (code.HasHostFeature(HostFeature::SSSE3)) {
- EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& lhs, const auto& rhs) {
- FallbackMinMaxS8(code, ctx, lhs, rhs, MinMaxOperation::Min);
+ } else {
+ EmitVectorPairedMinMaxLower8(code, ctx, inst, [&](const auto& a, const auto& b) {
+ auto const c = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(c, b);
+ code.pcmpgtb(c, a);
+ code.pand(a, c);
+ code.pandn(c, b);
+ code.por(a, c);
});
- return;
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s8>& result, const VectorArray<s8>& a, const VectorArray<s8>& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerS16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminsw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // move pairs of even/odd-indexed elements into one register each
+ // tmp = x[0, 2], y[0, 2], 0s...
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[1, 3], y[1, 3], 0s...
+ code.insertps(x, y, 0b00011100);
+ code.pminsw(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ code.punpcklwd(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 231);
+ code.pshuflw(tmp1, tmp1, 114);
+ code.pshufd(tmp0, tmp0, 232);
+ code.pshuflw(tmp0, tmp0, 216);
+ code.pminsw(tmp0, tmp1);
+ code.movq(tmp0, tmp0);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s16>& result, const VectorArray<s16>& a, const VectorArray<s16>& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerS32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminsd);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pminsd(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 238);
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1);
+ code.pand(tmp1, tmp2);
+ code.pandn(tmp2, tmp0);
+ code.por(tmp2, tmp1);
+ code.movq(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<s32>& result, const VectorArray<s32>& a, const VectorArray<s32>& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) {
@@ -3102,50 +3319,91 @@ void EmitX64::EmitVectorPairedMinLowerU8(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitVectorPairedMinLowerU16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower16(code, ctx, inst, &Xbyak::CodeGenerator::pminuw);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // swap idxs 1 and 2 so that both registers contain even then odd-indexed pairs of elements
+ code.pshuflw(x, x, 0b11'01'10'00);
+ code.pshuflw(y, y, 0b11'01'10'00);
+ // move pairs of even/odd-indexed elements into one register each
+ // tmp = x[0, 2], y[0, 2], 0s...
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[1, 3], y[1, 3], 0s...
+ code.insertps(x, y, 0b00011100);
+ code.pminuw(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpcklwd(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 231);
+ code.pshuflw(tmp1, tmp1, 114);
+ code.pshufd(tmp0, tmp0, 232);
+ code.pshuflw(tmp0, tmp0, 216);
+ code.movdqa(tmp2, tmp1);
+ code.psubusw(tmp2, tmp0);
+ code.psubw(tmp1, tmp2);
+ code.movq(tmp0, tmp1);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u16>& result, const VectorArray<u16>& a, const VectorArray<u16>& b) {
- LowerPairedMin(result, a, b);
- });
}
void EmitX64::EmitVectorPairedMinLowerU32(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- EmitVectorPairedMinMaxLower32(code, ctx, inst, &Xbyak::CodeGenerator::pminud);
- return;
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ // tmp = x[1], y[1], 0, 0
+ code.movaps(tmp, y);
+ code.insertps(tmp, x, 0b01001100);
+ // x = x[0], y[0], 0, 0
+ code.insertps(x, y, 0b00011100);
+ code.pminud(x, tmp);
+ ctx.reg_alloc.DefineValue(code, inst, x);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.punpckldq(tmp0, tmp1);
+ code.pshufd(tmp1, tmp0, 238);
+ code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000));
+ code.movdqa(tmp3, tmp0);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp1);
+ code.pcmpgtd(tmp3, tmp2);
+ code.pand(tmp1, tmp3);
+ code.pandn(tmp3, tmp0);
+ code.por(tmp3, tmp1);
+ code.movq(tmp0, tmp3);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
}
-
- EmitTwoArgumentFallback(code, ctx, inst, [](VectorArray<u32>& result, const VectorArray<u32>& a, const VectorArray<u32>& b) {
- LowerPairedMin(result, a, b);
- });
}
template<typename D, typename T>
static D PolynomialMultiply(T lhs, T rhs) {
constexpr size_t bit_size = mcl::bitsizeof<T>;
const std::bitset<bit_size> operand(lhs);
-
D res = 0;
- for (size_t i = 0; i < bit_size; i++) {
- if (operand[i]) {
+ for (size_t i = 0; i < bit_size; i++)
+ if (operand[i])
res ^= rhs << i;
- }
- }
-
return res;
}
void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const alternate = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32();
Xbyak::Label loop;
@@ -3183,11 +3441,11 @@ void EmitX64::EmitVectorPolynomialMultiply8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm alternate = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const alternate = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg32 counter = ctx.reg_alloc.ScratchGpr(code).cvt32();
Xbyak::Label loop;
@@ -3229,8 +3487,8 @@ void EmitX64::EmitVectorPolynomialMultiplyLong8(EmitContext& ctx, IR::Inst* inst
void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::PCLMULQDQ)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const xmm_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const xmm_b = ctx.reg_alloc.UseXmm(code, args[1]);
code.pclmulqdq(xmm_a, xmm_b, 0x00);
@@ -3260,7 +3518,7 @@ void EmitX64::EmitVectorPolynomialMultiplyLong64(EmitContext& ctx, IR::Inst* ins
void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::AVX512VL | HostFeature::AVX512BITALG)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpopcntb(data, data);
@@ -3271,10 +3529,10 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm high_a = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const low_a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const high_a = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(high_a, low_a);
code.psrlw(high_a, 4);
@@ -3303,12 +3561,12 @@ void EmitX64::EmitVectorPopulationCount(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::GFNI)) {
code.gf2p8affineqb(data, code.Const(xword, 0x8040201008040201, 0x8040201008040201), 0);
} else {
- const Xbyak::Xmm high_nibble_reg = ctx.reg_alloc.ScratchXmm(code);
+ auto const high_nibble_reg = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(high_nibble_reg, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
code.pand(high_nibble_reg, data);
code.pxor(data, high_nibble_reg);
@@ -3316,7 +3574,7 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSSE3)) {
// High lookup
- const Xbyak::Xmm high_reversed_reg = ctx.reg_alloc.ScratchXmm(code);
+ auto const high_reversed_reg = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(high_reversed_reg, code.Const(xword, 0xE060A020C0408000, 0xF070B030D0509010));
code.pshufb(high_reversed_reg, data);
@@ -3350,8 +3608,8 @@ void EmitX64::EmitVectorReverseBits(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, data);
code.psllw(tmp, 8);
@@ -3363,13 +3621,13 @@ void EmitX64::EmitVectorReverseElementsInHalfGroups8(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpshufb(data, data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b));
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pshufb(data, code.Const(xword, 0x0405060700010203, 0x0c0d0e0f08090a0b));
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, data);
code.psllw(tmp, 8);
code.psrlw(data, 8);
@@ -3382,7 +3640,7 @@ void EmitX64::EmitVectorReverseElementsInWordGroups8(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(data, data, 0b10110001);
code.pshufhw(data, data, 0b10110001);
ctx.reg_alloc.DefineValue(code, inst, data);
@@ -3390,13 +3648,13 @@ void EmitX64::EmitVectorReverseElementsInWordGroups16(EmitContext& ctx, IR::Inst
void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpshufb(data, data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f));
} else if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pshufb(data, code.Const(xword, 0x0001020304050607, 0x08090a0b0c0d0e0f));
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp, data);
code.psllw(tmp, 8);
code.psrlw(data, 8);
@@ -3410,7 +3668,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups8(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(data, data, 0b00011011);
code.pshufhw(data, data, 0b00011011);
@@ -3421,7 +3679,7 @@ void EmitX64::EmitVectorReverseElementsInLongGroups16(EmitContext& ctx, IR::Inst
void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pshuflw(data, data, 0b01001110);
code.pshufhw(data, data, 0b01001110);
@@ -3432,8 +3690,8 @@ void EmitX64::EmitVectorReverseElementsInLongGroups32(EmitContext& ctx, IR::Inst
void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
// Add upper elements to lower elements
code.pshufd(temp, data, 0b01'00'11'10);
@@ -3453,8 +3711,8 @@ void EmitX64::EmitVectorReduceAdd8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
if (code.HasHostFeature(HostFeature::SSSE3)) {
code.pxor(temp, temp);
@@ -3484,8 +3742,8 @@ void EmitX64::EmitVectorReduceAdd16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
// Add upper elements to lower elements(reversed)
code.pshufd(temp, data, 0b00'01'10'11);
@@ -3508,8 +3766,8 @@ void EmitX64::EmitVectorReduceAdd32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm temp = xmm0;
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const temp = xmm0;
// Add upper elements to lower elements
code.pshufd(temp, data, 0b01'00'11'10);
@@ -3524,8 +3782,8 @@ void EmitX64::EmitVectorReduceAdd64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const operand = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
const u8 shift_amount = args[1].GetImmediateU8();
ASSERT(shift_amount % 32 == 0);
const u8 shuffle_imm = std::rotr(0b11100100, shift_amount / 32 * 2);
@@ -3538,12 +3796,12 @@ void EmitX64::EmitVectorRotateWholeVectorRight(EmitContext& ctx, IR::Inst* inst)
static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
switch (esize) {
case 8: {
- const Xbyak::Xmm vec_128 = ctx.reg_alloc.ScratchXmm(code);
+ auto const vec_128 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(vec_128, code.Const(xword, 0x8080808080808080, 0x8080808080808080));
code.paddb(a, vec_128);
@@ -3553,7 +3811,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
break;
}
case 16: {
- const Xbyak::Xmm vec_32768 = ctx.reg_alloc.ScratchXmm(code);
+ auto const vec_32768 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(vec_32768, code.Const(xword, 0x8000800080008000, 0x8000800080008000));
code.paddw(a, vec_32768);
@@ -3563,7 +3821,7 @@ static void EmitVectorRoundingHalvingAddSigned(size_t esize, EmitContext& ctx, I
break;
}
case 32: {
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp1, a);
code.por(a, b);
@@ -3603,9 +3861,9 @@ static void EmitVectorRoundingHalvingAddUnsigned(size_t esize, EmitContext& ctx,
case 32: {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(tmp1, a);
@@ -3669,18 +3927,18 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
static_assert(esize == 32 || esize == 64);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm b = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const b = ctx.reg_alloc.UseXmm(code, args[1]);
// positive values of b are left shifts, while negative values are (positive) rounding right shifts
// only the lowest byte of each element is read as the shift amount
// conveniently, the behavior of bit shifts greater than element width is the same in NEON and SSE/AVX - filled with zeros
- const Xbyak::Xmm shift_amount = ctx.reg_alloc.ScratchXmm(code);
+ auto const shift_amount = ctx.reg_alloc.ScratchXmm(code);
code.vpabsb(shift_amount, b);
code.vpand(shift_amount, shift_amount, code.BConst(xword, 0xFF));
// if b is positive, do a normal left shift
- const Xbyak::Xmm left_shift = ctx.reg_alloc.ScratchXmm(code);
+ auto const left_shift = ctx.reg_alloc.ScratchXmm(code);
ICODE(vpsllv)(left_shift, a, shift_amount);
// if b is negative, compute the rounding right shift
@@ -3691,7 +3949,7 @@ static void EmitUnsignedRoundingShiftLeft(BlockOfCode& code, EmitContext& ctx, I
// tmp = (a >> (b - 1)) & 1
// res = (a >> b) + tmp
// to add the value of the last bit to be shifted off to the result of the right shift
- const Xbyak::Xmm right_shift = ctx.reg_alloc.ScratchXmm(code);
+ auto const right_shift = ctx.reg_alloc.ScratchXmm(code);
code.vmovdqa(xmm0, code.BConst(xword, 1));
// find value of last bit to be shifted off
@@ -3775,12 +4033,12 @@ void EmitX64::EmitVectorRoundingShiftLeftU64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pmovsxbw(a, a);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.pxor(result, result);
code.punpcklbw(result, a);
code.psraw(result, 8);
@@ -3791,12 +4049,12 @@ void EmitX64::EmitVectorSignExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.pmovsxwd(a, a);
ctx.reg_alloc.DefineValue(code, inst, a);
} else {
- const Xbyak::Xmm a = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.pxor(result, result);
code.punpcklwd(result, a);
code.psrad(result, 16);
@@ -3806,12 +4064,12 @@ void EmitX64::EmitVectorSignExtend16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovsxdq(a, a);
} else {
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
code.movaps(tmp, a);
code.psrad(tmp, 31);
@@ -3824,7 +4082,7 @@ void EmitX64::EmitVectorSignExtend32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Reg64 gpr_tmp = ctx.reg_alloc.ScratchGpr(code);
code.movq(gpr_tmp, data);
@@ -3833,7 +4091,7 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pinsrq(data, gpr_tmp, 1);
} else {
- const Xbyak::Xmm xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const xmm_tmp = ctx.reg_alloc.ScratchXmm(code);
code.movq(xmm_tmp, gpr_tmp);
code.punpcklqdq(data, xmm_tmp);
@@ -3844,9 +4102,9 @@ void EmitX64::EmitVectorSignExtend64(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorSignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
// only signed 16-bit min/max are available below SSE4.1
if (code.HasHostFeature(HostFeature::SSE41) || esize == 16) {
@@ -3912,11 +4170,11 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
if (upper_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhw(result, x, y);
} else {
@@ -3928,7 +4186,7 @@ void EmitX64::EmitVectorSignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
}
if (lower_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmullw(result, x, y);
} else {
@@ -3946,9 +4204,9 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(result, x, y);
@@ -3957,16 +4215,16 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (lower_inst) {
- const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(lower_result, x, y);
ctx.reg_alloc.DefineValue(code, lower_inst, lower_result);
}
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmuldq(result, x, y);
code.vpsrlq(x, x, 32);
@@ -3978,12 +4236,12 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
return;
}
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm upper_result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const sign_correction = ctx.reg_alloc.ScratchXmm(code);
+ auto const upper_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
// calculate sign correction
code.movdqa(tmp, x);
@@ -4026,7 +4284,7 @@ void EmitX64::EmitVectorSignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const data = ctx.reg_alloc.UseScratchXmm(code, args[0]);
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32();
// SSE absolute value functions return an unsigned result
@@ -4038,21 +4296,34 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
// or shift in sign bits to create a mask of (msb == 1 ? -1 : 0), then add to the result vector
switch (esize) {
case 8: {
- VectorAbs8(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsb(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pxor(temp, temp);
+ code.psubb(temp, data);
+ code.pminub(data, temp);
+ }
code.pmovmskb(bit, data);
-
code.pminub(data, code.BConst<8>(xword, 0x7F));
break;
}
case 16: {
- VectorAbs16(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsw(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pxor(temp, temp);
+ code.psubw(temp, data);
+ code.pmaxsw(data, temp);
+ }
code.pmovmskb(bit, data);
code.and_(bit, 0xAAAA); // toggle mask bits that aren't the msb of an int16 to 0
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pminuw(data, code.BConst<16>(xword, 0x7FFF));
} else {
- const Xbyak::Xmm tmp = xmm0;
+ auto const tmp = xmm0;
code.movdqa(tmp, data);
code.psraw(data, 15);
code.paddw(data, tmp);
@@ -4060,13 +4331,21 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
break;
}
case 32: {
- VectorAbs32(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::SSSE3)) {
+ code.pabsd(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(temp, data);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubd(data, temp);
+ }
code.movmskps(bit, data);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pminud(data, code.BConst<32>(xword, 0x7FFFFFFF));
} else {
- const Xbyak::Xmm tmp = xmm0;
+ auto const tmp = xmm0;
code.movdqa(tmp, data);
code.psrad(data, 31);
code.paddd(data, tmp);
@@ -4074,10 +4353,18 @@ static void EmitVectorSignedSaturatedAbs(size_t esize, BlockOfCode& code, EmitCo
break;
}
case 64: {
- VectorAbs64(code, ctx, data);
+ if (code.HasHostFeature(HostFeature::AVX512_Ortho)) {
+ code.vpabsq(data, data);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ code.pshufd(temp, data, 0b11110101);
+ code.psrad(temp, 31);
+ code.pxor(data, temp);
+ code.psubq(data, temp);
+ }
code.movmskpd(bit, data);
- const Xbyak::Xmm tmp = xmm0;
+ auto const tmp = xmm0;
if (code.HasHostFeature(HostFeature::SSE42)) {
// create a -1 mask if msb is set
code.pxor(tmp, tmp);
@@ -4119,13 +4406,13 @@ template<size_t bit_width>
static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
code.movdqa(xmm0, y);
ctx.reg_alloc.Release(y);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
switch (bit_width) {
case 8:
@@ -4182,7 +4469,7 @@ static void EmitVectorSignedSaturatedAccumulateUnsigned(BlockOfCode& code, EmitC
switch (bit_width) {
case 8:
if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
code.pcmpeqb(tmp2, tmp2);
code.pxor(tmp, tmp);
code.vpblendvb(xmm0, tmp, tmp2, xmm0);
@@ -4262,10 +4549,10 @@ void EmitX64::EmitVectorSignedSaturatedAccumulateUnsigned64(EmitContext& ctx, IR
template<bool is_rounding>
static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm upper_tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm lower_tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const upper_tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_tmp = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhw(upper_tmp, x, y);
@@ -4284,7 +4571,7 @@ static void EmitVectorSignedSaturatedDoublingMultiply16(BlockOfCode& code, EmitC
ctx.reg_alloc.Release(x);
ctx.reg_alloc.Release(y);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
if constexpr (is_rounding) {
@@ -4334,10 +4621,10 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm odds = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm even = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const odds = ctx.reg_alloc.ScratchXmm(code);
+ auto const even = ctx.reg_alloc.ScratchXmm(code);
code.vpmuldq(odds, x, y);
code.vpsrlq(x, x, 32);
@@ -4350,7 +4637,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
code.vpaddq(odds, odds, odds);
code.vpaddq(even, even, even);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if constexpr (is_rounding) {
code.vmovdqa(result, code.Const(xword, 0x0000000080000000, 0x0000000080000000));
@@ -4361,7 +4648,7 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
code.vpsrlq(result, odds, 32);
code.vblendps(result, result, even, 0b1010);
- const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
+ auto const mask = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Reg32 bit = ctx.reg_alloc.ScratchGpr(code).cvt32();
code.vpcmpeqd(mask, result, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
@@ -4376,11 +4663,11 @@ void EmitVectorSignedSaturatedDoublingMultiply32(BlockOfCode& code, EmitContext&
return;
}
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm sign_correction = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const sign_correction = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
// calculate sign correction
code.movdqa(tmp, x);
@@ -4439,8 +4726,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyHighRounding32(EmitContex
void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.punpcklwd(x, x);
code.punpcklwd(y, y);
@@ -4465,8 +4752,8 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong16(EmitContext& ctx,
void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmovsxdq(x, x);
@@ -4517,10 +4804,10 @@ void EmitX64::EmitVectorSignedSaturatedDoublingMultiplyLong32(EmitContext& ctx,
static void EmitVectorSignedSaturatedNarrowToSigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm sign = ctx.reg_alloc.ScratchXmm(code);
+ auto const src = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const dest = ctx.reg_alloc.ScratchXmm(code);
+ auto const reconstructed = ctx.reg_alloc.ScratchXmm(code);
+ auto const sign = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(dest, src);
code.pxor(xmm0, xmm0);
@@ -4577,9 +4864,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToSigned64(EmitContext& ctx, IR::In
static void EmitVectorSignedSaturatedNarrowToUnsigned(size_t original_esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm src = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm dest = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm reconstructed = ctx.reg_alloc.ScratchXmm(code);
+ auto const src = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const dest = ctx.reg_alloc.ScratchXmm(code);
+ auto const reconstructed = ctx.reg_alloc.ScratchXmm(code);
code.movdqa(dest, src);
code.pxor(xmm0, xmm0);
@@ -4647,9 +4934,9 @@ void EmitX64::EmitVectorSignedSaturatedNarrowToUnsigned64(EmitContext& ctx, IR::
static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm data = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm zero = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const data = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const zero = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Address mask = [esize, &code] {
switch (esize) {
case 8:
@@ -4665,7 +4952,7 @@ static void EmitVectorSignedSaturatedNeg(size_t esize, BlockOfCode& code, EmitCo
}
}();
- const auto vector_equality = [esize, &code](const Xbyak::Xmm& x, const auto& y) {
+ const auto vector_equality = [esize, &code](auto const& x, const auto& y) {
switch (esize) {
case 8:
code.pcmpeqb(x, y);
@@ -4810,33 +5097,23 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst* i
EmitTwoArgumentFallbackWithSaturation(code, ctx, inst, VectorSignedSaturatedShiftLeft);
}
-template<typename T, typename U = std::make_unsigned_t<T>>
+template<typename T>
static bool VectorSignedSaturatedShiftLeftUnsigned(VectorArray<T>& dst, const VectorArray<T>& data, u8 shift_amount) {
+ using U = std::make_unsigned_t<T>;
static_assert(std::is_signed_v<T>, "T must be signed.");
-
bool qc_flag = false;
for (size_t i = 0; i < dst.size(); i++) {
- const T element = data[i];
- const T shift = static_cast<T>(shift_amount);
-
- if (element == 0) {
- dst[i] = 0;
- } else if (element < 0) {
- dst[i] = 0;
- qc_flag = true;
- } else {
- const U shifted = static_cast<U>(element) << static_cast<U>(shift);
- const U shifted_test = shifted >> static_cast<U>(shift);
-
- if (shifted_test != static_cast<U>(element)) {
- dst[i] = static_cast<T>((std::numeric_limits<U>::max)());
- qc_flag = true;
- } else {
- dst[i] = shifted;
- }
- }
+ auto const element = data[i];
+ auto const shifted = U(element) << U(T(shift_amount));
+ auto const shifted_test = shifted >> U(T(shift_amount));
+ T result = 0;
+ if (element > 0 && shifted_test != U(element))
+ result = T((std::numeric_limits<U>::max)());
+ if (element > 0 && shifted_test == U(element))
+ result = shifted;
+ qc_flag |= element < 0 || (element > 0 && shifted_test != U(element));
+ dst[i] = result;
}
-
return qc_flag;
}
@@ -4849,7 +5126,97 @@ void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned16(EmitContext& ctx, IR:
}
void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned32(EmitContext& ctx, IR::Inst* inst) {
- EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ auto const imm8 = args[1].GetImmediateU8();
+ if (code.HasHostFeature(HostFeature::AVX2)) {
+ auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ if (imm8 == 0) {
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpshufd(tmp1, tmp0, 85);
+ code.vpshufd(tmp2, tmp0, 238);
+ code.vpor(tmp1, tmp1, tmp2);
+ code.vpshufd(tmp2, tmp0, 255);
+ code.vpor(tmp2, tmp2, tmp0);
+ code.vpor(tmp1, tmp1, tmp2);
+ code.vmovd(tmp_flag.cvt32(), tmp1);
+ code.shr(tmp_flag.cvt32(), 31);
+ code.vpxor(tmp1, tmp1, tmp1);
+ code.vpmaxsd(tmp0, tmp0, tmp1);
+ } else {
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp4 = ctx.reg_alloc.ScratchXmm(code);
+ auto const cmp_value = u32(1ULL << 31) >> (imm8 - 1);
+ code.vpshufd(tmp1, tmp0, 238);
+ code.vpor(tmp1, tmp1, tmp0);
+ code.vpshufd(tmp2, tmp1, 85);
+ code.vpor(tmp1, tmp1, tmp2);
+ code.vmovd(tmp_flag.cvt32(), tmp1);
+ code.cmp(tmp_flag.cvt32(), cmp_value);
+ code.vpslld(tmp1, tmp0, imm8);
+ code.vpbroadcastd(tmp2, code.Const(dword, cmp_value - 2));
+ code.vpbroadcastd(tmp3, code.Const(dword, cmp_value - 1));
+ code.vpcmpgtd(tmp3, tmp0, tmp3);
+ code.vpcmpeqd(tmp4, tmp4, tmp4);
+ code.vpaddd(tmp0, tmp0, tmp4);
+ code.vpminud(tmp2, tmp0, tmp2);
+ code.vpcmpeqd(tmp0, tmp0, tmp2);
+ code.vblendvps(tmp0, tmp3, tmp1, tmp0);
+ code.setae(tmp_flag.cvt8());
+ }
+ code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8());
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else {
+ auto const tmp_flag = ctx.reg_alloc.ScratchGpr(code);
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ if (imm8 == 0) {
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.pshufd(tmp1, tmp0, 85);
+ code.pshufd(tmp2, tmp0, 238);
+ code.por(tmp2, tmp1);
+ code.pshufd(tmp1, tmp0, 255);
+ code.por(tmp1, tmp0);
+ code.por(tmp1, tmp2);
+ code.movd(tmp_flag.cvt32(), tmp1);
+ code.shr(tmp_flag.cvt32(), 31);
+ code.pxor(tmp1, tmp1);
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp1);
+ code.pand(tmp0, tmp2);
+ } else {
+ auto const tmp1 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ u64 const cmp_value = u64(1ULL << 31) >> (imm8 - 1);
+ u64 const cmp_one = cmp_value - 1;
+ u64 const cmp_add = (cmp_value - 2) + 0x80000000;
+ code.pshufd(tmp1, tmp0, 238);
+ code.por(tmp1, tmp0);
+ code.pshufd(tmp2, tmp1, 85);
+ code.por(tmp2, tmp1);
+ code.movd(tmp_flag.cvt32(), tmp2);
+ code.cmp(tmp_flag.cvt32(), cmp_value);
+ code.movdqa(tmp1, tmp0);
+ code.pslld(tmp1, imm8);
+ code.movdqa(tmp2, tmp0);
+ code.pcmpgtd(tmp2, code.Const(xword, cmp_one | (cmp_one << 32), cmp_one | (cmp_one << 32)));
+ code.pcmpeqd(tmp3, tmp3);
+ code.paddd(tmp0, tmp3);
+ code.pxor(tmp0, code.Const(xword, 0x80000000'80000000, 0x80000000'80000000));
+ code.pcmpgtd(tmp0, code.Const(xword, cmp_add | (cmp_add << 32), cmp_add | (cmp_add << 32)));
+ code.pand(tmp2, tmp0);
+ code.pandn(tmp0, tmp1);
+ code.por(tmp0, tmp2);
+ code.setae(tmp_flag.cvt8());
+ }
+ code.or_(code.byte[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_qc], tmp_flag.cvt8());
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+// EmitTwoArgumentFallbackWithSaturationAndImmediate(code, ctx, inst, VectorSignedSaturatedShiftLeftUnsigned);
+ }
}
void EmitX64::EmitVectorSignedSaturatedShiftLeftUnsigned64(EmitContext& ctx, IR::Inst* inst) {
@@ -4887,7 +5254,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
const bool is_defaults_zero = inst->GetArg(0).IsZero();
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI)) {
- const Xbyak::Xmm indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const indicies = table_size <= 2 ? ctx.reg_alloc.UseXmm(code, args[2]) : ctx.reg_alloc.UseScratchXmm(code, args[2]);
const u64 index_count = mcl::bit::replicate_element(static_cast(table_size * 8));
@@ -4895,43 +5262,43 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
switch (table_size) {
case 1: {
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
if (is_defaults_zero) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpermb(result | k1 | T_z, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermb(result | k1, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
}
break;
}
case 2: {
- const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
if (is_defaults_zero) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpermb(result | k1 | T_z, indicies, xmm0);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermb(result | k1, indicies, xmm0);
ctx.reg_alloc.DefineValue(code, inst, result);
}
break;
}
case 3: {
- const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]);
+ auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[2]);
code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
if (is_defaults_zero) {
code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermi2b(indicies, xmm0, xmm_table1);
code.vmovdqu8(result | k1, indicies);
ctx.reg_alloc.DefineValue(code, inst, result);
@@ -4939,17 +5306,17 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
break;
}
case 4: {
- const Xbyak::Xmm xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
- const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table0_lower = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+ auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
code.vpunpcklqdq(xmm0, xmm_table0_lower, xmm_table0_upper);
code.vpunpcklqdq(xmm_table1, xmm_table1, xmm_table1_upper);
if (is_defaults_zero) {
code.vpermi2b(indicies | k1 | T_z, xmm0, xmm_table1);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermi2b(indicies, xmm0, xmm_table1);
code.vmovdqu8(result | k1, indicies);
ctx.reg_alloc.DefineValue(code, inst, result);
@@ -4972,9 +5339,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
};
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.xorps(result, result);
code.movsd(result, xmm_table0);
@@ -4986,9 +5353,9 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0xFFFFFFFFFFFFFFFF));
@@ -4999,12 +5366,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSE41) && table_size <= 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
if (table_size == 2) {
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
@@ -5023,12 +5390,12 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
{
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
@@ -5037,7 +5404,7 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
code.punpcklqdq(xmm_table1, xmm0);
} else {
ASSERT(table_size == 4);
- const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
code.punpcklqdq(xmm_table1, xmm_table1_upper);
ctx.reg_alloc.Release(xmm_table1_upper);
}
@@ -5058,18 +5425,18 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
}
if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[2]);
{
- const Xbyak::Xmm xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0_upper = ctx.reg_alloc.UseXmm(code, table[1]);
code.punpcklqdq(xmm_table0, xmm_table0_upper);
ctx.reg_alloc.Release(xmm_table0_upper);
}
if (table_size == 4) {
- const Xbyak::Xmm xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table1_upper = ctx.reg_alloc.UseXmm(code, table[3]);
code.punpcklqdq(xmm_table1, xmm_table1_upper);
ctx.reg_alloc.Release(xmm_table1_upper);
}
@@ -5098,37 +5465,31 @@ void EmitX64::EmitVectorTableLookup64(EmitContext& ctx, IR::Inst* inst) {
const u32 stack_space = static_cast(6 * 8);
ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
+ auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]);
code.movq(qword[rsp + ABI_SHADOW_SPACE + i * 8], table_value);
ctx.reg_alloc.Release(table_value);
}
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
-
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
code.lea(code.ABI_PARAM2, ptr[rsp + ABI_SHADOW_SPACE + 4 * 8]);
code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE + 5 * 8]);
code.mov(code.ABI_PARAM4.cvt32(), table_size);
code.movq(qword[code.ABI_PARAM2], defaults);
code.movq(qword[code.ABI_PARAM3], indicies);
-
- code.CallLambda(
- [](const HalfVectorArray* table, HalfVectorArray& result, const HalfVectorArray& indicies, size_t table_size) {
- for (size_t i = 0; i < result.size(); ++i) {
- const size_t index = indicies[i] / table[0].size();
- const size_t elem = indicies[i] % table[0].size();
- if (index < table_size) {
- result[i] = table[index][elem];
- }
- }
- });
-
+ code.CallLambda([](const HalfVectorArray* table, HalfVectorArray& result, const HalfVectorArray& indicies, size_t table_size) {
+ for (size_t i = 0; i < result.size(); ++i) {
+ const size_t index = indicies[i] / table[0].size();
+ const size_t elem = indicies[i] % table[0].size();
+ if (index < table_size)
+ result[i] = table[index][elem];
+ }
+ });
code.movq(result, qword[rsp + ABI_SHADOW_SPACE + 4 * 8]);
ctx.reg_alloc.ReleaseStackSpace(code, stack_space + ABI_SHADOW_SPACE);
-
ctx.reg_alloc.DefineValue(code, inst, result);
}
@@ -5142,14 +5503,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
const bool is_defaults_zero = !inst->GetArg(0).IsImmediate() && inst->GetArg(0).GetInst()->GetOpcode() == IR::Opcode::ZeroVector;
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 4) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
code.vpcmpub(k2, indicies, code.BConst<8>(xword, 4 * 16), CmpInt::LessThan);
// Handle vector-table 0,1
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
@@ -5157,8 +5518,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(xmm_table1);
// Handle vector-table 2,3
- const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
- const Xbyak::Xmm xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]);
+ auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
+ auto const xmm_table3 = ctx.reg_alloc.UseXmm(code, table[3]);
code.kandnw(k1, k1, k2);
code.vpermi2b(indicies | k1, xmm_table2, xmm_table3);
@@ -5167,19 +5528,19 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vmovdqu8(indicies | k2 | T_z, indicies);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vmovdqu8(defaults | k2, indicies);
ctx.reg_alloc.DefineValue(code, inst, defaults);
}
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 3) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
code.vpcmpub(k2, indicies, code.BConst<8>(xword, 3 * 16), CmpInt::LessThan);
// Handle vector-table 0,1
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpermi2b(indicies | k1, xmm_table0, xmm_table1);
@@ -5187,7 +5548,7 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.Release(xmm_table1);
// Handle vector-table 2
- const Xbyak::Xmm xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
+ auto const xmm_table2 = ctx.reg_alloc.UseXmm(code, table[2]);
code.kandnw(k1, k1, k2);
code.vpermb(indicies | k1, indicies, xmm_table2);
@@ -5196,14 +5557,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vmovdqu8(indicies | k2 | T_z, indicies);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const defaults = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vmovdqu8(defaults | k2, indicies);
ctx.reg_alloc.DefineValue(code, inst, defaults);
}
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseXmm(code, table[1]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 2 * 16), CmpInt::LessThan);
@@ -5211,36 +5572,36 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
code.vpermi2b(indicies | k1 | T_z, xmm_table0, xmm_table1);
ctx.reg_alloc.DefineValue(code, inst, indicies);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermi2b(indicies, xmm_table0, xmm_table1);
code.vmovdqu8(result | k1, indicies);
ctx.reg_alloc.DefineValue(code, inst, result);
}
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW | HostFeature::AVX512VBMI) && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseXmm(code, table[0]);
code.vpcmpub(k1, indicies, code.BConst<8>(xword, 1 * 16), CmpInt::LessThan);
if (is_defaults_zero) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpermb(result | k1 | T_z, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
} else {
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.vpermb(result | k1, indicies, xmm_table0);
ctx.reg_alloc.DefineValue(code, inst, result);
}
} else if (code.HasHostFeature(HostFeature::SSSE3) && is_defaults_zero && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
code.paddusb(indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
code.pshufb(xmm_table0, indicies);
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
} else if (code.HasHostFeature(HostFeature::SSE41) && table_size == 1) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
@@ -5253,9 +5614,9 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
} else if (code.HasHostFeature(HostFeature::SSE41) && is_defaults_zero && table_size == 2) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
- const Xbyak::Xmm xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
- const Xbyak::Xmm xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
+ auto const indicies = ctx.reg_alloc.UseScratchXmm(code, args[2]);
+ auto const xmm_table0 = ctx.reg_alloc.UseScratchXmm(code, table[0]);
+ auto const xmm_table1 = ctx.reg_alloc.UseScratchXmm(code, table[1]);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpaddusb(xmm0, indicies, code.Const(xword, 0x7070707070707070, 0x7070707070707070));
@@ -5271,14 +5632,14 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, xmm_table0);
return;
} else if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512BW)) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const masked = ctx.reg_alloc.ScratchXmm(code);
code.vpandd(masked, indicies, code.Const(xword_b, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
+ auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
const Xbyak::Opmask table_mask = k1;
const u64 table_index = mcl::bit::replicate_element(i * 16);
@@ -5295,15 +5656,15 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, result);
} else if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm masked = ctx.reg_alloc.ScratchXmm(code);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const masked = ctx.reg_alloc.ScratchXmm(code);
code.movaps(masked, code.Const(xword, 0xF0F0F0F0F0F0F0F0, 0xF0F0F0F0F0F0F0F0));
code.pand(masked, indicies);
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
+ auto const xmm_table = ctx.reg_alloc.UseScratchXmm(code, table[i]);
const u64 table_index = mcl::bit::replicate_element(i * 16);
@@ -5327,13 +5688,13 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
const u32 stack_space = static_cast((table_size + 2) * 16);
ctx.reg_alloc.AllocStackSpace(code, stack_space + ABI_SHADOW_SPACE);
for (size_t i = 0; i < table_size; ++i) {
- const Xbyak::Xmm table_value = ctx.reg_alloc.UseXmm(code, table[i]);
+ auto const table_value = ctx.reg_alloc.UseXmm(code, table[i]);
code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], table_value);
ctx.reg_alloc.Release(table_value);
}
- const Xbyak::Xmm defaults = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm indicies = ctx.reg_alloc.UseXmm(code, args[2]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const defaults = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const indicies = ctx.reg_alloc.UseXmm(code, args[2]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
ctx.reg_alloc.EndOfAllocScope();
ctx.reg_alloc.HostCall(code, nullptr);
code.lea(code.ABI_PARAM1, ptr[rsp + ABI_SHADOW_SPACE]);
@@ -5360,8 +5721,8 @@ void EmitX64::EmitVectorTableLookup128(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
if (!part) {
@@ -5379,8 +5740,8 @@ void EmitX64::EmitVectorTranspose8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseScratchXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
if (!part) {
@@ -5398,8 +5759,8 @@ void EmitX64::EmitVectorTranspose16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
code.shufps(lower, upper, !part ? 0b10001000 : 0b11011101);
@@ -5411,8 +5772,8 @@ void EmitX64::EmitVectorTranspose32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm upper = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const lower = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const upper = ctx.reg_alloc.UseXmm(code, args[1]);
const bool part = args[2].GetImmediateU1();
code.shufpd(lower, upper, !part ? 0b00 : 0b11);
@@ -5420,89 +5781,87 @@ void EmitX64::EmitVectorTranspose64(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, inst, lower);
}
-static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, IR::Inst* inst, BlockOfCode& code) {
+
+void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm(code);
-
- switch (esize) {
- case 8: {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
-
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpminub(tmp2, tmp0, tmp1);
+ code.vpmaxub(tmp0, tmp0, tmp1);
+ code.vpsubb(tmp0, tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.movdqa(temp, x);
code.psubusb(temp, y);
code.psubusb(y, x);
code.por(temp, y);
- break;
+ ctx.reg_alloc.DefineValue(code, inst, temp);
}
- case 16: {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+}
+void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpminuw(tmp2, tmp0, tmp1);
+ code.vpmaxuw(tmp0, tmp0, tmp1);
+ code.vpsubw(tmp0, tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else {
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
code.movdqa(temp, x);
code.psubusw(temp, y);
code.psubusw(y, x);
code.por(temp, y);
- break;
+ ctx.reg_alloc.DefineValue(code, inst, temp);
}
- case 32:
- // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267
- if (code.HasHostFeature(HostFeature::SSE41)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
-
- code.movdqa(temp, x);
- code.pminud(x, y);
- code.pmaxud(temp, y);
- code.psubd(temp, x);
- } else {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- if (ctx.HasOptimization(OptimizationFlag::CodeSpeed)) {
- // About 45 bytes
- const Xbyak::Xmm temp_x = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm temp_y = ctx.reg_alloc.ScratchXmm(code);
- code.pcmpeqd(temp, temp);
- code.pslld(temp, 31);
- code.movdqa(temp_x, x);
- code.movdqa(temp_y, y);
- code.paddd(temp_x, x);
- code.paddd(temp_y, y);
- code.pcmpgtd(temp_y, temp_x);
- code.psubd(x, y);
- code.pandn(temp, temp_y);
- code.pxor(x, y);
- code.psubd(x, y);
- } else {
- // Smaller code size - about 36 bytes
- code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
- code.pxor(x, temp);
- code.pxor(y, temp);
- code.movdqa(temp, x);
- code.psubd(temp, y);
- code.pcmpgtd(y, x);
- code.psrld(y, 1);
- code.pxor(temp, y);
- code.psubd(temp, y);
- }
- }
- break;
- }
-
- ctx.reg_alloc.DefineValue(code, inst, temp);
-}
-
-void EmitX64::EmitVectorUnsignedAbsoluteDifference8(EmitContext& ctx, IR::Inst* inst) {
- EmitVectorUnsignedAbsoluteDifference(8, ctx, inst, code);
-}
-
-void EmitX64::EmitVectorUnsignedAbsoluteDifference16(EmitContext& ctx, IR::Inst* inst) {
- EmitVectorUnsignedAbsoluteDifference(16, ctx, inst, code);
}
void EmitX64::EmitVectorUnsignedAbsoluteDifference32(EmitContext& ctx, IR::Inst* inst) {
- EmitVectorUnsignedAbsoluteDifference(32, ctx, inst, code);
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ if (code.HasHostFeature(HostFeature::AVX)) {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ code.vpminud(tmp2, tmp0, tmp1);
+ code.vpmaxud(tmp0, tmp0, tmp1);
+ code.vpsubd(tmp0, tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp0);
+ } else if (code.HasHostFeature(HostFeature::SSE41)) {
+ // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267
+ auto const temp = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ code.movdqa(temp, x);
+ code.pminud(x, y);
+ code.pmaxud(temp, y);
+ code.psubd(temp, x);
+ ctx.reg_alloc.DefineValue(code, inst, temp);
+ } else {
+ auto const tmp0 = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const tmp1 = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp2 = ctx.reg_alloc.ScratchXmm(code);
+ auto const tmp3 = ctx.reg_alloc.ScratchXmm(code);
+ code.movdqa(tmp2, code.Const(xword, 0x8000'00008000'0000, 0x8000'00008000'0000));
+ code.movdqa(tmp3, tmp1);
+ code.pxor(tmp3, tmp2);
+ code.pxor(tmp2, tmp0);
+ code.pcmpgtd(tmp2, tmp3);
+ code.psubd(tmp0, tmp1);
+ code.pxor(tmp0, tmp2);
+ code.psubd(tmp2, tmp0);
+ //code.movdqa(tmp0, tmp2);
+ ctx.reg_alloc.DefineValue(code, inst, tmp2);
+ }
}
void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
@@ -5510,11 +5869,11 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
const auto lower_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetLowerFromOp);
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
if (upper_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmulhuw(result, x, y);
} else {
@@ -5526,7 +5885,7 @@ void EmitX64::EmitVectorUnsignedMultiply16(EmitContext& ctx, IR::Inst* inst) {
}
if (lower_inst) {
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
if (code.HasHostFeature(HostFeature::AVX)) {
code.vpmullw(result, x, y);
} else {
@@ -5544,24 +5903,24 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (lower_inst && !upper_inst && code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(code, args[1]);
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const x = ctx.reg_alloc.UseXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseXmm(code, args[1]);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(result, x, y);
ctx.reg_alloc.DefineValue(code, lower_inst, result);
} else if (code.HasHostFeature(HostFeature::AVX)) {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
if (lower_inst) {
- const Xbyak::Xmm lower_result = ctx.reg_alloc.ScratchXmm(code);
+ auto const lower_result = ctx.reg_alloc.ScratchXmm(code);
code.vpmulld(lower_result, x, y);
ctx.reg_alloc.DefineValue(code, lower_inst, lower_result);
}
- const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
+ auto const result = ctx.reg_alloc.ScratchXmm(code);
code.vpmuludq(result, x, y);
code.vpsrlq(x, x, 32);
@@ -5571,11 +5930,11 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(code, upper_inst, result);
} else {
- const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- const Xbyak::Xmm upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
- const Xbyak::Xmm lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
+ auto const x = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const y = ctx.reg_alloc.UseScratchXmm(code, args[1]);
+ auto const tmp = ctx.reg_alloc.ScratchXmm(code);
+ auto const upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
+ auto const lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm(code) : Xbyak::Xmm{-1};
// calculate unsigned multiply
code.movdqa(tmp, x);
@@ -5792,11 +6151,11 @@ void EmitX64::EmitVectorUnsignedSaturatedShiftLeft64(EmitContext& ctx, IR::Inst*
void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxbw(a, a);
} else {
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpcklbw(a, zeros);
}
@@ -5805,11 +6164,11 @@ void EmitX64::EmitVectorZeroExtend8(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxwd(a, a);
} else {
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpcklwd(a, zeros);
}
@@ -5818,11 +6177,11 @@ void EmitX64::EmitVectorZeroExtend16(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
if (code.HasHostFeature(HostFeature::SSE41)) {
code.pmovzxdq(a, a);
} else {
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpckldq(a, zeros);
}
@@ -5831,8 +6190,8 @@ void EmitX64::EmitVectorZeroExtend32(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
- const Xbyak::Xmm zeros = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const zeros = ctx.reg_alloc.ScratchXmm(code);
code.pxor(zeros, zeros);
code.punpcklqdq(a, zeros);
ctx.reg_alloc.DefineValue(code, inst, a);
@@ -5840,7 +6199,7 @@ void EmitX64::EmitVectorZeroExtend64(EmitContext& ctx, IR::Inst* inst) {
void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
- const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ auto const a = ctx.reg_alloc.UseScratchXmm(code, args[0]);
code.movq(a, a); // TODO: !IsLastUse
@@ -5848,7 +6207,7 @@ void EmitX64::EmitVectorZeroUpper(EmitContext& ctx, IR::Inst* inst) {
}
void EmitX64::EmitZeroVector(EmitContext& ctx, IR::Inst* inst) {
- const Xbyak::Xmm a = ctx.reg_alloc.ScratchXmm(code);
+ auto const a = ctx.reg_alloc.ScratchXmm(code);
code.pxor(a, a);
ctx.reg_alloc.DefineValue(code, inst, a);
}
diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
index 70edfbd0bc..046ecc78d6 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@@ -24,6 +24,7 @@
#include "dynarmic/common/fp/fpcr.h"
#include "dynarmic/common/fp/info.h"
#include "dynarmic/common/fp/op.h"
+#include "dynarmic/common/fp/rounding_mode.h"
#include "dynarmic/common/fp/util.h"
#include "dynarmic/interface/optimization_flags.h"
#include "dynarmic/ir/basic_block.h"
@@ -93,7 +94,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
code.cmp(bitmask, 0);
}
- SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), nan = ctx.GenSharedLabel();
code.jnz(*nan, code.T_NEAR);
code.L(*end);
@@ -188,23 +189,6 @@ void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) {
}
}
-template
-void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
- const Xbyak::Xmm nan_mask = xmm0;
- if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
- constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero,
- FpFixup::PosZero);
- FCODE(vfixupimmp)(result, result, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
- } else if (code.HasHostFeature(HostFeature::AVX)) {
- FCODE(vcmpordp)(nan_mask, result, result);
- FCODE(vandp)(result, result, nan_mask);
- } else {
- code.movaps(nan_mask, result);
- FCODE(cmpordp)(nan_mask, nan_mask);
- code.andps(result, nan_mask);
- }
-}
-
template
void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list to_daz, Xbyak::Xmm tmp) {
if (fpcr.FZ()) {
@@ -1330,7 +1314,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.movaps(result, xmm_a);
@@ -1603,7 +1587,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(code, args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.movaps(result, GetVectorOf(code));
@@ -1776,7 +1760,7 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(code);
- SharedLabel bad_values = GenSharedLabel(), end = GenSharedLabel();
+ SharedLabel bad_values = ctx.GenSharedLabel(), end = ctx.GenSharedLabel();
code.movaps(value, operand);
@@ -1867,7 +1851,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(code);
- SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
+ SharedLabel end = ctx.GenSharedLabel(), fallback = ctx.GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.vmovaps(result, GetVectorOf(code));
@@ -2004,120 +1988,123 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
template
void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const size_t fbits = inst->GetArg(1).GetU8();
- const auto rounding = static_cast(inst->GetArg(2).GetU8());
+ const auto rounding = FP::RoundingMode(inst->GetArg(2).GetU8());
[[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();
- if constexpr (fsize != 16) {
- if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
- auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
- const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]);
-
- MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
- const int round_imm = [&] {
- switch (rounding) {
- case FP::RoundingMode::ToNearest_TieEven:
- default:
- return 0b00;
- case FP::RoundingMode::TowardsPlusInfinity:
- return 0b10;
- case FP::RoundingMode::TowardsMinusInfinity:
- return 0b01;
- case FP::RoundingMode::TowardsZero:
- return 0b11;
- }
- }();
-
- const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) {
- // MSVC doesn't allow us to use a [&] capture, so we have to do this instead.
- (void)ctx;
-
- if constexpr (fsize == 32) {
- code.cvttps2dq(src, src);
- } else {
- if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
- code.vcvttpd2qq(src, src);
- } else {
- const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code);
- const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code);
-
- code.cvttsd2si(lo, src);
- code.punpckhqdq(src, src);
- code.cvttsd2si(hi, src);
- code.movq(src, lo);
- code.pinsrq(src, hi, 1);
-
- ctx.reg_alloc.Release(hi);
- ctx.reg_alloc.Release(lo);
- }
- }
- };
-
- if (fbits != 0) {
- const u64 scale_factor = fsize == 32
- ? static_cast(fbits + 127) << 23
- : static_cast(fbits + 1023) << 52;
- FCODE(mulp)(src, GetVectorOf(code, scale_factor));
+ if (code.HasHostFeature(HostFeature::SSE41) && fsize != 16 && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
+ auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+ const Xbyak::Xmm src = ctx.reg_alloc.UseScratchXmm(code, args[0]);
+ MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
+ const int round_imm = [&] {
+ switch (rounding) {
+ case FP::RoundingMode::ToNearest_TieEven:
+ default:
+ return 0b00;
+ case FP::RoundingMode::TowardsPlusInfinity:
+ return 0b10;
+ case FP::RoundingMode::TowardsMinusInfinity:
+ return 0b01;
+ case FP::RoundingMode::TowardsZero:
+ return 0b11;
}
+ }();
+ const auto perform_conversion = [&code, &ctx](const Xbyak::Xmm& src) {
+ // MSVC doesn't allow us to use a [&] capture, so we have to do this instead.
+ (void)ctx;
- FCODE(roundp)(src, src, static_cast(round_imm));
- ZeroIfNaN(code, src);
-
- constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000;
- [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
-
- if constexpr (unsigned_) {
+ if constexpr (fsize == 32) {
+ code.cvttps2dq(src, src);
+ } else {
if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
- // Mask positive values
- code.xorps(xmm0, xmm0);
- FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
-
- // Convert positive values to unsigned integers, write 0 anywhere else
- // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
- if constexpr (fsize == 32) {
- code.vcvttps2udq(src | k1 | T_z, src);
- } else {
- code.vcvttpd2uqq(src | k1 | T_z, src);
- }
+ code.vcvttpd2qq(src, src);
} else {
- // Zero is minimum
- code.xorps(xmm0, xmm0);
- FCODE(cmplep)(xmm0, src);
- FCODE(andp)(src, xmm0);
+ const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr(code);
+ const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr(code);
- // Will we exceed unsigned range?
- const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code);
- code.movaps(exceed_unsigned, GetVectorOf(code));
- FCODE(cmplep)(exceed_unsigned, src);
+ code.cvttsd2si(lo, src);
+ code.punpckhqdq(src, src);
+ code.cvttsd2si(hi, src);
+ code.movq(src, lo);
+ code.pinsrq(src, hi, 1);
- // Will be exceed signed range?
- const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
- code.movaps(tmp, GetVectorOf(code));
- code.movaps(xmm0, tmp);
- FCODE(cmplep)(xmm0, src);
- FCODE(andp)(tmp, xmm0);
- FCODE(subp)(src, tmp);
- perform_conversion(src);
- ICODE(psll)(xmm0, u8(fsize - 1));
- FCODE(orp)(src, xmm0);
+ ctx.reg_alloc.Release(hi);
+ ctx.reg_alloc.Release(lo);
+ }
+ }
+ };
+ if (fbits != 0) {
+ const u64 scale_factor = fsize == 32
+ ? u64(fbits + 127) << 23
+ : u64(fbits + 1023) << 52;
+ FCODE(mulp)(src, GetVectorOf(code, scale_factor));
+ }
- // Saturate to max
- FCODE(orp)(src, exceed_unsigned);
+ FCODE(roundp)(src, src, u8(round_imm));
+ const Xbyak::Xmm nan_mask = xmm0;
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ static constexpr u32 nan_to_zero = FixupLUT(FpFixup::PosZero, FpFixup::PosZero);
+ FCODE(vfixupimmp)(src, src, code.BConst<32>(ptr_b, nan_to_zero), u8(0));
+ } else if (code.HasHostFeature(HostFeature::AVX)) {
+ FCODE(vcmpordp)(nan_mask, src, src);
+ FCODE(vandp)(src, src, nan_mask);
+ } else {
+ code.movaps(nan_mask, src);
+ FCODE(cmpordp)(nan_mask, nan_mask);
+ code.andps(src, nan_mask);
+ }
+
+ constexpr u64 float_upper_limit_signed = fsize == 32 ? 0x4f000000 : 0x43e0000000000000;
+ [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;
+
+ if constexpr (unsigned_) {
+ if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+ // Mask positive values
+ code.xorps(xmm0, xmm0);
+ FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);
+
+ // Convert positive values to unsigned integers, write 0 anywhere else
+ // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
+ if (fsize == 32) {
+ code.vcvttps2udq(src | k1 | T_z, src);
+ } else {
+ code.vcvttpd2uqq(src | k1 | T_z, src);
}
} else {
- using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC
- constexpr u64 integer_max = FPT((std::numeric_limits>>::max)());
-
- code.movaps(xmm0, GetVectorOf(code));
+ // Zero is minimum
+ code.xorps(xmm0, xmm0);
FCODE(cmplep)(xmm0, src);
- perform_conversion(src);
- FCODE(blendvp)(src, GetVectorOf(code));
- }
- });
+ FCODE(andp)(src, xmm0);
- ctx.reg_alloc.DefineValue(code, inst, src);
- return;
- }
+ // Will we exceed unsigned range?
+ const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm(code);
+ code.movaps(exceed_unsigned, GetVectorOf(code));
+ FCODE(cmplep)(exceed_unsigned, src);
+
+ // Will be exceed signed range?
+ const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(code);
+ code.movaps(tmp, GetVectorOf(code));
+ code.movaps(xmm0, tmp);
+ FCODE(cmplep)(xmm0, src);
+ FCODE(andp)(tmp, xmm0);
+ FCODE(subp)(src, tmp);
+ perform_conversion(src);
+ ICODE(psll)(xmm0, u8(fsize - 1));
+ FCODE(orp)(src, xmm0);
+
+ // Saturate to max
+ FCODE(orp)(src, exceed_unsigned);
+ }
+ } else {
+ using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC
+ constexpr u64 integer_max = FPT((std::numeric_limits>>::max)());
+ code.movaps(xmm0, GetVectorOf(code));
+ FCODE(cmplep)(xmm0, src);
+ perform_conversion(src);
+ FCODE(blendvp)(src, GetVectorOf(code));
+ }
+ });
+ ctx.reg_alloc.DefineValue(code, inst, src);
+ return;
}
using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC
diff --git a/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp b/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
index 3ae553bccd..bae397ff2b 100644
--- a/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
+++ b/src/dynarmic/src/dynarmic/backend/x64/exception_handler_windows.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
@@ -176,7 +176,7 @@ struct ExceptionHandler::Impl final {
code.align(16);
const u8* exception_handler_without_cb = code.getCurr();
- code.mov(code.eax, static_cast(ExceptionContinueSearch));
+ code.mov(code.eax, u32(ExceptionContinueSearch));
code.ret();
code.align(16);
@@ -192,20 +192,18 @@ struct ExceptionHandler::Impl final {
code.lea(code.rsp, code.ptr[code.rsp - 8]);
code.mov(code.ABI_PARAM1, std::bit_cast(&cb));
code.mov(code.ABI_PARAM2, code.ABI_PARAM3);
- code.CallLambda(
- [](const std::function& cb_, PCONTEXT ctx) {
- FakeCall fc = cb_(ctx->Rip);
-
- ctx->Rsp -= sizeof(u64);
- *std::bit_cast(ctx->Rsp) = fc.ret_rip;
- ctx->Rip = fc.call_rip;
- });
+ code.CallLambda([](const std::function& cb_, PCONTEXT ctx) {
+ FakeCall fc = cb_(ctx->Rip);
+ ctx->Rsp -= sizeof(u64);
+ *std::bit_cast