eden-miror/src/dynarmic/tests/rsqrt_test_fn.s
crueter 51b170b470
[cmake] refactor: Use CPM over submodules (#143)
Transfers the majority of submodules and large externals to CPM, using source archives rather than full Git clones. Not only does this save massive amounts of clone and configure time, but dependencies are grabbed on-demand rather than being required by default. Additionally, CPM will (generally) automatically search for system dependencies, though certain dependencies have options to control this.

Testing shows gains ranging from 5x to 10x in terms of overall clone/configure time.

Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/143
Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
2025-08-04 04:50:14 +02:00

303 lines
5.2 KiB
ArmAsm

# x86-64 reciprocal-square-root estimate variants, written for the GNU
# assembler in Intel syntax (see .intel_syntax below).
#
# Every exported routine in this file reads a 32-bit value from edi and
# returns a 32-bit value in eax; the value is moved into an XMM register and
# treated as a single-precision float bit pattern.
# NOTE(review): edi-as-first-argument matches the System V AMD64 convention;
# presumably the C/C++ prototype is `u32 fn(u32)` -- confirm against the driver.
#
# Each routine is exported twice, with and without a leading underscore.
# NOTE(review): presumably so that both underscore-prefixing and
# non-prefixing C symbol conventions resolve -- confirm.
.global _rsqrt_inaccurate
.global rsqrt_inaccurate
.global _rsqrt_full
.global rsqrt_full
.global _rsqrt_full_gpr
.global rsqrt_full_gpr
.global _rsqrt_full_nb
.global rsqrt_full_nb
.global _rsqrt_full_nb2
.global rsqrt_full_nb2
.global _rsqrt_full_nb_gpr
.global rsqrt_full_nb_gpr
.global _rsqrt_newton
.global rsqrt_newton
.global _rsqrt_hack
.global rsqrt_hack
# _rsqrt_fallback is called by the slow paths but has no definition in the
# visible part of this file; it is expected to be supplied at link time.
.global _rsqrt_fallback
.text
# Everything below uses Intel operand order (dst, src) without % prefixes.
.intel_syntax noprefix
# Constant pool shared by the routines below.
#
# Every entry is one 32-bit value padded with three zero dwords to a full
# 16 bytes, so each label can be used directly as a 128-bit memory operand
# of legacy SSE instructions (pand/por/paddd/movaps), which fault on
# addresses that are not 16-byte aligned.
#
# .balign is used instead of .align because .align's argument is a byte
# count on some targets (x86 ELF) and a power of two on others (Mach-O);
# .balign 16 always means 16 bytes.
.balign 16
# 0x00800000 is the smallest positive *normal* single (exponent field 1,
# fraction 0), despite the label name. Inputs that do not compare greater
# than it are routed to the slow paths.
min_pos_denorm:
        .long   0x00800000, 0, 0, 0
# Bit 15: OR-ed into the truncated input by the rsqrt_full* family.
penultimate_bit:
        .long   0x00008000, 0, 0, 0
# Bit 14: added before masking with top_mask, i.e. round-half-up at bit 15.
ultimate_bit:
        .long   0x00004000, 0, 0, 0
# Keeps bits 31..15: sign, the 8 exponent bits and the top 8 fraction bits.
top_mask:
        .long   0xFFFF8000, 0, 0, 0
# 1.0f
one:
        .long   0x3F800000, 0, 0, 0
# 0.5f
half:
        .long   0x3F000000, 0, 0, 0
# 1.5f
one_point_five:
        .long   0x3FC00000, 0, 0, 0
# magic1..magic3 are the integer correction-term constants consumed by
# rsqrt_hack (the only visible user).
magic1:
        .long   0x60000000, 0, 0, 0
magic2:
        .long   0x3C000000, 0, 0, 0
magic3:
        .long   0x000047FF, 0, 0, 0
#-----------------------------------------------------------------------
# rsqrt_inaccurate
#   In:    edi = single-precision bit pattern
#   Out:   eax = bit pattern produced by the hardware RSQRTSS estimate,
#                returned unmodified (no rounding, no special-casing)
#   Clobb: xmm0
#-----------------------------------------------------------------------
_rsqrt_inaccurate:
rsqrt_inaccurate:
        movd    xmm0, edi
        rsqrtss xmm0, xmm0
        movd    eax, xmm0
        ret
#-----------------------------------------------------------------------
# rsqrt_full
#   In:    edi = single-precision bit pattern
#   Out:   eax = 1/sqrt estimate, rounded to bits 31..15
#   Clobb: xmm0, xmm1, flags
#
#   Fast path: truncate the input to bits 31..15, force bit 15 on, then
#   compute 1.0 / sqrt(x) with full-precision SQRTSS + DIVSS.
#   Anything that does not compare greater than 0x00800000 (or compares
#   unordered) leaves through rsqrt_full_bad.
#-----------------------------------------------------------------------
_rsqrt_full:
rsqrt_full:
        movd    xmm0, edi
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]
        # NGT_UQ: lane 0 becomes all-ones when !(x > limit), NaN included.
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad
        sqrtss  xmm0, xmm0
        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0                      # xmm1 = 1.0 / sqrt(x)
        # Integer add of bit 14 followed by the mask rounds at bit 15.
        paddd   xmm1, xmmword ptr [rip + ultimate_bit]
        pand    xmm1, xmmword ptr [rip + top_mask]
        movd    eax, xmm1
        ret
#-----------------------------------------------------------------------
# rsqrt_full_gpr
#   Same computation as rsqrt_full, but the input masking and the final
#   rounding are done with integer instructions on eax instead of
#   SSE integer ops on memory constants.
#   In:    edi = single-precision bit pattern
#   Out:   eax = 1/sqrt estimate, rounded to bits 31..15
#   Clobb: xmm0, xmm1, flags
#
#   The first and last movd are dead on purpose: they stand in for the
#   register-allocator moves that would surround this sequence.
#-----------------------------------------------------------------------
_rsqrt_full_gpr:
rsqrt_full_gpr:
        movd    eax, xmm0                       # placeholder regalloc move (result overwritten next)
        mov     eax, edi
        and     eax, 0xFFFF8000                 # keep bits 31..15
        or      eax, 0x00008000                 # force bit 15 on
        movd    xmm0, eax
        # NGT_UQ: lane 0 becomes all-ones when !(x > limit), NaN included.
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad
        sqrtss  xmm0, xmm0
        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0                      # xmm1 = 1.0 / sqrt(x)
        movd    eax, xmm1
        add     eax, 0x00004000                 # round-half-up at bit 15 ...
        and     eax, 0xFFFF8000                 # ... then drop the low 15 bits
        movd    xmm0, eax                       # placeholder regalloc move (eax is the result)
        ret
#-----------------------------------------------------------------------
# rsqrt_full_nb2
#   Same fast path as rsqrt_full, but the range check is a UCOMISS +
#   flag branch instead of VCMPSS + PTEST, and rejected inputs go to
#   rsqrt_full_bad_new1.
#   In:    edi = single-precision bit pattern
#   Out:   eax = 1/sqrt estimate, rounded to bits 31..15
#   Clobb: xmm0, xmm1, flags
#-----------------------------------------------------------------------
_rsqrt_full_nb2:
rsqrt_full_nb2:
        movd    xmm0, edi
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]
        ucomiss xmm0, dword ptr [rip + min_pos_denorm]
        # jna = CF|ZF: taken for below, equal, and unordered (NaN) results.
        jna     rsqrt_full_bad_new1
        sqrtss  xmm0, xmm0
        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0                      # xmm1 = 1.0 / sqrt(x)
        # Integer add of bit 14 followed by the mask rounds at bit 15.
        paddd   xmm1, xmmword ptr [rip + ultimate_bit]
        pand    xmm1, xmmword ptr [rip + top_mask]
        movd    eax, xmm1
        ret
#-----------------------------------------------------------------------
# rsqrt_full_nb
#   Instruction-for-instruction the rsqrt_full fast path; the only
#   difference is that rejected inputs branch to rsqrt_full_bad_new1
#   instead of rsqrt_full_bad.
#   In:    edi = single-precision bit pattern
#   Out:   eax = 1/sqrt estimate, rounded to bits 31..15
#   Clobb: xmm0, xmm1, flags
#-----------------------------------------------------------------------
_rsqrt_full_nb:
rsqrt_full_nb:
        movd    xmm0, edi
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]
        # NGT_UQ: lane 0 becomes all-ones when !(x > limit), NaN included.
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new1
        sqrtss  xmm0, xmm0
        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0                      # xmm1 = 1.0 / sqrt(x)
        # Integer add of bit 14 followed by the mask rounds at bit 15.
        paddd   xmm1, xmmword ptr [rip + ultimate_bit]
        pand    xmm1, xmmword ptr [rip + top_mask]
        movd    eax, xmm1
        ret
#-----------------------------------------------------------------------
# rsqrt_full_bad_new1 -- slow path for rsqrt_full_nb and rsqrt_full_nb2
#   In:    edi = original (unmasked) single-precision bit pattern
#   Out:   eax = result bit pattern
#   Clobb: xmm0, xmm1, flags
#
#   edi <  0x00800000 (unsigned): +0 / positive denormals -> tail-call
#                                 _rsqrt_fallback with edi untouched
#   otherwise: let RSQRTSS classify the input:
#     result not NaN            -> return the RSQRTSS result
#     result NaN, input NaN     -> return the RSQRTSS result (quieted input)
#     result NaN, input not NaN -> return 0x7FC00000
#
#   NOTE(review): negative denormal inputs also take the RSQRTSS route
#   here, whereas rsqrt_full_bad answers negative inputs with 0x7FC00000
#   -- confirm the two slow paths agree for those inputs.
#-----------------------------------------------------------------------
rsqrt_full_bad_new1:
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback1
        movd    xmm0, edi
        rsqrtss xmm1, xmm0
        ucomiss xmm1, xmm1                      # PF=1 iff the estimate is NaN
        jp      rsqrt_full_bad_new1_nan
        movd    eax, xmm1
        ret
rsqrt_full_bad_new_fallback1:
        # Tail call. The previous `call` + `ret` pair entered the callee with
        # rsp 16-byte aligned (the ABI requires rsp+8 aligned at entry),
        # because no stack adjustment is made in these leaf routines.
        # A jmp keeps the caller's alignment and return address intact.
        jmp     _rsqrt_fallback
rsqrt_full_bad_new1_nan:
        ucomiss xmm0, xmm0                      # PF=1 iff the *input* is NaN
        jp      rsqrt_full_bad_new1_nan_ret
        mov     eax, 0x7FC00000                 # non-NaN input produced NaN
        ret
rsqrt_full_bad_new1_nan_ret:
        # The original returned here without writing eax, so NaN inputs
        # yielded whatever eax happened to hold. Return the quieted NaN
        # that RSQRTSS produced from the input.
        movd    eax, xmm1
        ret
#-----------------------------------------------------------------------
# rsqrt_full_nb_gpr
#   The rsqrt_full_gpr fast path (masking and rounding done in eax),
#   with rejected inputs sent to rsqrt_full_bad_new2.
#   In:    edi = single-precision bit pattern
#   Out:   eax = 1/sqrt estimate, rounded to bits 31..15
#   Clobb: xmm0, xmm1, flags
#
#   The first and last movd are dead on purpose: they stand in for the
#   register-allocator moves that would surround this sequence.
#-----------------------------------------------------------------------
_rsqrt_full_nb_gpr:
rsqrt_full_nb_gpr:
        movd    eax, xmm0                       # placeholder regalloc move (result overwritten next)
        mov     eax, edi
        and     eax, 0xFFFF8000                 # keep bits 31..15
        or      eax, 0x00008000                 # force bit 15 on
        movd    xmm0, eax
        # NGT_UQ: lane 0 becomes all-ones when !(x > limit), NaN included.
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new2
        sqrtss  xmm0, xmm0
        movd    xmm1, dword ptr [rip + one]
        divss   xmm1, xmm0                      # xmm1 = 1.0 / sqrt(x)
        movd    eax, xmm1
        add     eax, 0x00004000                 # round-half-up at bit 15 ...
        and     eax, 0xFFFF8000                 # ... then drop the low 15 bits
        movd    xmm0, eax                       # placeholder regalloc move (eax is the result)
        ret
#-----------------------------------------------------------------------
# rsqrt_full_bad_new2 -- slow path for rsqrt_full_nb_gpr
#   In:    edi = original (unmasked) single-precision bit pattern
#   Out:   eax = result bit pattern
#   Clobb: xmm0, xmm1, flags
#
#   edi <  0x00800000 (unsigned): +0 / positive denormals -> tail-call
#                                 _rsqrt_fallback with edi untouched
#   sign bit clear               -> return the RSQRTSS result
#   sign bit set:
#     -0 or NaN                  -> return the RSQRTSS result
#     anything else              -> return 0x7FC00000
#
#   The original returned 0x7FC00000 for every sign-bit-set input, which
#   disagreed with rsqrt_full_bad for -0 (edi | 0x7F800000) and for
#   negative NaNs (edi | 0x00400000). Those two cases are now split out
#   using integer tests only, in keeping with this variant.
#-----------------------------------------------------------------------
rsqrt_full_bad_new2:
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback2
        movd    xmm0, edi
        rsqrtss xmm1, xmm0
        test    edi, edi
        js      rsqrt_full_bad_new2_nan
        movd    eax, xmm1
        ret
rsqrt_full_bad_new_fallback2:
        # Tail call. The previous `call` + `ret` pair entered the callee with
        # rsp 16-byte aligned (the ABI requires rsp+8 aligned at entry),
        # because no stack adjustment is made in these leaf routines.
        # A jmp keeps the caller's alignment and return address intact.
        jmp     _rsqrt_fallback
rsqrt_full_bad_new2_nan:
        mov     eax, edi
        and     eax, 0x7FFFFFFF                 # eax = bit pattern of |x|
        jz      rsqrt_full_bad_new2_estimate    # x == -0
        cmp     eax, 0x7F800000
        ja      rsqrt_full_bad_new2_estimate    # |x| above infinity: NaN
        mov     eax, 0x7FC00000                 # genuinely negative input
        ret
rsqrt_full_bad_new2_estimate:
        movd    eax, xmm1                       # RSQRTSS result for -0 / NaN
        ret
#-----------------------------------------------------------------------
# rsqrt_full_bad -- slow path shared by rsqrt_full, rsqrt_full_gpr,
#                   rsqrt_newton and rsqrt_hack
#   In:    edi = original (unmasked) single-precision bit pattern
#   Out:   eax = result bit pattern
#   Clobb: xmm0, xmm1, flags
#
#   Classifies the input by comparing it with 0.0:
#     NaN        -> input with bit 22 set (edi | 0x00400000)
#     +/-0       -> edi | 0x7F800000 (infinity carrying the input's sign)
#     negative   -> 0x7FC00000
#     +infinity  -> 0
#     otherwise  -> tail-call _rsqrt_fallback with edi untouched
#-----------------------------------------------------------------------
rsqrt_full_bad:
        xorps   xmm1, xmm1                      # xmm1 = 0.0
        movd    xmm0, edi
        ucomiss xmm0, xmm1
        # Branch order matters: an unordered compare sets PF, ZF and CF
        # together, so PF has to be tested before ZF and CF.
        jp      rsqrt_full_nan
        je      rsqrt_full_zero
        jc      rsqrt_full_neg
        cmp     edi, 0x7F800000
        je      rsqrt_full_inf
        # TODO: Full Denormal Implementation
        # Tail call. The previous `call` + `ret` pair entered the callee with
        # rsp 16-byte aligned (the ABI requires rsp+8 aligned at entry),
        # because no stack adjustment is made in these leaf routines.
        # A jmp keeps the caller's alignment and return address intact.
        jmp     _rsqrt_fallback
rsqrt_full_neg:
        mov     eax, 0x7FC00000
        ret
rsqrt_full_inf:
        xor     eax, eax
        ret
rsqrt_full_nan:
        mov     eax, edi
        or      eax, 0x00400000                 # set the top fraction bit
        ret
rsqrt_full_zero:
        mov     eax, edi
        or      eax, 0x7F800000                 # all-ones exponent, sign kept
        ret
#-----------------------------------------------------------------------
# rsqrt_newton
#   In:    edi = single-precision bit pattern
#   Out:   eax = 1/sqrt estimate, rounded to bits 31..15
#   Clobb: xmm0, xmm1, xmm2, flags
#
#   Input is truncated and range-checked exactly as in rsqrt_full
#   (rejects leave through rsqrt_full_bad). The result is the hardware
#   estimate y0 refined by one step:  y0 * (1.5 - (0.5 * x) * y0 * y0).
#-----------------------------------------------------------------------
_rsqrt_newton:
rsqrt_newton:
        movd    xmm0, edi
        pand    xmm0, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]
        # NGT_UQ: lane 0 becomes all-ones when !(x > limit), NaN included.
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad
        rsqrtps xmm1, xmm0                      # xmm1 = y0 (packed form; only lane 0 is used)
        mulss   xmm0, dword ptr [rip + half]    # xmm0 = 0.5 * x
        vmulss  xmm2, xmm1, xmm1                # xmm2 = y0 * y0
        mulss   xmm2, xmm0                      # xmm2 = 0.5 * x * y0^2
        movaps  xmm0, xmmword ptr [rip + one_point_five]
        subss   xmm0, xmm2                      # xmm0 = 1.5 - 0.5 * x * y0^2
        mulss   xmm0, xmm1                      # xmm0 = refined estimate
        # Integer add of bit 14 followed by the mask rounds at bit 15.
        paddd   xmm0, xmmword ptr [rip + ultimate_bit]
        pand    xmm0, xmmword ptr [rip + top_mask]
        movd    eax, xmm0
        ret
#-----------------------------------------------------------------------
# rsqrt_hack
#   In:    edi = single-precision bit pattern
#   Out:   eax = corrected hardware estimate, truncated to bits 31..15
#   Clobb: xmm0, xmm1, xmm2, xmm9, flags
#
#   Takes the hardware RSQRTPS estimate and adds an integer correction
#   term that is derived from the raw input bits and magic1..magic3.
#   The packed integer ops touch all four lanes; only lane 0 is returned.
#-----------------------------------------------------------------------
_rsqrt_hack:
rsqrt_hack:
        movd    xmm9, edi                       # xmm9 keeps the unmasked input
        vpand   xmm0, xmm9, xmmword ptr [rip + top_mask]
        por     xmm0, xmmword ptr [rip + penultimate_bit]
        # Reject NaNs, negatives, zeros, denormals and infinities.
        vcmpngt_uqss xmm1, xmm0, dword ptr [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad
        # Hardware estimate of the masked input.
        rsqrtps xmm0, xmm0
        # Build the correction term from the raw input bits.
        vpslld  xmm1, xmm9, 8                   # input bit 23 moves up to bit 31
        vpsrad  xmm2, xmm1, 31                  # xmm2 = that bit replicated across the lane
        paddd   xmm1, xmmword ptr [rip + magic1]
        pcmpgtd xmm1, xmmword ptr [rip + magic2]        # signed compare -> 0 / -1 mask
        pxor    xmm1, xmm2
        movaps  xmm2, xmmword ptr [rip + magic3]
        psubd   xmm2, xmm1                      # magic3, or magic3 + 1 when the mask is -1
        # Apply the correction and truncate.
        paddd   xmm0, xmm2
        pand    xmm0, xmmword ptr [rip + top_mask]
        movd    eax, xmm0
        ret