mirror of
https://git.eden-emu.dev/eden-emu/eden
synced 2026-04-19 21:28:56 +02:00
Transfers the majority of submodules and large externals to CPM, using source archives rather than full Git clones. Not only does this save massive amounts of clone and configure time, but dependencies are grabbed on-demand rather than being required by default. Additionally, CPM will (generally) automatically search for system dependencies, though certain dependencies have options to control this. Testing shows gains ranging from 5x to 10x in terms of overall clone/configure time. Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/143 Reviewed-by: CamilleLaVey <camillelavey99@gmail.com>
303 lines
5.2 KiB
ArmAsm
303 lines
5.2 KiB
ArmAsm
# Exported symbols.
#
# Each routine is published under a plain name and a '_'-prefixed alias
# (presumably so C callers link on platforms that prepend an underscore to
# C symbols as well as on those that do not -- TODO confirm with the build).
# Code in this file calls _rsqrt_fallback; no definition of it is visible in
# this file as shown, so it is expected to come from another object at link
# time.

    .globl  _rsqrt_inaccurate
    .globl  rsqrt_inaccurate

    .globl  _rsqrt_full
    .globl  rsqrt_full

    .globl  _rsqrt_full_gpr
    .globl  rsqrt_full_gpr

    .globl  _rsqrt_full_nb
    .globl  rsqrt_full_nb

    .globl  _rsqrt_full_nb2
    .globl  rsqrt_full_nb2

    .globl  _rsqrt_full_nb_gpr
    .globl  rsqrt_full_nb_gpr

    .globl  _rsqrt_newton
    .globl  rsqrt_newton

    .globl  _rsqrt_hack
    .globl  rsqrt_hack

    .globl  _rsqrt_fallback

# Everything below (constants and code) is emitted into the text section and
# written in Intel operand order without '%' register prefixes.
    .text
    .intel_syntax noprefix
|
|
|
|
# Constant pool.
#
# Every entry is a single 32-bit value followed by three zero dwords, i.e. a
# full 16-byte vector whose upper lanes are zero. The routines in this file
# use these labels as 128-bit memory operands of legacy-SSE instructions
# (pand / por / paddd / pcmpgtd / movaps), which fault on operands that are not
# 16-byte aligned, so the pool must start on a 16-byte boundary.
#
# .p2align 4 is used instead of ".align 16": .align takes a byte count on some
# targets and a power of two on others, whereas .p2align always takes the
# power of two, so the request is 16 bytes everywhere.
    .p2align 4

# Threshold used by the range checks. As an IEEE-754 single the pattern
# 0x00800000 is the smallest positive *normal* value; the label name is kept
# because other code refers to it.
min_pos_denorm:
    .long   0x00800000, 0, 0, 0

# Bit 15: OR-ed into the operand after it has been masked with top_mask.
penultimate_bit:
    .long   0x00008000, 0, 0, 0

# Bit 14: added to a result right before masking with top_mask, which rounds
# the value at bit 15 instead of truncating it.
ultimate_bit:
    .long   0x00004000, 0, 0, 0

# Keeps bits 31..15 and clears bits 14..0.
top_mask:
    .long   0xFFFF8000, 0, 0, 0

# Single-precision 1.0
one:
    .long   0x3f800000, 0, 0, 0

# Single-precision 0.5
half:
    .long   0x3f000000, 0, 0, 0

# Single-precision 1.5
one_point_five:
    .long   0x3fc00000, 0, 0, 0

# Integer constants consumed by the correction step of rsqrt_hack; their
# derivation is not documented in this file.
magic1:
    .long   0x60000000, 0, 0, 0
magic2:
    .long   0x3c000000, 0, 0, 0
magic3:
    .long   0x000047ff, 0, 0, 0
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_inaccurate / _rsqrt_inaccurate
# Returns the raw hardware reciprocal-square-root estimate, with no masking,
# range checks or rounding applied.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = bit pattern produced by rsqrtss
# Clobb: xmm0
#-----------------------------------------------------------------------
_rsqrt_inaccurate:
rsqrt_inaccurate:
    movd    xmm0, edi               # move the bits into the SSE domain
    rsqrtss xmm0, xmm0              # hardware estimate, in place
    movd    eax, xmm0               # hand the bits back in eax
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full / _rsqrt_full
# Computes 1.0 / sqrt(x) with sqrtss + divss on a reduced-precision operand
# and rounds the result to bits 31..15.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = result bit pattern (bits 14..0 are always zero on this path)
# Clobb: xmm0, xmm1, flags (fast path; the slow path is rsqrt_full_bad)
#-----------------------------------------------------------------------
_rsqrt_full:
rsqrt_full:
    movd    xmm0, edi

    # Reduce the operand: keep bits 31..15, then force bit 15 on.
    pand    xmm0, [rip + top_mask]
    por     xmm0, [rip + penultimate_bit]

    # xmm1 becomes an all-ones mask when the reduced operand is NOT greater
    # than the threshold (this includes the unordered/NaN case); those inputs
    # are handed to rsqrt_full_bad, which re-reads the original edi.
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest   xmm1, xmm1
    jnz     rsqrt_full_bad

    # xmm1 = 1.0 / sqrt(xmm0)
    sqrtss  xmm0, xmm0
    movd    xmm1, [rip + one]
    divss   xmm1, xmm0

    # Integer add of bit 14 followed by clearing bits 14..0 rounds the result
    # at bit 15.
    paddd   xmm1, [rip + ultimate_bit]
    pand    xmm1, [rip + top_mask]

    movd    eax, xmm1
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full_gpr / _rsqrt_full_gpr
# Same computation as rsqrt_full, but the masking and rounding are done with
# integer instructions on eax instead of vector instructions on constants
# loaded from memory.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = result bit pattern; the same bits are also left in xmm0
# Clobb: xmm0, xmm1, flags (fast path; the slow path is rsqrt_full_bad)
#-----------------------------------------------------------------------
_rsqrt_full_gpr:
rsqrt_full_gpr:
    # Emulate regalloc mov. The value read here is overwritten by the very
    # next instruction; the move exists only for its cost.
    movd    eax, xmm0

    # Reduce the operand: keep bits 31..15, then force bit 15 on.
    mov     eax, edi
    and     eax, 0xFFFF8000
    or      eax, 0x00008000

    # Range check on the reduced operand; anything not greater than the
    # threshold (including NaN) goes to the shared slow path.
    movd    xmm0, eax
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest   xmm1, xmm1
    jnz     rsqrt_full_bad

    # xmm1 = 1.0 / sqrt(xmm0)
    sqrtss  xmm0, xmm0
    movd    xmm1, [rip + one]
    divss   xmm1, xmm0
    movd    eax, xmm1

    # Round at bit 15: add bit 14, then clear bits 14..0.
    add     eax, 0x00004000
    and     eax, 0xFFFF8000

    # Emulate regalloc mov (result copied back to a vector register).
    movd    xmm0, eax
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full_nb2 / _rsqrt_full_nb2
# Variant of rsqrt_full_nb that performs the range check with ucomiss and a
# flag-based branch instead of building a compare mask and testing it.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, flags (fast path; the slow path is rsqrt_full_bad_new1)
#-----------------------------------------------------------------------
_rsqrt_full_nb2:
rsqrt_full_nb2:
    movd    xmm0, edi

    # Reduce the operand: keep bits 31..15, then force bit 15 on.
    pand    xmm0, [rip + top_mask]
    por     xmm0, [rip + penultimate_bit]

    # jna is taken when the operand is below, equal to, or unordered with the
    # threshold, i.e. whenever it is not strictly greater.
    ucomiss xmm0, [rip + min_pos_denorm]
    jna     rsqrt_full_bad_new1

    # xmm1 = 1.0 / sqrt(xmm0)
    sqrtss  xmm0, xmm0
    movd    xmm1, [rip + one]
    divss   xmm1, xmm0

    # Round at bit 15: add bit 14, then clear bits 14..0.
    paddd   xmm1, [rip + ultimate_bit]
    pand    xmm1, [rip + top_mask]

    movd    eax, xmm1
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full_nb / _rsqrt_full_nb
# Same fast path as rsqrt_full; out-of-range operands are dispatched to
# rsqrt_full_bad_new1 instead of rsqrt_full_bad.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, flags (fast path)
#-----------------------------------------------------------------------
_rsqrt_full_nb:
rsqrt_full_nb:
    movd    xmm0, edi

    # Reduce the operand: keep bits 31..15, then force bit 15 on.
    pand    xmm0, [rip + top_mask]
    por     xmm0, [rip + penultimate_bit]

    # Mask is all-ones when the operand is not greater than the threshold
    # (NaN included); a non-zero mask selects the slow path.
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest   xmm1, xmm1
    jnz     rsqrt_full_bad_new1

    # xmm1 = 1.0 / sqrt(xmm0)
    sqrtss  xmm0, xmm0
    movd    xmm1, [rip + one]
    divss   xmm1, xmm0

    # Round at bit 15: add bit 14, then clear bits 14..0.
    paddd   xmm1, [rip + ultimate_bit]
    pand    xmm1, [rip + top_mask]

    movd    eax, xmm1
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full_bad_new1
# Slow path shared by rsqrt_full_nb and rsqrt_full_nb2. It is reached by a
# jump, not a call, so on entry rsp still points at the return address of the
# exported routine's caller.
# In:    edi = original (unmasked) operand bit pattern
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, flags, plus whatever _rsqrt_fallback clobbers
#
# NOTE(review): negative denormals have edi >= 0x00800000 as unsigned values,
# so they take the rsqrtss route here rather than the fallback -- confirm this
# is intended.
#-----------------------------------------------------------------------
rsqrt_full_bad_new1:
    # Unsigned compare: only +0 and positive denormals are below 0x00800000.
    cmp     edi, 0x00800000
    jb      rsqrt_full_bad_new_fallback1

    movd    xmm0, edi
    rsqrtss xmm1, xmm0

    # PF is set by ucomiss only when the comparison is unordered, i.e. when
    # the estimate came out as NaN.
    ucomiss xmm1, xmm1
    jp      rsqrt_full_bad_new1_nan

    movd    eax, xmm1
    ret

rsqrt_full_bad_new_fallback1:
    # Tail call. A nested `call` from here would push a second return address
    # and enter the fallback with the stack 8 bytes off the 16-byte alignment
    # that the System V AMD64 ABI (implied by the edi/eax usage) requires at a
    # call site. edi still holds the untouched argument, and the fallback's
    # `ret` returns straight to our caller with its result in eax.
    jmp     _rsqrt_fallback

rsqrt_full_bad_new1_nan:
    # The estimate is NaN. Tell a NaN operand apart from any other operand
    # that made rsqrtss produce a NaN.
    ucomiss xmm0, xmm0
    jp      rsqrt_full_bad_new1_nan_ret

    # Operand was not a NaN: return the fixed quiet NaN pattern.
    mov     eax, 0x7FC00000
    ret

rsqrt_full_bad_new1_nan_ret:
    # Operand was itself a NaN: return the NaN that rsqrtss produced from it.
    # Nothing on this path has written eax before this point, so without this
    # move the routine would return whatever eax held on entry.
    movd    eax, xmm1
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full_nb_gpr / _rsqrt_full_nb_gpr
# Same fast path as rsqrt_full_gpr (integer masking and rounding on eax);
# out-of-range operands are dispatched to rsqrt_full_bad_new2.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = result bit pattern; the same bits are also left in xmm0
# Clobb: xmm0, xmm1, flags (fast path)
#-----------------------------------------------------------------------
_rsqrt_full_nb_gpr:
rsqrt_full_nb_gpr:
    # Emulate regalloc mov. The value read here is overwritten by the very
    # next instruction; the move exists only for its cost.
    movd    eax, xmm0

    # Reduce the operand: keep bits 31..15, then force bit 15 on.
    mov     eax, edi
    and     eax, 0xFFFF8000
    or      eax, 0x00008000

    # Range check on the reduced operand; anything not greater than the
    # threshold (including NaN) goes to the slow path.
    movd    xmm0, eax
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest   xmm1, xmm1
    jnz     rsqrt_full_bad_new2

    # xmm1 = 1.0 / sqrt(xmm0)
    sqrtss  xmm0, xmm0
    movd    xmm1, [rip + one]
    divss   xmm1, xmm0
    movd    eax, xmm1

    # Round at bit 15: add bit 14, then clear bits 14..0.
    add     eax, 0x00004000
    and     eax, 0xFFFF8000

    # Emulate regalloc mov (result copied back to a vector register).
    movd    xmm0, eax
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full_bad_new2
# Slow path of rsqrt_full_nb_gpr. It is reached by a jump, not a call, so on
# entry rsp still points at the return address of the exported routine's
# caller.
# In:    edi = original (unmasked) operand bit pattern
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, flags, plus whatever _rsqrt_fallback clobbers
#
# NOTE(review): every operand with the sign bit set -- including -0 and
# negative NaNs -- returns 0x7FC00000 here, whereas rsqrt_full_bad returns
# edi | 0x7F800000 for -0 and edi | 0x00400000 for NaNs. Confirm whether the
# difference is intended.
#-----------------------------------------------------------------------
rsqrt_full_bad_new2:
    # Unsigned compare: only +0 and positive denormals are below 0x00800000.
    cmp     edi, 0x00800000
    jb      rsqrt_full_bad_new_fallback2

    movd    xmm0, edi
    rsqrtss xmm1, xmm0

    # Sign bit set => return the fixed quiet NaN pattern instead of the
    # rsqrtss result.
    test    edi, edi
    js      rsqrt_full_bad_new2_nan

    movd    eax, xmm1
    ret

rsqrt_full_bad_new_fallback2:
    # Tail call. A nested `call` from here would push a second return address
    # and enter the fallback with the stack 8 bytes off the 16-byte alignment
    # that the System V AMD64 ABI (implied by the edi/eax usage) requires at a
    # call site. edi still holds the untouched argument, and the fallback's
    # `ret` returns straight to our caller with its result in eax.
    jmp     _rsqrt_fallback

rsqrt_full_bad_new2_nan:
    mov     eax, 0x7FC00000
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_full_bad
# Slow path shared by rsqrt_full, rsqrt_full_gpr, rsqrt_newton and rsqrt_hack.
# It is reached by a jump, not a call, so on entry rsp still points at the
# return address of the exported routine's caller.
# In:    edi = original (unmasked) operand bit pattern
# Out:   eax = result bit pattern:
#          NaN operand        -> operand with bit 22 set
#          +0 / -0            -> operand with the exponent field set to all ones
#          operand below zero -> 0x7FC00000
#          0x7F800000 (+inf)  -> 0
#          anything else      -> whatever _rsqrt_fallback returns
# Clobb: xmm0, xmm1, flags, plus whatever _rsqrt_fallback clobbers
#-----------------------------------------------------------------------
rsqrt_full_bad:
    # Classify the original operand against 0.0. The three branches consume
    # the flags of this single ucomiss, so their order matters: unordered sets
    # PF, ZF and CF together and must be tested first.
    xorps   xmm1, xmm1
    movd    xmm0, edi
    ucomiss xmm0, xmm1
    jp      rsqrt_full_nan          # unordered: operand is a NaN
    je      rsqrt_full_zero         # equal to zero (either sign)
    jc      rsqrt_full_neg          # below zero

    cmp     edi, 0x7F800000
    je      rsqrt_full_inf

    # TODO: Full Denormal Implementation
    #
    # Tail call. A nested `call` from here would push a second return address
    # and enter the fallback with the stack 8 bytes off the 16-byte alignment
    # that the System V AMD64 ABI (implied by the edi/eax usage) requires at a
    # call site. edi still holds the untouched argument, and the fallback's
    # `ret` returns straight to our caller with its result in eax.
    jmp     _rsqrt_fallback

rsqrt_full_neg:
    mov     eax, 0x7FC00000         # fixed quiet NaN pattern
    ret

rsqrt_full_inf:
    xor     eax, eax                # bit pattern of +0.0
    ret

rsqrt_full_nan:
    mov     eax, edi
    or      eax, 0x00400000         # set the top mantissa bit (bit 22)
    ret

rsqrt_full_zero:
    mov     eax, edi
    or      eax, 0x7F800000         # infinity carrying the operand's sign bit
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_newton / _rsqrt_newton
# Refines the hardware estimate y with one step of
#     y' = y * (1.5 - (0.5 * x) * y * y)
# and rounds the result to bits 31..15.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, xmm2, flags (fast path; the slow path is rsqrt_full_bad)
#-----------------------------------------------------------------------
_rsqrt_newton:
rsqrt_newton:
    movd    xmm0, edi

    # Reduce the operand: keep bits 31..15, then force bit 15 on.
    pand    xmm0, [rip + top_mask]
    por     xmm0, [rip + penultimate_bit]

    # Operands not greater than the threshold (NaN included) take the shared
    # slow path.
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest   xmm1, xmm1
    jnz     rsqrt_full_bad

    rsqrtps xmm1, xmm0                      # xmm1 = y  (estimate; only lane 0 is used)
    mulss   xmm0, [rip + half]              # xmm0 = 0.5 * x
    vmulss  xmm2, xmm1, xmm1                # xmm2 = y * y
    mulss   xmm2, xmm0                      # xmm2 = 0.5 * x * y * y
    movaps  xmm0, [rip + one_point_five]    # aligned 16-byte load of 1.5
    subss   xmm0, xmm2                      # xmm0 = 1.5 - 0.5 * x * y * y
    mulss   xmm0, xmm1                      # xmm0 = y'

    # Round at bit 15: add bit 14, then clear bits 14..0.
    paddd   xmm0, [rip + ultimate_bit]
    pand    xmm0, [rip + top_mask]

    movd    eax, xmm0
    ret
|
|
|
|
#-----------------------------------------------------------------------
# rsqrt_hack / _rsqrt_hack
# Takes the hardware estimate of the reduced operand and adjusts its bit
# pattern with an integer correction of either 0x47ff or 0x4800, chosen from
# the original operand's bits, before clearing bits 14..0.
# In:    edi = operand bit pattern (interpreted as a single-precision float)
# Out:   eax = result bit pattern
# Clobb: xmm0, xmm1, xmm2, xmm9, flags (fast path; slow path is rsqrt_full_bad)
#-----------------------------------------------------------------------
_rsqrt_hack:
rsqrt_hack:
    movd    xmm9, edi                       # xmm9 keeps the unmodified operand

    # Reduce the operand: keep bits 31..15, then force bit 15 on.
    vpand   xmm0, xmm9, [rip + top_mask]
    por     xmm0, [rip + penultimate_bit]

    # detect NaNs, negatives, zeros, denormals and infinities
    vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
    ptest   xmm1, xmm1
    jnz     rsqrt_full_bad

    # calculate x64 estimate
    rsqrtps xmm0, xmm0

    # calculate correction factor
    vpslld  xmm1, xmm9, 8                   # drop the top 8 bits of the operand
    vpsrad  xmm2, xmm1, 31                  # all-ones if original bit 23 is set, else 0
    paddd   xmm1, [rip + magic1]
    pcmpgtd xmm1, [rip + magic2]            # signed compare -> all-ones or 0
    pxor    xmm1, xmm2                      # combine the two masks (0 or -1)
    movaps  xmm2, [rip + magic3]
    psubd   xmm2, xmm1                      # 0x47ff - 0 or 0x47ff - (-1) = 0x4800

    # correct x64 estimate
    paddd   xmm0, xmm2
    pand    xmm0, [rip + top_mask]

    movd    eax, xmm0
    ret
|