;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log10.asm
;
; An implementation of the log10 libm function.
;
; Prototype:
;
; double log10(double x);
;
;
; Algorithm:
; Similar to the one presented in log.asm
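;
; In outline: x is written as 2^m * G * (1 - r), where G is the
; significand rounded to a 256-entry grid (top 8 mantissa bits plus a
; rounding bit) and r is the relative distance of the significand from G.
; Then, roughly,
;   log10(x) = m*log10(2) + log10(G) - log10(e)*(r + r^2/2 + r^3/3 + ...)
; with log10(G) taken from the __log10_256_lead/__log10_256_tail tables,
; reciprocals of the grid points from __log_F_inv_qword, and the series
; truncated after r^6/6.  Arguments in roughly (0.9375, 1.0625) instead
; use a series in u = 2r/(2 + r), r = x - 1, to avoid cancellation near 1.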
;
.const
ALIGN 16
__real_ninf DQ 0fff0000000000000h ; -inf
DQ 0000000000000000h
__real_inf DQ 7ff0000000000000h ; +inf
DQ 0000000000000000h
__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN
DQ 0000000000000000h
__real_qnanbit DQ 0008000000000000h
DQ 0000000000000000h
__int_1023 DQ 00000000000003ffh
DQ 0000000000000000h
__mask_001 DQ 0000000000000001h
DQ 0000000000000000h
__mask_mant DQ 000FFFFFFFFFFFFFh ; mask for mantissa bits
DQ 0000000000000000h
__mask_mant_top8 DQ 000ff00000000000h ; mask for top 8 mantissa bits
DQ 0000000000000000h
__mask_mant9 DQ 0000080000000000h ; mask for 9th mantissa bit
DQ 0000000000000000h
__real_log10_e DQ 3fdbcb7b1526e50eh
DQ 0000000000000000h
__real_log10_e_lead DQ 3fdbcb7800000000h ; log10e_lead 4.34293746948242187500e-01
DQ 0000000000000000h
__real_log10_e_tail DQ 3ea8a93728719535h ; log10e_tail 7.3495500964015109100644e-7
DQ 0000000000000000h
__real_log10_2_lead DQ 3fd3441350000000h
DQ 0000000000000000h
__real_log10_2_tail DQ 3e03ef3fde623e25h
DQ 0000000000000000h
__real_two DQ 4000000000000000h ; 2
DQ 0000000000000000h
__real_one DQ 3ff0000000000000h ; 1
DQ 0000000000000000h
__real_half DQ 3fe0000000000000h ; 1/2
DQ 0000000000000000h
__mask_100 DQ 0000000000000100h
DQ 0000000000000000h
__real_1_over_512 DQ 3f60000000000000h
DQ 0000000000000000h
__real_1_over_2 DQ 3fe0000000000000h
DQ 0000000000000000h
__real_1_over_3 DQ 3fd5555555555555h
DQ 0000000000000000h
__real_1_over_4 DQ 3fd0000000000000h
DQ 0000000000000000h
__real_1_over_5 DQ 3fc999999999999ah
DQ 0000000000000000h
__real_1_over_6 DQ 3fc5555555555555h
DQ 0000000000000000h
__real_neg_1023 DQ 0c08ff80000000000h
DQ 0000000000000000h
__mask_2045 DQ 00000000000007fdh
DQ 0000000000000000h
__real_threshold DQ 3fb0000000000000h ; .0625
DQ 0000000000000000h
__real_near_one_lt DQ 3fee000000000000h ; .9375
DQ 0000000000000000h
__real_near_one_gt DQ 3ff1000000000000h ; 1.0625
DQ 0000000000000000h
__real_min_norm DQ 0010000000000000h
DQ 0000000000000000h
__real_notsign DQ 7ffFFFFFFFFFFFFFh ; mask off the sign bit
DQ 0000000000000000h
__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
DQ 0000000000000000h
__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
DQ 0000000000000000h
__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
DQ 0000000000000000h
__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
DQ 0000000000000000h
__mask_lower DQ 0ffffffff00000000h
DQ 0000000000000000h
; these codes and the ones in the corresponding .c file have to match
__flag_x_zero DD 00000001
__flag_x_neg DD 00000002
__flag_x_nan DD 00000003
EXTRN __log10_256_lead:QWORD
EXTRN __log10_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
EXTRN __use_fma3_lib:DWORD
; local variable storage offsets
save_xmm6 EQU 20h
dummy_space EQU 30h
stack_size EQU 058h
include fm.inc
fname TEXTEQU <log10>
fname_special TEXTEQU <_log10_special>
EXTERN fname_special:PROC
.code
ALIGN 16
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
SaveXmm xmm6, save_xmm6
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Llog10_fma3
Llog10_sse2:
; compute exponent part
movapd xmm3, xmm0
movapd xmm4, xmm0
psrlq xmm3, 52
movd rax, xmm0
psubq xmm3, XMMWORD PTR __int_1023 ; xmm3 <-- unbiased exponent
; NaN or inf
movapd xmm5, xmm0
andpd xmm5, XMMWORD PTR __real_inf
comisd xmm5, QWORD PTR __real_inf
je Llog10_sse2_x_is_inf_or_nan
movapd xmm2, xmm0
cvtdq2pd xmm6, xmm3 ; xmm6 <-- unbiased exp as double
pand xmm2, XMMWORD PTR __mask_mant
subsd xmm4, QWORD PTR __real_one
comisd xmm6, QWORD PTR __real_neg_1023
je Llog10_sse2_denormal_adjust
Llog10_sse2_continue_common:
andpd xmm4, XMMWORD PTR __real_notsign
; compute index into the log tables
mov r9, rax
and rax, QWORD PTR __mask_mant_top8
and r9, QWORD PTR __mask_mant9
shl r9, 1
add rax, r9
movd xmm1, rax
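; rax (and xmm1) now hold the top 8 mantissa bits rounded to nearest via
; the 9th mantissa bit; shifted right by 44 this gives the table index j
; in [0, 256], and OR-ing in __real_half below forms F = (256 + j)/512,
; the halved significand Y rounded to the nearest 1/512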
; near one codepath
comisd xmm4, QWORD PTR __real_threshold
jb Llog10_sse2_near_one
; F, Y
shr rax, 44
por xmm2, XMMWORD PTR __real_half
por xmm1, XMMWORD PTR __real_half
lea r9, QWORD PTR __log_F_inv_qword
; check for negative numbers or zero
xorpd xmm5, xmm5
comisd xmm0, xmm5
jbe Llog10_sse2_x_is_zero_or_neg
; f = F - Y, r = f * inv
subsd xmm1, xmm2
mulsd xmm1, QWORD PTR [r9+rax*8]
movapd xmm2, xmm1
movapd xmm0, xmm1
lea r9, QWORD PTR __log10_256_lead
; poly
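; the polynomial below evaluates r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6,
; the leading terms of -log(1 - r), which is then scaled by log10(e)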
movsd xmm3, QWORD PTR __real_1_over_6
movsd xmm1, QWORD PTR __real_1_over_3
mulsd xmm3, xmm2
mulsd xmm1, xmm2
mulsd xmm0, xmm2
movapd xmm4, xmm0
addsd xmm3, QWORD PTR __real_1_over_5
addsd xmm1, QWORD PTR __real_1_over_2
mulsd xmm4, xmm0
mulsd xmm3, xmm2
mulsd xmm1, xmm0
addsd xmm3, QWORD PTR __real_1_over_4
addsd xmm1, xmm2
mulsd xmm3, xmm4
addsd xmm1, xmm3
movsd xmm5, QWORD PTR __real_log10_2_tail
mulsd xmm1, QWORD PTR __real_log10_e
; m*log10(2) + log10(G) - poly
mulsd xmm5, xmm6
subsd xmm5, xmm1
movsd xmm0, QWORD PTR [r9+rax*8]
lea rdx, QWORD PTR __log10_256_tail
movsd xmm2, QWORD PTR [rdx+rax*8]
movsd xmm4, QWORD PTR __real_log10_2_lead
mulsd xmm4, xmm6
addsd xmm0, xmm4
addsd xmm2, xmm5
addsd xmm0, xmm2
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog10_sse2_near_one:
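; for x near 1, set r = x - 1 and u = 2r/(2 + r); then
;   log(x) ~= u + ca1*u^3 + ca2*u^5 + ca3*u^7 + ca4*u^9
; u is formed as r minus the correction term r*u/2, and the total is
; multiplied by log10(e), split into lead and tail parts (with r itself
; split into high and low halves) to limit rounding error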
; r = x - 1.0
movsd xmm2, QWORD PTR __real_two
subsd xmm0, QWORD PTR __real_one ; r
addsd xmm2, xmm0
movapd xmm1, xmm0
divsd xmm1, xmm2 ; r/(2+r) = u/2
movsd xmm4, QWORD PTR __real_ca2
movsd xmm5, QWORD PTR __real_ca4
movapd xmm6, xmm0
mulsd xmm6, xmm1 ; correction
addsd xmm1, xmm1 ; u
movapd xmm2, xmm1
mulsd xmm2, xmm1 ; u^2
mulsd xmm4, xmm2
mulsd xmm5, xmm2
addsd xmm4, QWORD PTR __real_ca1
addsd xmm5, QWORD PTR __real_ca3
mulsd xmm2, xmm1 ; u^3
mulsd xmm4, xmm2
mulsd xmm2, xmm2
mulsd xmm2, xmm1 ; u^7
mulsd xmm5, xmm2
movsd xmm2, QWORD PTR __real_log10_e_tail
addsd xmm4, xmm5
subsd xmm4, xmm6
movsd xmm6, QWORD PTR __real_log10_e_lead
movapd xmm3, xmm0
pand xmm3, XMMWORD PTR __mask_lower
subsd xmm0, xmm3
addsd xmm4, xmm0
movapd xmm0, xmm3
movapd xmm1, xmm4
mulsd xmm4, xmm2
mulsd xmm0, xmm2
mulsd xmm1, xmm6
mulsd xmm3, xmm6
addsd xmm0, xmm4
addsd xmm0, xmm1
addsd xmm0, xmm3
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
Llog10_sse2_denormal_adjust:
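; x is denormal: OR an exponent of 1.0 into the mantissa and subtract 1.0
; to renormalize, then recover the true unbiased exponent by subtracting
; 2045 (exponent bias 1023 plus the 1022 scaling) from the new biased
; exponent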
por xmm2, XMMWORD PTR __real_one
subsd xmm2, QWORD PTR __real_one
movsd xmm5, xmm2
pand xmm2, XMMWORD PTR __mask_mant
movd rax, xmm2
psrlq xmm5, 52
psubd xmm5, XMMWORD PTR __mask_2045
cvtdq2pd xmm6, xmm5
jmp Llog10_sse2_continue_common
ALIGN 16
Llog10_sse2_x_is_zero_or_neg:
jne Llog10_sse2_x_is_neg
movsd xmm1, QWORD PTR __real_ninf
mov r8d, DWORD PTR __flag_x_zero
call fname_special
jmp Llog10_sse2_finish
ALIGN 16
Llog10_sse2_x_is_neg:
movsd xmm1, QWORD PTR __real_neg_qnan
mov r8d, DWORD PTR __flag_x_neg
call fname_special
jmp Llog10_sse2_finish
ALIGN 16
Llog10_sse2_x_is_inf_or_nan:
cmp rax, QWORD PTR __real_inf
je Llog10_sse2_finish
cmp rax, QWORD PTR __real_ninf
je Llog10_sse2_x_is_neg
or rax, QWORD PTR __real_qnanbit
movd xmm1, rax
mov r8d, DWORD PTR __flag_x_nan
call fname_special
jmp Llog10_sse2_finish
ALIGN 16
Llog10_sse2_finish:
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog10_fma3:
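; same algorithm as the SSE2 path above, expressed with AVX/FMA3
; instructions: exponent/table-index extraction, degree-6 polynomial,
; and a separate near-one series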
; compute exponent part
xor rax,rax
vpsrlq xmm3,xmm0,52
vmovq rax,xmm0
vpsubq xmm3,xmm3,QWORD PTR __int_1023
vcvtdq2pd xmm6,xmm3 ; xmm6 <-- (double)xexp
; NaN or Inf?
vpand xmm5,xmm0,__real_inf
vcomisd xmm5,QWORD PTR __real_inf
je Llog10_fma3_x_is_inf_or_nan
; negative number or zero?
vpxor xmm5,xmm5,xmm5
vcomisd xmm0,xmm5
jbe Llog10_fma3_x_is_zero_or_neg
vpand xmm2,xmm0,__mask_mant
vsubsd xmm4,xmm0,QWORD PTR __real_one
; Subnormal?
vcomisd xmm6,QWORD PTR __real_neg_1023
je Llog10_fma3_denormal_adjust
Llog10_fma3_continue_common:
; compute index into the log tables
vpand xmm1,xmm0,DWORD PTR __mask_mant_top8
vpand xmm3,xmm0,DWORD PTR __mask_mant9
vpsllq xmm3,xmm3,1
vpaddq xmm1,xmm3,xmm1
vmovq rax,xmm1
; near one codepath
vpand xmm4,xmm4,DWORD PTR __real_notsign
vcomisd xmm4,QWORD PTR __real_threshold
jb Llog10_fma3_near_one
; F,Y
shr rax,44
vpor xmm2,xmm2,DWORD PTR __real_half
vpor xmm1,xmm1,DWORD PTR __real_half
lea r9,DWORD PTR __log_F_inv_qword
; f = F - Y,r = f * inv
vsubsd xmm1,xmm1,xmm2
vmulsd xmm1,xmm1,QWORD PTR [r9 + rax * 8]
lea r9,DWORD PTR __log10_256_lead
; poly
vmulsd xmm0,xmm1,xmm1 ; r*r
vmovsd xmm3,QWORD PTR __real_1_over_6
vmovsd xmm5,QWORD PTR __real_1_over_3
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
vfmadd213sd xmm5,xmm1,QWORD PTR __real_half ; 1/2+r*1/3
movsd xmm4,xmm0 ; r*r
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6)
vmulsd xmm4,xmm0,xmm0 ; r*r*r*r
vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r
vfmadd231sd xmm1,xmm3,xmm4
vmulsd xmm1,xmm1,QWORD PTR __real_log10_e
; m*log10(2) + log10(G) - poly (poly already scaled by log10(e))
vmovsd xmm5,QWORD PTR __real_log10_2_tail
vfmsub213sd xmm5,xmm6,xmm1
movsd xmm0,QWORD PTR [r9 + rax * 8]
lea rdx,DWORD PTR __log10_256_tail
movsd xmm2,QWORD PTR [rdx + rax * 8]
vaddsd xmm2,xmm2,xmm5
vfmadd231sd xmm0,xmm6,QWORD PTR __real_log10_2_lead
vaddsd xmm0,xmm0,xmm2
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog10_fma3_near_one:
; r = x - 1.0
vmovsd xmm2,QWORD PTR __real_two
vsubsd xmm0,xmm0,QWORD PTR __real_one ; r
vaddsd xmm2,xmm2,xmm0
vdivsd xmm1,xmm0,xmm2 ; r/(2+r) = u/2
vmovsd xmm4,QWORD PTR __real_ca2
vmovsd xmm5,QWORD PTR __real_ca4
vmulsd xmm6,xmm0,xmm1 ; correction
vaddsd xmm1,xmm1,xmm1 ; u
vmulsd xmm2,xmm1,xmm1 ; u^2
vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1
vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3
vmulsd xmm2,xmm2,xmm1 ; u^3
vmulsd xmm4,xmm4,xmm2
vmulsd xmm2,xmm2,xmm2
vmulsd xmm2,xmm2,xmm1 ; u^7
vmulsd xmm5,xmm5,xmm2
vaddsd xmm4,xmm4,xmm5
vsubsd xmm4,xmm4,xmm6
vpand xmm3,xmm0,QWORD PTR __mask_lower
vsubsd xmm0,xmm0,xmm3
vaddsd xmm4,xmm4,xmm0
vmulsd xmm1,xmm4,QWORD PTR __real_log10_e_lead
vmulsd xmm4,xmm4,QWORD PTR __real_log10_e_tail
vmulsd xmm0,xmm3,QWORD PTR __real_log10_e_tail
vmulsd xmm3,xmm3,QWORD PTR __real_log10_e_lead
vaddsd xmm0,xmm0,xmm4
vaddsd xmm0,xmm0,xmm1
vaddsd xmm0,xmm0,xmm3
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
Llog10_fma3_denormal_adjust:
vpor xmm2,xmm2,QWORD PTR __real_one
vsubsd xmm2,xmm2,QWORD PTR __real_one
vpsrlq xmm5,xmm2,52
vpand xmm2,xmm2,QWORD PTR __mask_mant
vmovapd xmm0,xmm2
vpsubd xmm5,xmm5,DWORD PTR __mask_2045
vcvtdq2pd xmm6,xmm5
jmp Llog10_fma3_continue_common
ALIGN 16
Llog10_fma3_x_is_zero_or_neg:
jne Llog10_fma3_x_is_neg
vmovsd xmm1,QWORD PTR __real_ninf
mov r8d,DWORD PTR __flag_x_zero
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog10_fma3_x_is_neg:
vmovsd xmm1,QWORD PTR __real_neg_qnan
mov r8d,DWORD PTR __flag_x_neg
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog10_fma3_x_is_inf_or_nan:
cmp rax,QWORD PTR __real_inf
je Llog10_fma3_finish
cmp rax,QWORD PTR __real_ninf
je Llog10_fma3_x_is_neg
or rax,QWORD PTR __real_qnanbit
movd xmm1,rax
mov r8d,DWORD PTR __flag_x_nan
call fname_special
jmp Llog10_fma3_finish
ALIGN 16
Llog10_fma3_finish:
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
fname endp
END