; reactos/sdk/lib/crt/math/libm_sse2/log.asm

;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log.asm
;
; An implementation of the log libm function.
;
; Prototype:
;
; double log(double x);
;
;
; Algorithm:
;
; Based on:
; Ping-Tak Peter Tang
; "Table-driven implementation of the logarithm function in IEEE
; floating-point arithmetic"
; ACM Transactions on Mathematical Software (TOMS)
; Volume 16, Issue 4 (December 1990)
;
;
; x very close to 1.0 is handled separately; for all other x a brief
; explanation of the algorithm is given below
;
; x = (2^m)*A
; x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
; x = (2^m)*2*(G/2+g/2)
; x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
;
; Y = (2^(-1))*(2^(-m))*(2^m)*A
; Now, range of Y is: 0.5 <= Y < 1
;
; F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
; Now, range of F is: 256 <= F <= 512
; F = F / 512
; Now, range of F is: 0.5 <= F <= 1
;
; f = -(Y-F), with (f <= 2^(-10))
;
; log(x) = m*log(2) + log(2) + log(F-f)
; log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
; log(x) = m*log(2) + log(2*F) + log(1-r)
;
; r = (f/F), with (r <= 2^(-9))
; r = f*(1/F) with (1/F) precomputed to avoid division
;
; log(x) = m*log(2) + log(G) - poly
;
; log(G) is precomputed
; poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5 + (r^6)/6
;
; log(2) and log(G) need to be maintained in extra precision
; to avoid losing precision in the calculations
;
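;
; A minimal C sketch of the non-near-one path follows, assuming that
; __log_F_inv_qword[j] holds 512/(256+j) and that __log_256_lead/__log_256_tail
; hold a high/low split of ln(1 + j/256); the sketch recomputes those values
; inline and uses a single ln(2) constant instead of the lead/tail split.
;
;   /* requires <stdint.h>, <string.h>, <math.h> */
;   double log_sketch(double x)                  /* positive, normal x, away from 1.0 */
;   {
;       uint64_t bits; memcpy(&bits, &x, 8);
;       int m = (int)((bits >> 52) & 0x7ff) - 1023;      /* unbiased exponent         */
;       int j = (int)((bits >> 44) & 0xff)               /* top 8 mantissa bits       */
;             + (int)((bits >> 43) & 1);                 /* rounded with the 9th bit  */
;       uint64_t ybits = (bits & 0x000fffffffffffffULL) | 0x3fe0000000000000ULL;
;       double Y; memcpy(&Y, &ybits, 8);                 /* mantissa scaled to [0.5, 1) */
;       double F = (256 + j) / 512.0;                    /* nearest multiple of 1/512   */
;       double r = (F - Y) * (512.0 / (256 + j));        /* r = f/F, |r| <= 2^(-9)      */
;       double poly = r + r*r*(1.0/2 + r*(1.0/3 + r*(1.0/4 + r*(1.0/5 + r*(1.0/6)))));
;       return m*0.6931471805599453 + log1p(j/256.0) - poly;
;   }
;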
.const
ALIGN 16
__real_ninf DQ 0fff0000000000000h ; -inf
DQ 0000000000000000h
__real_inf DQ 7ff0000000000000h ; +inf
DQ 0000000000000000h
__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN
DQ 0000000000000000h
__real_qnanbit DQ 0008000000000000h
DQ 0000000000000000h
__real_min_norm DQ 0010000000000000h
DQ 0000000000000000h
__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits
DQ 0000000000000000h
__mask_1023 DQ 00000000000003ffh
DQ 0000000000000000h
__mask_001 DQ 0000000000000001h
DQ 0000000000000000h
__mask_mant_all8 DQ 000ff00000000000h
DQ 0000000000000000h
__mask_mant9 DQ 0000080000000000h
DQ 0000000000000000h
__real_two DQ 4000000000000000h ; 2
DQ 0000000000000000h
__real_one DQ 3ff0000000000000h ; 1
DQ 0000000000000000h
__real_near_one_lt DQ 3fee000000000000h ; .9375
DQ 0000000000000000h
__real_near_one_gt DQ 3ff1000000000000h ; 1.0625
DQ 0000000000000000h
__real_half DQ 3fe0000000000000h ; 1/2
DQ 0000000000000000h
__mask_100 DQ 0000000000000100h
DQ 0000000000000000h
__real_1_over_512 DQ 3f60000000000000h
DQ 0000000000000000h
__real_1_over_2 DQ 3fe0000000000000h
DQ 0000000000000000h
__real_1_over_3 DQ 3fd5555555555555h
DQ 0000000000000000h
__real_1_over_4 DQ 3fd0000000000000h
DQ 0000000000000000h
__real_1_over_5 DQ 3fc999999999999ah
DQ 0000000000000000h
__real_1_over_6 DQ 3fc5555555555555h
DQ 0000000000000000h
__mask_1023_f DQ 0c08ff80000000000h
DQ 0000000000000000h
__mask_2045 DQ 00000000000007fdh
DQ 0000000000000000h
__real_threshold DQ 3fb0000000000000h ; .0625
DQ 0000000000000000h
__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit
DQ 0000000000000000h
__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
DQ 0000000000000000h
__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
DQ 0000000000000000h
__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
DQ 0000000000000000h
__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
DQ 0000000000000000h
__real_log2_lead DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01
DQ 00000000000000000h
__real_log2_tail DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08
DQ 00000000000000000h
; these codes and the ones in the corresponding .c file have to match
__flag_x_zero DD 00000001
__flag_x_neg DD 00000002
__flag_x_nan DD 00000003
EXTRN __log_256_lead:QWORD
EXTRN __log_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
EXTRN __use_fma3_lib:DWORD
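; Note: the three tables above are defined elsewhere in the library; indexed by
; j = 0..256 they are assumed to hold 512/(256+j) (__log_F_inv_qword) and a
; high/low split of ln(1 + j/256) (__log_256_lead/__log_256_tail).
; __use_fma3_lib is presumably set during CRT startup when the CPU supports
; FMA3; it selects between the SSE2 and FMA3 code paths below.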
fname TEXTEQU <log>
fname_special TEXTEQU <_log_special>
; define local variable storage offsets
save_xmm6 EQU 20h
dummy_space EQU 40h
stack_size EQU 58h
include fm.inc
; external function
EXTERN fname_special:PROC
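; fname_special appears to receive the input in xmm0, a proposed result in
; xmm1 and one of the __flag_x_* codes in r8d, and to return the final value
; (possibly after raising exceptions / reporting the error) in xmm0.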
.code
ALIGN 16
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
SaveXmm xmm6, save_xmm6
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Llog_fma3
Llog_sse2:
; compute exponent part
movdqa xmm3, xmm0
movapd xmm4, xmm0
psrlq xmm3, 52
movd rax, xmm0
psubq xmm3, XMMWORD PTR __mask_1023
; NaN or inf
mov rcx, rax
btr rcx, 63
cmp rcx, QWORD PTR __real_inf
jae __x_is_inf_or_nan
movdqa xmm2, xmm0
cvtdq2pd xmm6, xmm3 ; xexp
pand xmm2, XMMWORD PTR __real_mant
subsd xmm4, QWORD PTR __real_one
comisd xmm6, QWORD PTR __mask_1023_f
je __denormal_adjust
__continue_common:
andpd xmm4, XMMWORD PTR __real_notsign
; compute index into the log tables
mov r9, rax
and rax, QWORD PTR __mask_mant_all8
and r9, QWORD PTR __mask_mant9
shl r9, 1
add rax, r9
movd xmm1, rax
; near one codepath
comisd xmm4, QWORD PTR __real_threshold
jb __near_one
; F, Y
shr rax, 44
por xmm2, XMMWORD PTR __real_half
por xmm1, XMMWORD PTR __real_half
lea r9, __log_F_inv_qword
; check for negative numbers or zero
xorpd xmm5, xmm5
comisd xmm0, xmm5
jbe __x_is_zero_or_neg
; f = F - Y, r = f * inv
subsd xmm1, xmm2 ; xmm1 <-- f = F - Y
mulsd xmm1, QWORD PTR [r9+rax*8] ; xmm1 <-- r = f * inv
movapd xmm2, xmm1 ; xmm2 <-- copy of r
movapd xmm0, xmm1 ; xmm0 <-- copy of r
lea r9, QWORD PTR __log_256_lead
; poly
movsd xmm3, QWORD PTR __real_1_over_6
movsd xmm1, QWORD PTR __real_1_over_3
mulsd xmm3, xmm2 ; xmm3 <-- r/6
mulsd xmm1, xmm2 ; xmm1 <-- r/3
mulsd xmm0, xmm2 ; xmm0 <-- r*r
movapd xmm4, xmm0 ; xmm4 <-- copy of r*r
addsd xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5
addsd xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2
mulsd xmm4, xmm0 ; xmm4 <-- r^4
mulsd xmm3, xmm2 ; xmm3 <-- (r/6 + 1/5)*r
mulsd xmm1, xmm0 ; xmm1 <-- (r/3 + 1/2)*r^2
addsd xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4
addsd xmm1, xmm2 ; xmm1 <-- (r/3 + 1/2)*r^2 + r
mulsd xmm3, xmm4 ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
addsd xmm1, xmm3 ; xmm1 <-- poly
; m*log(2) + log(G) - poly
movsd xmm5, QWORD PTR __real_log2_tail
mulsd xmm5, xmm6 ; xmm5 <-- m*log2_tail
subsd xmm5, xmm1 ; xmm5 <-- m*log2_tail - poly
movsd xmm0, QWORD PTR [r9+rax*8] ; xmm0 <-- log(G)_lead
lea rdx, QWORD PTR __log_256_tail
movsd xmm2, QWORD PTR [rdx+rax*8] ; xmm2 <-- log(G)_tail
addsd xmm2, xmm5 ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail
movsd xmm4, QWORD PTR __real_log2_lead
mulsd xmm4, xmm6 ; xmm4 <-- m*log2_lead
addsd xmm0, xmm4 ; xmm0 <-- m*log2_lead + log(G)_lead
addsd xmm0, xmm2 ; xmm0 <-- m*log(2) + log(G) - poly = log(x)
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
__near_one:
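; For |x - 1| below __real_threshold (0.0625) the table-driven reduction would
; lose accuracy, so log(1+r) is evaluated directly. With u = 2r/(2+r) the code
; below computes
;   log(1+r) = r - r*u/2 + u^3*(ca1 + ca2*u^2) + u^7*(ca3 + ca4*u^2)
; i.e. the expansion log((2+u)/(2-u)) = u + u^3/12 + u^5/80 + u^7/448 + ...,
; with ca1..ca4 close to 1/12, 1/80, 1/448 and 1/2304.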
; r = x - 1.0
movsd xmm2, QWORD PTR __real_two
subsd xmm0, QWORD PTR __real_one ; r
addsd xmm2, xmm0
movsd xmm1, xmm0
divsd xmm1, xmm2 ; r/(2+r) = u/2
movsd xmm4, QWORD PTR __real_ca2
movsd xmm5, QWORD PTR __real_ca4
movsd xmm6, xmm0
mulsd xmm6, xmm1 ; correction
addsd xmm1, xmm1 ; u
movsd xmm2, xmm1
mulsd xmm2, xmm1 ; u^2
mulsd xmm4, xmm2
mulsd xmm5, xmm2
addsd xmm4, __real_ca1
addsd xmm5, __real_ca3
mulsd xmm2, xmm1 ; u^3
mulsd xmm4, xmm2
mulsd xmm2, xmm2
mulsd xmm2, xmm1 ; u^7
mulsd xmm5, xmm2
addsd xmm4, xmm5
subsd xmm4, xmm6
addsd xmm0, xmm4
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
__denormal_adjust:
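; denormal input: the exponent field is zero, so rescale first. OR-ing the
; mantissa with the bits of 1.0 gives 1 + x*2^1022 exactly; subtracting 1.0
; leaves x*2^1022 as a normal number, and its biased exponent minus 2045
; (= 1023 + 1022) is the true unbiased exponent of x.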
por xmm2, XMMWORD PTR __real_one
subsd xmm2, QWORD PTR __real_one
movsd xmm5, xmm2
pand xmm2, XMMWORD PTR __real_mant
movd rax, xmm2
psrlq xmm5, 52
psubd xmm5, XMMWORD PTR __mask_2045
cvtdq2pd xmm6, xmm5
jmp __continue_common
ALIGN 16
__x_is_zero_or_neg:
jne __x_is_neg ; ZF from the comisd above is set only for x == +/-0.0
movsd xmm1, QWORD PTR __real_ninf
mov r8d, DWORD PTR __flag_x_zero
call fname_special
jmp __finish
ALIGN 16
__x_is_neg:
movsd xmm1, QWORD PTR __real_neg_qnan
mov r8d, DWORD PTR __flag_x_neg
call fname_special
jmp __finish
ALIGN 16
__x_is_inf_or_nan:
cmp rax, QWORD PTR __real_inf
je __finish
cmp rax, QWORD PTR __real_ninf
je __x_is_neg
or rax, QWORD PTR __real_qnanbit
movd xmm1, rax
mov r8d, DWORD PTR __flag_x_nan
call fname_special
jmp __finish
ALIGN 16
__finish:
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog_fma3:
; compute exponent part
xor rax,rax
vpsrlq xmm3,xmm0,52
vmovq rax,xmm0
vpsubq xmm3,xmm3,XMMWORD PTR __mask_1023
vcvtdq2pd xmm6,xmm3 ; xexp
; NaN or inf
vpand xmm5,xmm0,XMMWORD PTR __real_inf
vcomisd xmm5,QWORD PTR __real_inf
je Llog_fma3_x_is_inf_or_nan
; check for negative numbers or zero
vpxor xmm5,xmm5,xmm5
vcomisd xmm0,xmm5
jbe Llog_fma3_x_is_zero_or_neg
vpand xmm2,xmm0,XMMWORD PTR __real_mant
vsubsd xmm4,xmm0,QWORD PTR __real_one
vcomisd xmm6,QWORD PTR __mask_1023_f
je Llog_fma3_denormal_adjust
Llog_fma3_continue_common:
; compute index into the log tables
vpand xmm1,xmm0,XMMWORD PTR __mask_mant_all8
vpand xmm3,xmm0,XMMWORD PTR __mask_mant9
vpsllq xmm3,xmm3,1
vpaddq xmm1,xmm3,xmm1
vmovq rax,xmm1
; near one codepath
vpand xmm4,xmm4,XMMWORD PTR __real_notsign
vcomisd xmm4,QWORD PTR __real_threshold
jb Llog_fma3_near_one
; F,Y
shr rax,44
vpor xmm2,xmm2,XMMWORD PTR __real_half
vpor xmm1,xmm1,XMMWORD PTR __real_half
lea r9,QWORD PTR __log_F_inv_qword
; f = F - Y,r = f * inv
vsubsd xmm1,xmm1,xmm2
vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8]
lea r9,QWORD PTR __log_256_lead
; poly
vmulsd xmm0,xmm1,xmm1 ; r*r
vmovsd xmm3,QWORD PTR __real_1_over_6
vmovsd xmm5,QWORD PTR __real_1_over_3
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3
vmovsd xmm4,xmm0,xmm0 ; copy of r*r
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6)
vmulsd xmm4,xmm0,xmm0 ; r*r*r*r
vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r
vfmadd231sd xmm1,xmm3,xmm4 ; xmm1 <-- poly
; m*log(2) + log(G) - poly
vmovsd xmm5,QWORD PTR __real_log2_tail
vfmsub213sd xmm5,xmm6,xmm1 ; xmm5 <-- m*log2_tail - poly
vmovsd xmm0,QWORD PTR[r9 + rax * 8] ; xmm0 <-- log(G)_lead
lea rdx,QWORD PTR __log_256_tail
vmovsd xmm1,QWORD PTR[rdx + rax * 8] ; xmm1 <-- log(G)_tail
vaddsd xmm1,xmm1,xmm5 ; xmm1 <-- (m*log2_tail - poly) + log(G)_tail
vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead ; xmm0 <-- m*log2_lead + log(G)_lead
vaddsd xmm0,xmm0,xmm1 ; xmm0 <-- m*log(2) + log(G) - poly = log(x)
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog_fma3_near_one:
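; same evaluation as __near_one above, with the coefficient updates contracted into FMAs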
; r = x - 1.0
vmovsd xmm3,QWORD PTR __real_two
vsubsd xmm0,xmm0,QWORD PTR __real_one ; r
vaddsd xmm3,xmm3,xmm0
vdivsd xmm1,xmm0,xmm3 ; r/(2+r) = u/2
vmovsd xmm4,QWORD PTR __real_ca2
vmovsd xmm5,QWORD PTR __real_ca4
vmulsd xmm3,xmm0,xmm1 ; correction
vaddsd xmm1,xmm1,xmm1 ; u
vmulsd xmm2,xmm1,xmm1 ; u^2
vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1
vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3
vmulsd xmm2,xmm2,xmm1 ; u^3
vmulsd xmm4,xmm4,xmm2
vmulsd xmm2,xmm2,xmm2
vmulsd xmm2,xmm2,xmm1 ; u^7
vfmadd231sd xmm4,xmm5,xmm2
vsubsd xmm4,xmm4,xmm3
vaddsd xmm0,xmm0,xmm4
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
Llog_fma3_denormal_adjust:
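; same rescaling as __denormal_adjust above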
vpor xmm2,xmm2,XMMWORD PTR __real_one
vsubsd xmm2,xmm2,QWORD PTR __real_one
vpsrlq xmm5,xmm2,52
vpand xmm2,xmm2,XMMWORD PTR __real_mant
vmovapd xmm0,xmm2
vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045
vcvtdq2pd xmm6,xmm5
jmp Llog_fma3_continue_common
ALIGN 16
Llog_fma3_x_is_zero_or_neg:
jne Llog_fma3_x_is_neg ; ZF from the vcomisd above is set only for x == +/-0.0
vmovsd xmm1,QWORD PTR __real_ninf
mov r8d,DWORD PTR __flag_x_zero
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog_fma3_x_is_neg:
vmovsd xmm1,QWORD PTR __real_neg_qnan
mov r8d,DWORD PTR __flag_x_neg
call fname_special
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Llog_fma3_x_is_inf_or_nan:
cmp rax,QWORD PTR __real_inf
je Llog_fma3_finish
cmp rax,QWORD PTR __real_ninf
je Llog_fma3_x_is_neg
or rax,QWORD PTR __real_qnanbit
vmovq xmm1,rax
mov r8d,DWORD PTR __flag_x_nan
call fname_special
ALIGN 16
Llog_fma3_finish:
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
fname endp
END