reactos/sdk/lib/crt/math/libm_sse2/exp.asm
2022-12-01 15:21:59 +02:00

440 lines
14 KiB
NASM

;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; exp.asm
;
; An implementation of the exp libm function.
;
; Prototype:
;
; double exp(double x);
;
;
; Algorithm:
;
; e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
;
; x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
; n = 64*m + j, 0 <= j < 64
;
; e^x = 2^((64*m + j + f)/64)
; = (2^m) * (2^(j/64)) * 2^(f/64)
; = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
;
; f = x*(64/ln(2)) - n
; r = f*(ln(2)/64) = x - n*(ln(2)/64)
;
; e^x = (2^m) * (2^(j/64)) * e^r
;
; (2^(j/64)) is precomputed
;
; e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5!
; e^r = 1 + q
;
; q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5!
;
.const
ALIGN 16
; these codes and the ones in the corresponding .c file have to match
__flag_x_nan DD 00000001
__flag_y_zero DD 00000002
__flag_y_inf DD 00000003
ALIGN 16
L__real_1_by_720 DQ 03f56c16c16c16c17h
DQ 03f56c16c16c16c17h ; 1/720
L__real_1_by_120 DQ 03f81111111111111h
DQ 03f81111111111111h ; 1/120
L__real_1_by_6 DQ 03fc5555555555555h
DQ 03fc5555555555555h ; 1/6
L__real_1_by_2 DQ 03fe0000000000000h
DQ 03fe0000000000000h ; 1/2
L__real_1_by_24 DQ 03fa5555555555555h
DQ 03fa5555555555555h ; 1/24
ALIGN 16
L__log2_by_64_mtail_mhead DQ 0bf862e42fefa0000h, 0bd1cf79abc9e3b39h
L__ln_of_smallest_normal DQ 0C086232BDD7ABCD2h
L__zero DQ 00000000000000000h
L__max_exp_arg DQ 040862e42fefa39efh ; 709.78271289338397
L__denormal_tiny_threshold DQ 0c0874046dfefd9d0h ; -744.03460681327306
L__min_exp_arg DQ 0c0874910d52d3051h ; -745.13321910194111
L__real_64_by_log2 DQ 040571547652b82feh ; 64/ln(2)
L__positive_infinity DQ 07ff0000000000000h
L__negative_infinity DQ 0fff0000000000000h
L__real_qnanbit DQ 0008000000000000h ; qnan set bit
L__real_x_near0_threshold DQ 3c00000000000000h
L__log2_by_64_mhead DQ 0bf862e42fefa0000h
L__log2_by_64_mtail DQ 0bd1cf79abc9e3b39h
L__real_smallest_denormal DQ 00000000000000001h
L__real_one DQ 03ff0000000000000h
L__2_to_neg_26 DQ 03E50000000000000h ; 2^-26
L__min_normal DQ 00010000000000000h ; smallest normal
EXTRN __two_to_jby64_table:QWORD
EXTRN __two_to_jby64_head_table:QWORD
EXTRN __two_to_jby64_tail_table:QWORD
EXTRN __use_fma3_lib:DWORD
; make room for fname_special to save things
dummy_space EQU 020h
stack_size EQU 038h
include fm.inc
fname TEXTEQU <exp>
fname_special TEXTEQU <_exp_special>
;Define name and any external functions being called
EXTERN fname_special : PROC
.code
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
; We need to avoid unwanted exceptions from a NaN argument.
; It could be argued that a signaling NaN should raise an exception,
; but the existing library doesn't. At any rate, the comparison operations
; don't seem to like quiet NaN either, so...
movd rdx, xmm0
btr rdx, 63
cmp rdx, L__positive_infinity
jge Lexp_x_is_nan_or_inf
cmp DWORD PTR __use_fma3_lib, 0
jne Lexp_fma3
movapd xmm2, xmm0
movapd xmm3, xmm0
; Some hardware has problems with too many branches in a single
; 16- or 32-byte window, so let's peel off the common case into
; a single branch.
cmplesd xmm2, L__max_exp_arg ; xmm2 <-- 0xFFFFFFFF is x is not too big positive
cmpnltsd xmm3, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
andps xmm2, xmm3 ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
ucomisd xmm2, xmm2 ; note that FFF... is NaN, so this comparison should set PF for in-range x
jp Lexp_y_is_finite
ucomisd xmm0, L__max_exp_arg
ja Lexp_y_is_inf
; Since we peeled off the cases with normal result,
; there is only one possibility remaining:
jmp Lexp_y_is_denormal_or_zero
ALIGN 16
Lexp_y_is_finite:
; x * (64/ln(2))
movapd xmm1, xmm0
btr rdx, 63 ; rdx <-- |x|
cmp rdx, L__2_to_neg_26
jbe Lexp_return_1_plus_x
mulsd xmm1, L__real_64_by_log2
; n = int( x * (64/ln(2)) )
cvttpd2dq xmm2, xmm1 ; xmm2 = (int)n
cvtdq2pd xmm1, xmm2 ; xmm1 = (double)n
movd ecx, xmm2
movapd xmm2, xmm1
; r1 = x - n * ln(2)/64 head
mulsd xmm1, L__log2_by_64_mhead
; j = n & 0x3f
mov rax, 03fh
and eax, ecx ; eax = j
; m = (n - j) / 64
sar ecx, 6 ; ecx = m
; r2 = - n * ln(2)/64 tail
mulsd xmm2, L__log2_by_64_mtail
addsd xmm0, xmm1 ; xmm0 = r1
; r1+r2
addsd xmm2, xmm0 ; xmm2 = r
; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
movapd xmm3, L__real_1_by_720 ; xmm3 = 1/720
mulsd xmm3, xmm2 ; xmm3 = r*1/720
movapd xmm0, L__real_1_by_6 ; xmm0 = 1/6
movapd xmm1, xmm2 ; xmm1 = r
mulsd xmm0, xmm2 ; xmm0 = r*1/6
addsd xmm3, L__real_1_by_120 ; xmm3 = 1/120 + (r*1/720)
mulsd xmm1, xmm2 ; xmm1 = r*r
addsd xmm0, L__real_1_by_2 ; xmm0 = 1/2 + (r*1/6)
movapd xmm4, xmm1 ; xmm4 = r*r
mulsd xmm4, xmm1 ; xmm4 = (r*r) * (r*r)
mulsd xmm3, xmm2 ; xmm3 = r * (1/120 + (r*1/720))
mulsd xmm0, xmm1 ; xmm0 = (r*r)*(1/2 + (r*1/6))
addsd xmm3, L__real_1_by_24 ; xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
addsd xmm0, xmm2 ; xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
mulsd xmm3, xmm4 ; xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
addsd xmm0, xmm3 ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
;(f)*(q) + f2 + f1
cmp ecx, 0fffffc02h ; -1022
lea rdx, __two_to_jby64_table
lea r11, __two_to_jby64_tail_table
lea r10, __two_to_jby64_head_table
mulsd xmm0, QWORD PTR [rdx+rax * 8 ]
addsd xmm0, QWORD PTR [r11+rax * 8 ]
addsd xmm0, QWORD PTR [r10+rax * 8 ]
jle Lexp_process_denormal
Lexp_process_normal:
shl rcx, 52
movd xmm2, rcx
paddq xmm0, xmm2
StackDeallocate stack_size
ret
ALIGN 16
Lexp_process_denormal:
jl Lexp_process_true_denormal
ucomisd xmm0, L__real_one
jae Lexp_process_normal
Lexp_process_true_denormal:
; here ( e^r < 1 and m = -1022 ) or m <= -1023
add ecx, 1074
mov rax, 1
shl rax, cl
movd xmm2, rax
mulsd xmm0, xmm2
jmp Lexp_finish
Lexp_y_is_one:
movsd xmm0, L__real_one
jmp Lexp_finish
ALIGN 16
Lexp_x_is_nan_or_inf:
movd rax, xmm0
cmp rax, L__positive_infinity
je Lexp_finish
cmp rax, L__negative_infinity
je Lexp_return_zero_without_exception
or rax, L__real_qnanbit
movd xmm1, rax
mov r8d, __flag_x_nan
call fname_special
jmp Lexp_finish
ALIGN 16
Lexp_y_is_inf:
mov rax, 07ff0000000000000h
movd xmm1, rax
mov r8d, __flag_y_inf
call fname_special
jmp Lexp_finish
ALIGN 16
Lexp_y_is_denormal_or_zero:
ucomisd xmm0, L__min_exp_arg
jbe Lexp_y_is_zero
movapd xmm0, L__real_smallest_denormal
jmp Lexp_finish
ALIGN 16
Lexp_y_is_zero:
pxor xmm1, xmm1
mov r8d, __flag_y_zero
call fname_special
jmp Lexp_finish
ALIGN 16
Lexp_return_1_plus_x:
cmp rdx, L__min_normal
jbe Lexp_return_1_plus_eps
addsd xmm0, L__real_one
StackDeallocate stack_size
ret 0
; Some hardware really does not like subnormals. Try to avoid them.
ALIGN 16
Lexp_return_1_plus_eps:
movsd xmm0, L__real_one
addsd xmm0, L__min_normal ; make sure inexact is set
StackDeallocate stack_size
ret 0
ALIGN 16
Lexp_return_zero_without_exception:
pxor xmm0,xmm0
StackDeallocate stack_size
ret 0
ALIGN 16
Lexp_finish:
StackDeallocate stack_size
ret 0
ALIGN 16
Lexp_fma3:
; Some hardware has problems with too many branches in a single
; 16- or 32-byte window, so let's peel off the common case into
; a single branch.
vcmplesd xmm2, xmm0, L__max_exp_arg ; xmm2 <-- 0xFFFFFFFF is x is not too big positive
vcmpnltsd xmm3, xmm0, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
vandps xmm2, xmm3, xmm2 ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
vucomisd xmm2, xmm2 ; note that FFF... is NaN, so this comparison should set PF for in-range x
jp Lexp_fma3_y_is_finite
vucomisd xmm0,L__max_exp_arg
ja Lexp_fma3_y_is_inf
; Since we peeled off the cases with normal result,
; there is only one possibility remaining:
jmp Lexp_fma3_y_is_zero
; vpsllq xmm1, xmm0, 1
; vpsrlq xmm1, xmm1, 1
; vucomisd xmm1, L__real_x_near0_threshold ; 2^-63
; jb Lexp_fma3_y_is_one
ALIGN 16
Lexp_fma3_y_is_finite:
vmovq rdx, xmm0
btr rdx, 63 ; rdx <-- |x|
cmp rdx, L__2_to_neg_26
jbe Lexp_fma3_return_1_plus_x
; x * (64/ln(2))
vmulsd xmm1,xmm0,L__real_64_by_log2
; n = int( x * (64/ln(2)) )
vcvttpd2dq xmm2,xmm1 ;xmm2 = (int)n
vcvtdq2pd xmm1,xmm2 ;xmm1 = (double)n ;can use round
vmovd ecx,xmm2
; r1 = x - n * ln(2)/64 head
; r2 = - n * ln(2)/64 tail
; r = r1+r2
vmovlhps xmm1,xmm1,xmm1 ;xmm1 = (double (double)n,)n
vmovq xmm0,xmm0 ;xmm0 = 0,x ;zero out the upper part
vfmadd132pd xmm1,xmm0,L__log2_by_64_mtail_mhead
vhaddpd xmm2,xmm1,xmm1 ;xmm2 = r,r
;j = n & 03fh
mov rax,03fh
and eax,ecx ;eax = j
; m = (n - j) / 64
sar ecx,6 ;ecx = m
; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
vmovapd xmm3,L__real_1_by_720
vfmadd213sd xmm3,xmm2,L__real_1_by_120
vfmadd213sd xmm3,xmm2,L__real_1_by_24
vfmadd213sd xmm3,xmm2,L__real_1_by_6
vfmadd213sd xmm3,xmm2,L__real_1_by_2
vmulsd xmm0,xmm2,xmm2
vfmadd213sd xmm0,xmm3,xmm2
; (f)*(q) + f2 + f1
cmp ecx,0fffffc02h ; -1022
lea rdx,__two_to_jby64_table
lea r11,__two_to_jby64_tail_table
lea r10,__two_to_jby64_head_table
vmulsd xmm2,xmm0,QWORD PTR[rdx + rax * 8]
vaddsd xmm1,xmm2,QWORD PTR[r11 + rax * 8]
vaddsd xmm0,xmm1,QWORD PTR[r10 + rax * 8]
jle Lexp_fma3_process_denormal
Lexp_fma3_process_normal:
shl rcx,52
vmovq xmm2,rcx
vpaddq xmm0,xmm0,xmm2
StackDeallocate stack_size
ret
ALIGN 16
Lexp_fma3_process_denormal:
jl Lexp_fma3_process_true_denormal
vucomisd xmm0,L__real_one
jae Lexp_fma3_process_normal
Lexp_fma3_process_true_denormal:
; here ( e^r < 1 and m = -1022 ) or m <= -1023
add ecx,1074
mov rax,1
shl rax,cl
vmovq xmm2,rax
vmulsd xmm0,xmm0,xmm2
jmp Lexp_fma3_finish
Lexp_fma3_y_is_one:
vmovsd xmm0, L__real_one
jmp Lexp_fma3_finish
ALIGN 16
Lexp_fma3_y_is_inf:
mov rax,07ff0000000000000h
vmovq xmm1,rax
mov r8d,__flag_y_inf
call fname_special
jmp Lexp_fma3_finish
ALIGN 16
Lexp_fma3_return_1_plus_x:
cmp rdx, L__min_normal
jbe Lexp_fma3_return_1_plus_eps
vaddsd xmm0, xmm0, L__real_one
StackDeallocate stack_size
ret 0
; Some hardware really does not like subnormals. Try to avoid them.
ALIGN 16
Lexp_fma3_return_1_plus_eps:
vmovsd xmm0, L__real_one
vaddsd xmm0, xmm0, L__min_normal ; make sure inexact is set
StackDeallocate stack_size
ret 0
ALIGN 16
Lexp_fma3_y_is_zero:
vpxor xmm1,xmm1,xmm1
mov r8d,__flag_y_zero
call fname_special
jmp Lexp_fma3_finish
ALIGN 16
Lexp_fma3_return_zero_without_exception:
vpxor xmm0,xmm0,xmm0
ALIGN 16
Lexp_fma3_finish:
StackDeallocate stack_size
ret
fname endp
END