reactos/sdk/lib/crt/math/libm_sse2/expf.asm
2022-12-01 15:21:59 +02:00

304 lines
7.5 KiB
NASM

;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; expf.asm
;
; An implementation of the expf libm function.
;
; Prototype:
;
; float expf(float x);
;
;
; Algorithm:
; Similar to the one presented in exp.asm
;
; If FMA3 hardware is available, an FMA3 implementation of expf will be used.
; Read-only constants used by expf. Each numeric constant below (up to and
; including __real_1_by_2) is followed by zero padding so that it occupies
; a full 16-byte slot.
.const
ALIGN 16
; Single-precision bit patterns.
; +infinity
__real_inf DD 7f800000h
DD 0
DQ 0
; -infinity
__real_ninf DD 0ff800000h
DD 0
DQ 0
; most significant mantissa bit of a single; OR-ed into a NaN to make it quiet
__real_qnanbit DD 00400000h
DD 0
DQ 0
; +0.0
__real_zero DD 00000000h
DD 0
DQ 0
; Double-precision constants.
; 8192.0 = 64*128: upper bound on x*(64/ln(2)); at or above it the result overflows
__real_p8192 DQ 40c0000000000000h
DQ 0
; -9600.0 = 64*(-150): lower bound on x*(64/ln(2)); below it the result is zero
__real_m9600 DQ 0c0c2c00000000000h
DQ 0
__real_64_by_log2 DQ 40571547652b82feh ; 64/ln(2)
DQ 0
__real_log2_by_64 DQ 3f862e42fefa39efh ; ln(2)/64
DQ 0
__real_1_by_6 DQ 3fc5555555555555h ; 1/6
DQ 0
__real_1_by_2 DQ 3fe0000000000000h ; 1/2
DQ 0
; Codes passed in r8d to fname_special to identify the special case.
; these codes and the ones in the corresponding .c file have to match
__flag_x_nan DD 00000001
__flag_y_zero DD 00000002
__flag_y_inf DD 00000003
; Table indexed by j = n & 3fh with 8-byte entries (see the load in expf);
; presumably holds 2^(j/64) as doubles, as the name suggests - defined elsewhere.
EXTRN __two_to_jby64_table:QWORD
; Non-zero selects the FMA3 code path in expf.
EXTRN __use_fma3_lib:DWORD
; Text macros: the procedure name and the name of its special-case helper.
fname TEXTEQU <expf>
fname_special TEXTEQU <_expf_special>
; define local variable storage offsets
; make room for fname_special to save things
; NOTE(review): dummy_space is not referenced in this file; it may be used by fm.inc.
dummy_space EQU 020h
; Bytes reserved on the stack by StackAllocate in the prologue of fname.
stack_size EQU 038h
; Provides the StackAllocate / StackDeallocate macros used by fname.
include fm.inc
; external function
; Called as fname_special(xmm0, xmm1, r8d) from the special-case paths of fname.
EXTERN fname_special:PROC
.code
ALIGN 16
PUBLIC fname
;-----------------------------------------------------------------------
; float expf(float x)
;
; In:     xmm0 = x (single precision)
; Out:    xmm0 = result (single precision)
; Writes: rax, rcx, rdx, r8d, r10, xmm0-xmm4, flags (plus whatever
;         fname_special clobbers on the special-case paths)
; Stack:  stack_size bytes reserved with StackAllocate (macro from fm.inc)
;
; Special cases:
;   x = +inf              -> x is returned unchanged
;   x = -inf              -> +0.0
;   x is NaN              -> fname_special(x, x | qnanbit, __flag_x_nan)
;   x*(64/ln(2)) >= 8192  -> fname_special(|x|, +inf, __flag_y_inf)
;   x*(64/ln(2)) < -9600  -> fname_special(|x|, 0.0,  __flag_y_zero)
;
; Main path (computed in double precision):
;   n = int(x * 64/ln(2)),  j = n & 3fh,  m = (n - j) / 64
;   r = x - n * ln(2)/64
;   q = r + r^2 * (1/2 + r/6)
;   result = (float)( (f + f*q) * 2^m ),  f = __two_to_jby64_table[j]
;
; Two equivalent implementations follow: a plain SSE2 one and an FMA3 one,
; selected at run time by __use_fma3_lib.
;-----------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; Do this to avoid possible exceptions from a NaN argument.
    ; Classify x with integer compares on |x| (sign bit cleared).
    movd        edx, xmm0
    btr         edx, 31                         ; edx = bits of |x|
    cmp         edx, DWORD PTR __real_inf
    jge         Lexpf_x_is_inf_or_nan           ; |x| >= inf pattern: inf or NaN

    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Lexpf_fma3

Lexpf_sse2:
    cvtss2sd    xmm0, xmm0                      ; work in double precision

    ; x * (64/ln(2))
    movsd       xmm3, QWORD PTR __real_64_by_log2
    mulsd       xmm3, xmm0

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    comisd      xmm3, QWORD PTR __real_p8192
    jae         Lexpf_y_is_inf
    comisd      xmm3, QWORD PTR __real_m9600
    jb          Lexpf_y_is_zero

    ; n = int( x * (64/ln(2)) )
    cvtpd2dq    xmm4, xmm3                      ; rounds per current MXCSR mode
    lea         r10, __two_to_jby64_table
    cvtdq2pd    xmm1, xmm4                      ; xmm1 = (double)n

    ; r = x - n * ln(2)/64
    movsd       xmm2, QWORD PTR __real_log2_by_64
    mulsd       xmm2, xmm1
    movd        ecx, xmm4                       ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                        ; eax = j = n & 3fh (table index)
    subsd       xmm0, xmm2                      ; xmm0 = r
    movsd       xmm1, xmm0                      ; xmm1 = copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q = r + r^2 * (1/2 + r/6)
    movsd       xmm3, QWORD PTR __real_1_by_6
    mulsd       xmm3, xmm0                      ; r/6
    mulsd       xmm0, xmm0                      ; r^2
    addsd       xmm3, QWORD PTR __real_1_by_2   ; r/6 + 1/2
    mulsd       xmm0, xmm3
    addsd       xmm0, xmm1                      ; xmm0 = q

    ; Build the double 2^m: biased exponent (m + 1023) goes to bits 62..52.
    ; Only the low 12 bits of rcx survive the shift, so the zero-extended
    ; (rather than sign-extended) 32-bit m is harmless here.
    add         rcx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    movsd       xmm2, QWORD PTR [r10+rax*8]     ; f = table[j]
    mulsd       xmm0, xmm2
    addsd       xmm0, xmm2                      ; f*q + f
    movd        xmm1, rcx                       ; xmm1 = 2^m
    mulsd       xmm0, xmm1
    cvtsd2ss    xmm0, xmm0                      ; back to single precision

Lexpf_final_check:
    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_y_is_zero:
    ; Result underflows to zero: let fname_special produce/report it.
    ; NOTE(review): edx holds |x| here, so the helper receives |x|, not x -
    ; confirm that this is what the .c side expects.
    movss       xmm1, DWORD PTR __real_zero
    movd        xmm0, edx
    mov         r8d, DWORD PTR __flag_y_zero
    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_y_is_inf:
    ; Result overflows to +inf: let fname_special produce/report it.
    movss       xmm1, DWORD PTR __real_inf
    movd        xmm0, edx
    mov         r8d, DWORD PTR __flag_y_inf
    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_x_is_inf_or_nan:
    ; edx holds |x| at this point (sign bit cleared at entry), which cannot
    ; distinguish -inf from +inf. Reload the raw bits of x; xmm0 is still the
    ; unmodified single-precision argument on this path.
    movd        edx, xmm0
    cmp         edx, DWORD PTR __real_inf
    je          Lexpf_finish                    ; expf(+inf) = +inf (xmm0 unchanged)
    cmp         edx, DWORD PTR __real_ninf
    je          Lexpf_process_zero              ; expf(-inf) = +0

    ; x is a NaN: pass x and its quieted form to fname_special.
    or          edx, DWORD PTR __real_qnanbit
    movd        xmm1, edx
    mov         r8d, DWORD PTR __flag_x_nan
    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_process_zero:
    movss       xmm0, DWORD PTR __real_zero
    jmp         Lexpf_final_check

ALIGN 16
Lexpf_finish:
    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_fma3:
    ; Same algorithm as the SSE2 path, using VEX-encoded and fused
    ; multiply-add instructions.
    vcvtss2sd   xmm0, xmm0, xmm0

    ; x * (64/ln(2))
    vmulsd      xmm3, xmm0, QWORD PTR __real_64_by_log2

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    vcomisd     xmm3, QWORD PTR __real_p8192
    jae         Lexpf_fma3_y_is_inf
    vucomisd    xmm3, QWORD PTR __real_m9600
    jb          Lexpf_fma3_y_is_zero

    ; n = int( x * (64/ln(2)) )
    vcvtpd2dq   xmm4, xmm3
    lea         r10, __two_to_jby64_table
    vcvtdq2pd   xmm1, xmm4                      ; xmm1 = (double)n

    ; r = x - n * ln(2)/64
    vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64
    vmovd       ecx, xmm4                       ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                        ; eax = j = n & 3fh (table index)
    vmovapd     xmm1, xmm0                      ; xmm1 <-- copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q
    vmovsd      xmm3, QWORD PTR __real_1_by_6
    vmulsd      xmm0, xmm0, xmm0                ; xmm0 <-- r^2
    vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_by_2 ; xmm3 <-- r/6 + 1/2
    vfmadd213sd xmm0, xmm3, xmm1                ; xmm0 <-- q = r^2*(r/6 + 1/2) + r

    ; Build the double 2^m (biased exponent into bits 62..52).
    add         rcx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    vmovsd      xmm2, QWORD PTR [r10+rax*8]     ; f = table[j]
    vfmadd213sd xmm0, xmm2, xmm2                ; xmm0 <-- f*q + f
    vmovq       xmm2, rcx                       ; xmm2 = 2^m
    vmulsd      xmm0, xmm0, xmm2
    vcvtsd2ss   xmm0, xmm0, xmm0                ; back to single precision

Lexpf_fma3_final_check:
    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_fma3_y_is_zero:
    ; Result underflows to zero (see Lexpf_y_is_zero; edx holds |x|).
    vmovss      xmm1, DWORD PTR __real_zero
    vmovd       xmm0, edx
    mov         r8d, DWORD PTR __flag_y_zero
    call        fname_special
    jmp         Lexpf_fma3_finish

ALIGN 16
Lexpf_fma3_y_is_inf:
    ; Result overflows to +inf.
    vmovss      xmm1, DWORD PTR __real_inf
    vmovd       xmm0, edx
    mov         r8d, DWORD PTR __flag_y_inf
    call        fname_special
    jmp         Lexpf_fma3_finish

ALIGN 16
Lexpf_fma3_process_zero:
    ; Not the target of any jump in this procedure: inf/NaN inputs are
    ; dispatched before the SSE2/FMA3 split and use Lexpf_process_zero.
    vmovss      xmm0, DWORD PTR __real_zero
    jmp         Lexpf_fma3_final_check

ALIGN 16
Lexpf_fma3_finish:
    StackDeallocate stack_size
    ret
fname endp
END