mirror of
https://github.com/reactos/reactos.git
synced 2024-10-31 20:02:55 +00:00
304 lines
7.5 KiB
NASM
304 lines
7.5 KiB
NASM
;
|
|
; MIT License
|
|
; -----------
|
|
;
|
|
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
|
;
|
|
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
; of this Software and associated documentation files (the "Software"), to deal
|
|
; in the Software without restriction, including without limitation the rights
|
|
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
; copies of the Software, and to permit persons to whom the Software is
|
|
; furnished to do so, subject to the following conditions:
|
|
;
|
|
; The above copyright notice and this permission notice shall be included in
|
|
; all copies or substantial portions of the Software.
|
|
;
|
|
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
; THE SOFTWARE.
|
|
;
|
|
; expf.asm
|
|
;
|
|
; An implementation of the expf libm function.
|
|
;
|
|
; Prototype:
|
|
;
|
|
; float expf(float x);
|
|
;
|
|
|
|
;
|
|
; Algorithm:
|
|
; Similar to the one presented in exp.asm
|
|
;
|
|
; If FMA3 hardware is available, an FMA3 implementation of expf will be used.
|
|
|
|
|
|
.const

ALIGN 16

; Single-precision bit patterns. Each entry is padded to 16 bytes so that
; every constant in this table starts on a 16-byte boundary.
__real_inf          DD 7f800000h, 0, 0, 0           ; +infinity
__real_ninf         DD 0ff800000h, 0, 0, 0          ; -infinity
__real_qnanbit      DD 00400000h, 0, 0, 0           ; OR-ed into a NaN's bits before it is reported
__real_zero         DD 0, 0, 0, 0                   ; +0.0

; Double-precision range limits for x * (64/ln(2)), padded to 16 bytes.
__real_p8192        DQ 40c0000000000000h, 0         ;  8192.0 = 64 * 128
__real_m9600        DQ 0c0c2c00000000000h, 0        ; -9600.0 = 64 * (-150)

; Argument-reduction constants, padded to 16 bytes.
__real_64_by_log2   DQ 40571547652b82feh, 0         ; 64/ln(2)
__real_log2_by_64   DQ 3f862e42fefa39efh, 0         ; ln(2)/64

; Polynomial coefficients, padded to 16 bytes.
__real_1_by_6       DQ 3fc5555555555555h, 0         ; 1/6
__real_1_by_2       DQ 3fe0000000000000h, 0         ; 1/2

; these codes and the ones in the corresponding .c file have to match
__flag_x_nan        DD 1
__flag_y_zero       DD 2
__flag_y_inf        DD 3
|
|
; Data defined in other modules.
EXTERN      __two_to_jby64_table:QWORD      ; table of doubles, indexed below by j = n & 63
EXTERN      __use_fma3_lib:DWORD            ; nonzero selects the FMA3 code path

; Symbolic names used for the entry point and its special-case helper.
fname           TEXTEQU <expf>
fname_special   TEXTEQU <_expf_special>

; define local variable storage offsets

; make room for fname_special to save things
dummy_space     EQU     20h                 ; not referenced directly in this file
stack_size      EQU     38h                 ; bytes reserved by StackAllocate in the prologue

include fm.inc

; external function
EXTERN      fname_special:PROC
|
|
|
|
.code

ALIGN 16
PUBLIC fname

;-----------------------------------------------------------------------
; float expf(float x)
;
; In:      xmm0 = x (single precision)
; Out:     xmm0 = result (single precision)
; Scratch: rax, rcx, rdx, r8, r10, xmm1-xmm4, flags
; Stack:   stack_size bytes reserved via StackAllocate (macro from fm.inc)
; NOTE(review): register usage assumes the Microsoft x64 convention
;               (float args in xmm0/xmm1, third argument in r8d) - confirm.
;
; Method (identical in the SSE2 and FMA3 paths; the intermediate work is
; done in double precision):
;   n = x * 64/ln(2) converted to int32 (cvtpd2dq, current rounding mode)
;   j = n & 63,  m = (n - j) / 64
;   r = x - n * ln(2)/64
;   q = r + r^2 * (1/2 + r/6)
;   f = __two_to_jby64_table[j]      ; presumably 2^(j/64), per its name
;   result = (float)( 2^m * (f + f*q) )
;
; Special cases:
;   x = +inf                -> x is returned unchanged
;   x = -inf                -> +0 is returned
;   x is NaN                -> _expf_special(x, x with quiet bit set, __flag_x_nan)
;   x*(64/ln 2) >= 8192     -> _expf_special(x, +inf, __flag_y_inf)
;   x*(64/ln 2) <  -9600    -> _expf_special(x, 0,    __flag_y_zero)
; In the three helper cases the helper's xmm0 is returned as is.
;-----------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; Classify x with integer compares so that a NaN argument cannot raise
    ; a floating-point exception.
    ;   edx = raw bits of x, sign included; must stay intact because every
    ;         special-case path below reads it
    ;   eax = bits of |x| (scratch copy, free again after the compare)
    movd        edx, xmm0
    mov         eax, edx
    btr         eax, 31
    cmp         eax, DWORD PTR __real_inf
    jge         Lexpf_x_is_inf_or_nan       ; eax >= 0 here, signed compare is safe

    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Lexpf_fma3

Lexpf_sse2:

    cvtss2sd    xmm0, xmm0

    ; x * (64/ln(2))
    movsd       xmm3, QWORD PTR __real_64_by_log2
    mulsd       xmm3, xmm0

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    comisd      xmm3, QWORD PTR __real_p8192
    jae         Lexpf_y_is_inf

    comisd      xmm3, QWORD PTR __real_m9600
    jb          Lexpf_y_is_zero

    ; n = int( x * (64/ln(2)) )
    cvtpd2dq    xmm4, xmm3
    lea         r10, __two_to_jby64_table
    cvtdq2pd    xmm1, xmm4

    ; r = x - n * ln(2)/64
    movsd       xmm2, QWORD PTR __real_log2_by_64
    mulsd       xmm2, xmm1
    movd        ecx, xmm4                   ; ecx = n
    mov         eax, 3fh
    and         eax, ecx                    ; eax = j = n & 63 (rax upper half is zero)
    subsd       xmm0, xmm2
    movsd       xmm1, xmm0                  ; xmm1 = copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q = r + r^2 * (1/2 + r/6)
    movsd       xmm3, QWORD PTR __real_1_by_6
    mulsd       xmm3, xmm0
    mulsd       xmm0, xmm0
    addsd       xmm3, QWORD PTR __real_1_by_2
    mulsd       xmm0, xmm3
    addsd       xmm0, xmm1

    ; rcx = bit pattern of the double 2^m.  The range checks above keep
    ; m + 1023 a small positive value, so the 32-bit add (which also
    ; clears the upper half of rcx) yields the biased exponent directly.
    add         ecx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    movsd       xmm2, QWORD PTR [r10+rax*8] ; f = table[j]
    mulsd       xmm0, xmm2
    addsd       xmm0, xmm2

    movd        xmm1, rcx
    mulsd       xmm0, xmm1                  ; scale by 2^m
    cvtsd2ss    xmm0, xmm0

Lexpf_final_check:
    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_y_is_zero:

    ; Result underflows: report through the helper with y = 0.
    movss       xmm1, DWORD PTR __real_zero
    movd        xmm0, edx                   ; original single-precision x
    mov         r8d, DWORD PTR __flag_y_zero

    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_y_is_inf:

    ; Result overflows: report through the helper with y = +inf.
    movss       xmm1, DWORD PTR __real_inf
    movd        xmm0, edx                   ; original single-precision x
    mov         r8d, DWORD PTR __flag_y_inf

    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_x_is_inf_or_nan:

    ; edx still holds the signed bits of x and xmm0 still holds x itself.
    cmp         edx, DWORD PTR __real_inf
    je          Lexpf_finish                ; expf(+inf) = +inf, already in xmm0

    cmp         edx, DWORD PTR __real_ninf
    je          Lexpf_process_zero          ; expf(-inf) = +0

    ; x is a NaN: hand the helper x and the same NaN with the quiet bit set.
    or          edx, DWORD PTR __real_qnanbit
    movd        xmm1, edx
    mov         r8d, DWORD PTR __flag_x_nan
    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_process_zero:
    movss       xmm0, DWORD PTR __real_zero
    jmp         Lexpf_final_check

ALIGN 16
Lexpf_finish:
    StackDeallocate stack_size
    ret


ALIGN 16
Lexpf_fma3:

    vcvtss2sd   xmm0, xmm0, xmm0

    ; x * (64/ln(2))
    ; The constant is loaded first: vmovsd from memory clears bits 127:64
    ; of xmm3, so the packed vcvtpd2dq below converts a zero in the upper
    ; lane rather than whatever the caller left in the upper half of xmm0
    ; (this matches the SSE2 path above).
    vmovsd      xmm3, QWORD PTR __real_64_by_log2
    vmulsd      xmm3, xmm3, xmm0

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    vcomisd     xmm3, QWORD PTR __real_p8192
    jae         Lexpf_fma3_y_is_inf

    vucomisd    xmm3, QWORD PTR __real_m9600
    jb          Lexpf_fma3_y_is_zero

    ; n = int( x * (64/ln(2)) )
    vcvtpd2dq   xmm4, xmm3
    lea         r10, __two_to_jby64_table
    vcvtdq2pd   xmm1, xmm4

    ; r = x - n * ln(2)/64
    vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64
    vmovd       ecx, xmm4                   ; ecx = n
    mov         eax, 3fh
    and         eax, ecx                    ; eax = j = n & 63 (rax upper half is zero)
    vmovapd     xmm1, xmm0                  ; xmm1 <-- copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q
    vmovsd      xmm3, QWORD PTR __real_1_by_6
    vmulsd      xmm0, xmm0, xmm0            ; xmm0 <-- r^2
    vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_by_2     ; xmm3 <-- r/6 + 1/2
    vfmadd213sd xmm0, xmm3, xmm1            ; xmm0 <-- q = r^2*(r/6 + 1/2) + r

    ; rcx = bit pattern of the double 2^m (see the SSE2 path for the
    ; reasoning behind the 32-bit add).
    add         ecx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    vmovsd      xmm2, QWORD PTR [r10+rax*8] ; f = table[j]
    vfmadd213sd xmm0, xmm2, xmm2            ; xmm0 <-- f*q + f

    vmovq       xmm2, rcx
    vmulsd      xmm0, xmm0, xmm2            ; scale by 2^m
    vcvtsd2ss   xmm0, xmm0, xmm0

    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_fma3_y_is_zero:

    ; Result underflows: report through the helper with y = 0.
    vmovss      xmm1, DWORD PTR __real_zero
    vmovd       xmm0, edx                   ; original single-precision x
    mov         r8d, DWORD PTR __flag_y_zero

    call        fname_special
    jmp         Lexpf_fma3_finish

ALIGN 16
Lexpf_fma3_y_is_inf:

    ; Result overflows: report through the helper with y = +inf.
    vmovss      xmm1, DWORD PTR __real_inf
    vmovd       xmm0, edx                   ; original single-precision x
    mov         r8d, DWORD PTR __flag_y_inf

    call        fname_special
    jmp         Lexpf_fma3_finish

ALIGN 16
Lexpf_fma3_finish:
    StackDeallocate stack_size
    ret

fname endp

END
|