;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; expf.asm
;
; An implementation of the expf libm function.
;
; Prototype:
;
;     float expf(float x);
;
;
; Algorithm:
;     Similar to the one presented in exp.asm
;
; If FMA3 hardware is available, an FMA3 implementation of expf will be used.

.const

ALIGN 16

; Single-precision bit patterns, each padded out to 16 bytes.
__real_inf          DD 7f800000h            ; +infinity
                    DD 0
                    DQ 0

__real_ninf         DD 0ff800000h           ; -infinity
                    DD 0
                    DQ 0

__real_qnanbit      DD 00400000h            ; quiet bit of a single-precision NaN
                    DD 0
                    DQ 0

__real_zero         DD 00000000h            ; +0.0f
                    DD 0
                    DQ 0

; Double-precision range limits for x * (64/ln(2)).
__real_p8192        DQ 40c0000000000000h    ; 64*128    (overflow threshold)
                    DQ 0
__real_m9600        DQ 0c0c2c00000000000h   ; 64*(-150) (underflow threshold)
                    DQ 0

; Double-precision reduction and polynomial constants.
__real_64_by_log2   DQ 40571547652b82feh    ; 64/ln(2)
                    DQ 0
__real_log2_by_64   DQ 3f862e42fefa39efh    ; log2_by_64
                    DQ 0

__real_1_by_6       DQ 3fc5555555555555h    ; 1/6
                    DQ 0
__real_1_by_2       DQ 3fe0000000000000h    ; 1/2
                    DQ 0

; these codes and the ones in the corresponding .c file have to match
__flag_x_nan        DD 00000001
__flag_y_zero       DD 00000002
__flag_y_inf        DD 00000003

EXTRN __two_to_jby64_table:QWORD
EXTRN __use_fma3_lib:DWORD

; The exported name is expf (see the prototype in the file header).
fname           TEXTEQU <expf>
fname_special   TEXTEQU <_expf_special>

; define local variable storage offsets

; make room for fname_special to save things
dummy_space     EQU 020h
stack_size      EQU 038h

include fm.inc

; external function
EXTERN fname_special:PROC

.code

;-----------------------------------------------------------------------
; float expf(float x)
;
; In:    xmm0 = x (single precision)
; Out:   xmm0 = result (single precision)
; Uses:  rax, rcx, rdx, r8, r10, xmm0-xmm4, flags
; Stack: stack_size bytes reserved/released through the StackAllocate /
;        StackDeallocate macros (presumably provided by fm.inc).
;
; Method (both code paths), computed in double precision:
;     n = int(x * 64/ln(2))
;     j = n & 63,  m = (n - j) / 64
;     r = x - n * ln(2)/64
;     q = r + r^2 * (1/2 + r/6)
;     result = (f + f*q) * 2^m,  f = __two_to_jby64_table[j]
;
; Special cases are forwarded to fname_special with x in xmm0, the
; proposed result in xmm1 and one of the __flag_* codes in r8d.
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; Do this to avoid possible exceptions from a NaN argument.
    ; edx = bits of |x|; anything >= the +inf pattern is inf or NaN.
    movd        edx, xmm0
    btr         edx, 31
    cmp         edx, DWORD PTR __real_inf
    jge         Lexpf_x_is_inf_or_nan

    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Lexpf_fma3

Lexpf_sse2:
    cvtss2sd    xmm0, xmm0

    ; x * (64/ln(2))
    movsd       xmm3, QWORD PTR __real_64_by_log2
    mulsd       xmm3, xmm0

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    comisd      xmm3, QWORD PTR __real_p8192
    jae         Lexpf_y_is_inf

    comisd      xmm3, QWORD PTR __real_m9600
    jb          Lexpf_y_is_zero

    ; n = int( x * (64/ln(2)) )
    cvtpd2dq    xmm4, xmm3
    lea         r10, __two_to_jby64_table
    cvtdq2pd    xmm1, xmm4

    ; r = x - n * ln(2)/64
    movsd       xmm2, QWORD PTR __real_log2_by_64
    mulsd       xmm2, xmm1
    movd        ecx, xmm4                   ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                    ; eax = j = n & 63
    subsd       xmm0, xmm2                  ; xmm0 = r
    movsd       xmm1, xmm0                  ; xmm1 = copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q
    movsd       xmm3, QWORD PTR __real_1_by_6
    mulsd       xmm3, xmm0                  ; r/6
    mulsd       xmm0, xmm0                  ; r^2
    addsd       xmm3, QWORD PTR __real_1_by_2
    mulsd       xmm0, xmm3                  ; r^2 * (r/6 + 1/2)
    addsd       xmm0, xmm1                  ; q

    ; Build 2^m as a double: biased exponent placed at bit 52.
    ; Only the low 12 bits of rcx survive the shift, so the
    ; zero-extended (not sign-extended) m is harmless here.
    add         rcx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    movsd       xmm2, QWORD PTR [r10+rax*8]
    mulsd       xmm0, xmm2
    addsd       xmm0, xmm2

    movd        xmm1, rcx
    mulsd       xmm0, xmm1                  ; scale by 2^m
    cvtsd2ss    xmm0, xmm0

Lexpf_final_check:
    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_y_is_zero:
    ; Result underflows: report through the special-case handler.
    ; NOTE(review): edx holds the bits of |x| (sign cleared at entry), so
    ; the handler receives the magnitude of x rather than x itself on this
    ; path - confirm this is what fname_special expects.
    movss       xmm1, DWORD PTR __real_zero
    movd        xmm0, edx
    mov         r8d, DWORD PTR __flag_y_zero

    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_y_is_inf:
    ; Result overflows: report through the special-case handler.
    movss       xmm1, DWORD PTR __real_inf
    movd        xmm0, edx
    mov         r8d, DWORD PTR __flag_y_inf

    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_x_is_inf_or_nan:
    ; The entry check cleared the sign bit in edx, so reload the raw bits
    ; of x (xmm0 is still the untouched single-precision argument here).
    ; Without this, -inf compares equal to +inf and is returned as-is,
    ; and the __real_ninf comparison below can never match.
    movd        edx, xmm0

    cmp         edx, DWORD PTR __real_inf
    je          Lexpf_finish                ; expf(+inf) = +inf, already in xmm0

    cmp         edx, DWORD PTR __real_ninf
    je          Lexpf_process_zero          ; expf(-inf) = +0

    ; x is a NaN: pass x and its quieted form to the handler.
    or          edx, DWORD PTR __real_qnanbit
    movd        xmm1, edx
    mov         r8d, DWORD PTR __flag_x_nan
    call        fname_special
    jmp         Lexpf_finish

ALIGN 16
Lexpf_process_zero:
    movss       xmm0, DWORD PTR __real_zero
    jmp         Lexpf_final_check

ALIGN 16
Lexpf_finish:
    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_fma3:
    vcvtss2sd   xmm0, xmm0, xmm0

    ; x * (64/ln(2))
    vmulsd      xmm3, xmm0, QWORD PTR __real_64_by_log2

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    vcomisd     xmm3, QWORD PTR __real_p8192
    jae         Lexpf_fma3_y_is_inf

    vucomisd    xmm3, QWORD PTR __real_m9600
    jb          Lexpf_fma3_y_is_zero

    ; n = int( x * (64/ln(2)) )
    vcvtpd2dq   xmm4, xmm3
    lea         r10, __two_to_jby64_table
    vcvtdq2pd   xmm1, xmm4

    ; r = x - n * ln(2)/64
    vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64
    vmovd       ecx, xmm4                   ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                    ; eax = j = n & 63
    vmovapd     xmm1, xmm0                  ; xmm1 <-- copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q
    vmovsd      xmm3, QWORD PTR __real_1_by_6
    vmulsd      xmm0, xmm0, xmm0            ; xmm0 <-- r^2
    vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_by_2 ; xmm3 <-- r/6 + 1/2
    vfmadd213sd xmm0, xmm3, xmm1            ; xmm0 <-- q = r^2*(r/6 + 1/2) + r

    ; Build 2^m as a double: biased exponent placed at bit 52.
    add         rcx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    vmovsd      xmm2, QWORD PTR [r10+rax*8]
    vfmadd213sd xmm0, xmm2, xmm2            ; xmm0 <-- f*q + f

    vmovq       xmm2, rcx
    vmulsd      xmm0, xmm0, xmm2            ; scale by 2^m
    vcvtsd2ss   xmm0, xmm0, xmm0

Lexpf_fma3_final_check:
    StackDeallocate stack_size
    ret

ALIGN 16
Lexpf_fma3_y_is_zero:
    ; Result underflows: report through the special-case handler.
    ; NOTE(review): as on the SSE2 path, edx holds the bits of |x| here.
    vmovss      xmm1, DWORD PTR __real_zero
    vmovd       xmm0, edx
    mov         r8d, DWORD PTR __flag_y_zero

    call        fname_special
    jmp         Lexpf_fma3_finish

ALIGN 16
Lexpf_fma3_y_is_inf:
    ; Result overflows: report through the special-case handler.
    vmovss      xmm1, DWORD PTR __real_inf
    vmovd       xmm0, edx
    mov         r8d, DWORD PTR __flag_y_inf

    call        fname_special
    jmp         Lexpf_fma3_finish

ALIGN 16
Lexpf_fma3_process_zero:
    ; NOTE(review): nothing in this file branches to this label; the
    ; inf/NaN handling above is shared by both code paths.
    vmovss      xmm0, DWORD PTR __real_zero
    jmp         Lexpf_fma3_final_check

ALIGN 16
Lexpf_fma3_finish:
    StackDeallocate stack_size
    ret

fname endp

END