;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log.asm
;
; An implementation of the log libm function.
;
; Prototype:
;
;     double log(double x);
;
;
; Algorithm:
;
;   Based on:
;   Ping-Tak Peter Tang
;   "Table-driven implementation of the logarithm function in IEEE
;   floating-point arithmetic"
;   ACM Transactions on Mathematical Software (TOMS)
;   Volume 16, Issue 4 (December 1990)
;
;   x very close to 1.0 is handled differently; for x everywhere else
;   a brief explanation is given below.
;
;   x = (2^m)*A
;   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
;   x = (2^m)*2*(G/2+g/2)
;   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
;
;   Y = (2^(-1))*(2^(-m))*(2^m)*A
;   Now, range of Y is: 0.5 <= Y < 1
;
;   F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
;   Now, range of F is: 256 <= F <= 512
;   F = F / 512
;   Now, range of F is: 0.5 <= F <= 1
;
;   f = -(Y-F), with (f <= 2^(-10))
;
;   log(x) = m*log(2) + log(2) + log(F-f)
;   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
;   log(x) = m*log(2) + log(2*F) + log(1-r)
;
;   r = (f/F), with (r <= 2^(-9))
;   r = f*(1/F) with (1/F) precomputed to avoid division
;
;   log(x) = m*log(2) + log(G) - poly
;
;   log(G) is precomputed
;   poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5 + (r^6)/6
;
;   log(2) and log(G) need to be maintained in extra precision
;   to avoid losing precision in the calculations.
;

.const

ALIGN 16
; Each scalar constant is followed by a zero qword so that every label is
; 16-byte aligned and usable either as a QWORD PTR scalar operand or as a
; full XMMWORD operand (the integer mask constants are used both ways).
__real_ninf         DQ 0fff0000000000000h   ; -inf
                    DQ 0000000000000000h
__real_inf          DQ 7ff0000000000000h    ; +inf
                    DQ 0000000000000000h
__real_neg_qnan     DQ 0fff8000000000000h   ; negative qNaN (result for x < 0)
                    DQ 0000000000000000h
__real_qnanbit      DQ 0008000000000000h    ; quiet bit, ORed in to quiet an sNaN
                    DQ 0000000000000000h
__real_min_norm     DQ 0010000000000000h    ; smallest normal double (not used in
                    DQ 0000000000000000h    ; the code paths visible in this file)
__real_mant         DQ 000FFFFFFFFFFFFFh    ; mantissa bits
                    DQ 0000000000000000h
__mask_1023         DQ 00000000000003ffh    ; IEEE754 double exponent bias
                    DQ 0000000000000000h
__mask_001          DQ 0000000000000001h    ; (not used in the visible code paths)
                    DQ 0000000000000000h
__mask_mant_all8    DQ 000ff00000000000h    ; top 8 mantissa bits -> table index
                    DQ 0000000000000000h
__mask_mant9        DQ 0000080000000000h    ; 9th mantissa bit, rounds the index
                    DQ 0000000000000000h
__real_two          DQ 4000000000000000h    ; 2
                    DQ 0000000000000000h
__real_one          DQ 3ff0000000000000h    ; 1
                    DQ 0000000000000000h
__real_near_one_lt  DQ 3fee000000000000h    ; .9375
                    DQ 0000000000000000h
__real_near_one_gt  DQ 3ff1000000000000h    ; 1.0625
                    DQ 0000000000000000h
__real_half         DQ 3fe0000000000000h    ; 1/2
                    DQ 0000000000000000h
__mask_100          DQ 0000000000000100h    ; (not used in the visible code paths)
                    DQ 0000000000000000h
__real_1_over_512   DQ 3f60000000000000h    ; 1/512
                    DQ 0000000000000000h
__real_1_over_2     DQ 3fe0000000000000h    ; 1/2
                    DQ 0000000000000000h
__real_1_over_3     DQ 3fd5555555555555h    ; 1/3
                    DQ 0000000000000000h
__real_1_over_4     DQ 3fd0000000000000h    ; 1/4
                    DQ 0000000000000000h
__real_1_over_5     DQ 3fc999999999999ah    ; 1/5
                    DQ 0000000000000000h
__real_1_over_6     DQ 3fc5555555555555h    ; 1/6
                    DQ 0000000000000000h
__mask_1023_f       DQ 0c08ff80000000000h   ; -1023.0; xexp compares equal => denormal
                    DQ 0000000000000000h
__mask_2045         DQ 00000000000007fdh    ; 2045 = 1023 + 1022, exponent re-bias
                    DQ 0000000000000000h    ; used by the denormal adjustment
__real_threshold    DQ 3fb0000000000000h    ; .0625, |x-1| cutoff for near-one path
                    DQ 0000000000000000h
__real_notsign      DQ 7ffFFFFFFFFFFFFFh    ; mask that clears the sign bit
                    DQ 0000000000000000h
; minimax coefficients for the near-one series
__real_ca1          DQ 3fb55555555554e6h    ; 8.33333333333317923934e-02
                    DQ 0000000000000000h
__real_ca2          DQ 3f89999999bac6d4h    ; 1.25000000037717509602e-02
                    DQ 0000000000000000h
__real_ca3          DQ 3f62492307f1519fh    ; 2.23213998791944806202e-03
                    DQ 0000000000000000h
__real_ca4          DQ 3f3c8034c85dfff0h    ; 4.34887777707614552256e-04
                    DQ 0000000000000000h
; log(2) split into a high (lead) part and a low (tail) part so that
; m*log(2) can be accumulated in extra precision
__real_log2_lead    DQ 03fe62e42e0000000h   ; 6.93147122859954833984e-01
                    DQ 00000000000000000h
__real_log2_tail    DQ 03e6efa39ef35793ch   ; 5.76999904754328540596e-08
                    DQ 00000000000000000h

; these codes and the ones in the corresponding .c file have to match
__flag_x_zero       DD 00000001
__flag_x_neg        DD 00000002
__flag_x_nan        DD 00000003

; tables and the runtime CPU-dispatch flag, defined elsewhere in the library
EXTRN __log_256_lead:QWORD
EXTRN __log_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
EXTRN __use_fma3_lib:DWORD

; Public name of this routine and name of the C special-case handler.
; FIX: the source text read "fname TEXTEQU" with no operand (the <log>
; text equate had been stripped), which does not assemble and leaves
; PUBLIC fname / fname PROC without an expansion. Restored.
fname           TEXTEQU <log>
fname_special   TEXTEQU <_log_special>

; define local variable storage offsets
save_xmm6       EQU 20h                     ; home for callee-saved xmm6 (Win x64 ABI)
dummy_space     EQU 40h
stack_size      EQU 58h                     ; keeps rsp 16-aligned after the prologue

include fm.inc

; external function
EXTERN fname_special:PROC

.code

ALIGN 16
PUBLIC fname
fname PROC FRAME
    ; Windows x64 prologue: allocate locals and preserve xmm6 (callee-saved),
    ; with unwind info emitted by the StackAllocate/SaveXmm macros (fm.inc).
    StackAllocate stack_size
    SaveXmm xmm6, save_xmm6
    .ENDPROLOG

    ; runtime dispatch: take the FMA3 path when the CPU supports it
    cmp DWORD PTR __use_fma3_lib, 0
    jne Llog_fma3

Llog_sse2:
        ; ---- SSE2 path ----------------------------------------------------
        ; compute exponent part: shift the biased exponent of x into the low
        ; qword and subtract the bias 1023 to obtain m.
        movdqa xmm3, xmm0
        movapd xmm4, xmm0
        psrlq xmm3, 52
        movd rax, xmm0                          ; rax <-- raw IEEE754 bits of x
        psubq xmm3, XMMWORD PTR __mask_1023     ; xmm3 <-- unbiased exponent m

        ; NaN or inf: with the sign bit cleared, any encoding >= +inf is
        ; special (inf or NaN)
        mov rcx, rax
        btr rcx, 63                             ; clear sign bit
        cmp rcx, QWORD PTR __real_inf
        jae __x_is_inf_or_nan

        movdqa xmm2, xmm0
        cvtdq2pd xmm6, xmm3                     ; xmm6 <-- xexp = (double)m
        pand xmm2, XMMWORD PTR __real_mant      ; xmm2 <-- mantissa bits of x
        subsd xmm4, QWORD PTR __real_one        ; xmm4 <-- x - 1 (for near-one test)
        comisd xmm6, QWORD PTR __mask_1023_f    ; xexp == -1023.0 => x is denormal
        je __denormal_adjust

__continue_common:
        andpd xmm4, XMMWORD PTR __real_notsign  ; xmm4 <-- |x - 1|

        ; compute index into the log tables: top 8 mantissa bits, rounded
        ; by the 9th bit (see F construction in the header comment)
        mov r9, rax
        and rax, QWORD PTR __mask_mant_all8
        and r9, QWORD PTR __mask_mant9
        shl r9, 1
        add rax, r9
        movd xmm1, rax

        ; near one codepath: |x - 1| < 0.0625 uses a dedicated series
        comisd xmm4, QWORD PTR __real_threshold
        jb __near_one

        ; F, Y
        shr rax, 44                             ; rax <-- table index (0..256)
        por xmm2, XMMWORD PTR __real_half       ; xmm2 <-- Y in [0.5, 1)
        por xmm1, XMMWORD PTR __real_half       ; xmm1 <-- F (rounded Y)
        lea r9, __log_F_inv_qword

        ; check for negative numbers or zero (flags are consumed again at
        ; __x_is_zero_or_neg to distinguish the two cases)
        xorpd xmm5, xmm5
        comisd xmm0, xmm5
        jbe __x_is_zero_or_neg

        ; f = F - Y, r = f * inv
        subsd xmm1, xmm2                        ; xmm1 <-- f = F - Y
        mulsd xmm1, QWORD PTR [r9+rax*8]        ; xmm1 <-- r = f * (1/F)
        movapd xmm2, xmm1                       ; xmm2 <-- copy of r
        movapd xmm0, xmm1                       ; xmm0 <-- copy of r
        lea r9, QWORD PTR __log_256_lead

        ; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6, split into two
        ; partial sums to shorten the dependency chain
        movsd xmm3, QWORD PTR __real_1_over_6
        movsd xmm1, QWORD PTR __real_1_over_3
        mulsd xmm3, xmm2                        ; xmm3 <-- r/6
        mulsd xmm1, xmm2                        ; xmm1 <-- r/3
        mulsd xmm0, xmm2                        ; xmm0 <-- r*r
        movapd xmm4, xmm0                       ; xmm4 <-- copy of r*r
        addsd xmm3, QWORD PTR __real_1_over_5   ; xmm3 <-- r/6 + 1/5
        addsd xmm1, QWORD PTR __real_1_over_2   ; xmm1 <-- r/3 + 1/2
        mulsd xmm4, xmm0                        ; xmm4 <-- r^4
        mulsd xmm3, xmm2                        ; xmm3 <-- (r/6 + 1/5)*r
        mulsd xmm1, xmm0                        ; xmm1 <-- (r/3 + 1/2)*r^2
        addsd xmm3, QWORD PTR __real_1_over_4   ; xmm3 <-- (r/6 + 1/5)*r + 1/4
        addsd xmm1, xmm2                        ; xmm1 <-- (r/3 + 1/2)*r^2 + r
        mulsd xmm3, xmm4                        ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
        addsd xmm1, xmm3                        ; xmm1 <-- poly

        ; result = (m*log2_lead + log(G)_lead)
        ;        + (m*log2_tail - poly + log(G)_tail)
        ; lead/tail kept separate for extra precision (see header comment)
        movsd xmm5, QWORD PTR __real_log2_tail
        mulsd xmm5, xmm6                        ; xmm5 <-- m*log2_tail
        subsd xmm5, xmm1                        ; xmm5 <-- m*log2_tail - poly
        movsd xmm0, QWORD PTR [r9+rax*8]        ; xmm0 <-- log(G)_lead
        lea rdx, QWORD PTR __log_256_tail
        movsd xmm2, QWORD PTR [rdx+rax*8]       ; xmm2 <-- log(G)_tail
        addsd xmm2, xmm5                        ; xmm2 <-- tail sum
        movsd xmm4, QWORD PTR __real_log2_lead
        mulsd xmm4, xmm6                        ; xmm4 <-- m*log2_lead
        addsd xmm0, xmm4                        ; xmm0 <-- lead sum
        addsd xmm0, xmm2                        ; xmm0 <-- log(x)
        RestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
__near_one:
        ; |x-1| < 0.0625: evaluate log(1+r) with u = 2r/(2+r) and the ca1..ca4
        ; coefficient series, subtracting a correction term for extra accuracy.
        ; r = x - 1.0
        movsd xmm2, QWORD PTR __real_two
        subsd xmm0, QWORD PTR __real_one        ; xmm0 <-- r
        addsd xmm2, xmm0                        ; xmm2 <-- 2 + r
        movsd xmm1, xmm0
        divsd xmm1, xmm2                        ; xmm1 <-- r/(2+r) = u/2
        movsd xmm4, QWORD PTR __real_ca2
        movsd xmm5, QWORD PTR __real_ca4
        movsd xmm6, xmm0
        mulsd xmm6, xmm1                        ; xmm6 <-- correction = r*u/2
        addsd xmm1, xmm1                        ; xmm1 <-- u
        movsd xmm2, xmm1
        mulsd xmm2, xmm1                        ; xmm2 <-- u^2
        mulsd xmm4, xmm2
        mulsd xmm5, xmm2
        addsd xmm4, __real_ca1                  ; xmm4 <-- ca1 + ca2*u^2
        addsd xmm5, __real_ca3                  ; xmm5 <-- ca3 + ca4*u^2
        mulsd xmm2, xmm1                        ; xmm2 <-- u^3
        mulsd xmm4, xmm2                        ; xmm4 <-- (ca1 + ca2*u^2)*u^3
        mulsd xmm2, xmm2
        mulsd xmm2, xmm1                        ; xmm2 <-- u^7
        mulsd xmm5, xmm2                        ; xmm5 <-- (ca3 + ca4*u^2)*u^7
        addsd xmm4, xmm5
        subsd xmm4, xmm6                        ; subtract correction
        addsd xmm0, xmm4                        ; xmm0 <-- r + (series - correction)
        RestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
__denormal_adjust:
        ; Denormal x: renormalize via (x | 1.0) - 1.0, then extract the new
        ; mantissa bits and the corrected exponent (re-biased by 2045).
        por xmm2, XMMWORD PTR __real_one
        subsd xmm2, QWORD PTR __real_one
        movsd xmm5, xmm2
        pand xmm2, XMMWORD PTR __real_mant
        movd rax, xmm2                          ; rax <-- renormalized mantissa bits
        psrlq xmm5, 52
        psubd xmm5, XMMWORD PTR __mask_2045     ; exponent - 2045 => true exponent
        cvtdq2pd xmm6, xmm5                     ; xmm6 <-- corrected xexp
        jmp __continue_common

ALIGN 16
__x_is_zero_or_neg:
        ; flags still hold the comisd(x, 0.0) result: ZF set => x == 0
        jne __x_is_neg
        movsd xmm1, QWORD PTR __real_ninf       ; log(0) -> -inf
        mov r8d, DWORD PTR __flag_x_zero        ; error code for the C handler
        call fname_special                      ; handler defined in the matching .c file
        jmp __finish

ALIGN 16
__x_is_neg:
        movsd xmm1, QWORD PTR __real_neg_qnan   ; log(x<0) -> qNaN
        mov r8d, DWORD PTR __flag_x_neg
        call fname_special
        jmp __finish

ALIGN 16
__x_is_inf_or_nan:
        cmp rax, QWORD PTR __real_inf
        je __finish                             ; log(+inf) = +inf, returned as-is
        cmp rax, QWORD PTR __real_ninf
        je __x_is_neg                           ; -inf treated as a negative input
        or rax, QWORD PTR __real_qnanbit        ; quiet the NaN before reporting
        movd xmm1, rax
        mov r8d, DWORD PTR __flag_x_nan
        call fname_special
        jmp __finish

ALIGN 16
__finish:
        ; common epilogue for all special-case paths
        RestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3:
        ; ---- AVX/FMA3 path: same algorithm as the SSE2 path above ---------
        ; compute exponent part
        xor rax,rax
        vpsrlq xmm3,xmm0,52
        vmovq rax,xmm0                          ; rax <-- raw IEEE754 bits of x
        vpsubq xmm3,xmm3,XMMWORD PTR __mask_1023
        vcvtdq2pd xmm6,xmm3                     ; xmm6 <-- xexp
        ; NaN or inf: isolate the exponent field and compare against +inf
        vpand xmm5,xmm0,XMMWORD PTR __real_inf
        vcomisd xmm5,QWORD PTR __real_inf
        je Llog_fma3_x_is_inf_or_nan

        ; check for negative numbers or zero (flags consumed again at
        ; Llog_fma3_x_is_zero_or_neg to distinguish the two cases)
        vpxor xmm5,xmm5,xmm5
        vcomisd xmm0,xmm5
        jbe Llog_fma3_x_is_zero_or_neg

        vpand xmm2,xmm0,XMMWORD PTR __real_mant ; xmm2 <-- mantissa bits of x
        vsubsd xmm4,xmm0,QWORD PTR __real_one   ; xmm4 <-- x - 1
        vcomisd xmm6,QWORD PTR __mask_1023_f    ; xexp == -1023.0 => denormal
        je Llog_fma3_denormal_adjust

Llog_fma3_continue_common:
        ; compute index into the log tables (top 8 mantissa bits rounded
        ; by the 9th bit)
        vpand xmm1,xmm0,XMMWORD PTR __mask_mant_all8
        vpand xmm3,xmm0,XMMWORD PTR __mask_mant9
        vpsllq xmm3,xmm3,1
        vpaddq xmm1,xmm3,xmm1
        vmovq rax,xmm1

        ; near one codepath
        vpand xmm4,xmm4,XMMWORD PTR __real_notsign ; xmm4 <-- |x - 1|
        vcomisd xmm4,QWORD PTR __real_threshold
        jb Llog_fma3_near_one

        ; F,Y
        shr rax,44                              ; rax <-- table index
        vpor xmm2,xmm2,XMMWORD PTR __real_half  ; xmm2 <-- Y in [0.5, 1)
        vpor xmm1,xmm1,XMMWORD PTR __real_half  ; xmm1 <-- F
        lea r9,QWORD PTR __log_F_inv_qword

        ; f = F - Y, r = f * inv
        vsubsd xmm1,xmm1,xmm2
        vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8] ; xmm1 <-- r = f * (1/F)
        lea r9,QWORD PTR __log_256_lead

        ; poly: same series as the SSE2 path, contracted with FMA
        vmulsd xmm0,xmm1,xmm1                   ; xmm0 <-- r*r
        vmovsd xmm3,QWORD PTR __real_1_over_6
        vmovsd xmm5,QWORD PTR __real_1_over_3
        vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
        vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; r*1/3 + 1/2
        vmovsd xmm4,xmm0,xmm0                   ; copy of r*r (overwritten just below)
        vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; (r/6+1/5)*r + 1/4
        vmulsd xmm4,xmm0,xmm0                   ; xmm4 <-- r^4
        vfmadd231sd xmm1,xmm5,xmm0              ; xmm1 <-- r + r^2*(1/2 + r/3)
        vfmadd231sd xmm1,xmm3,xmm4              ; xmm1 <-- poly

        ; (m*log2_lead + log(G)_lead) + (m*log2_tail - poly + log(G)_tail)
        vmovsd xmm5,QWORD PTR __real_log2_tail
        vfmsub213sd xmm5,xmm6,xmm1              ; xmm5 <-- m*log2_tail - poly
        vmovsd xmm0,QWORD PTR[r9 + rax * 8]     ; xmm0 <-- log(G)_lead
        lea rdx,QWORD PTR __log_256_tail
        vmovsd xmm1,QWORD PTR[rdx + rax * 8]    ; xmm1 <-- log(G)_tail
        vaddsd xmm1,xmm1,xmm5                   ; tail sum
        vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead ; lead sum
        vaddsd xmm0,xmm0,xmm1                   ; xmm0 <-- log(x)
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3_near_one:
        ; |x-1| < 0.0625: same series as SSE2 __near_one, FMA-contracted
        ; r = x - 1.0
        vmovsd xmm3,QWORD PTR __real_two
        vsubsd xmm0,xmm0,QWORD PTR __real_one   ; xmm0 <-- r
        vaddsd xmm3,xmm3,xmm0                   ; xmm3 <-- 2 + r
        vdivsd xmm1,xmm0,xmm3                   ; xmm1 <-- r/(2+r) = u/2
        vmovsd xmm4,QWORD PTR __real_ca2
        vmovsd xmm5,QWORD PTR __real_ca4
        vmulsd xmm3,xmm0,xmm1                   ; xmm3 <-- correction = r*u/2
        vaddsd xmm1,xmm1,xmm1                   ; xmm1 <-- u
        vmulsd xmm2,xmm1,xmm1                   ; xmm2 <-- u^2
        vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 ; ca1 + ca2*u^2
        vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 ; ca3 + ca4*u^2
        vmulsd xmm2,xmm2,xmm1                   ; xmm2 <-- u^3
        vmulsd xmm4,xmm4,xmm2                   ; (ca1 + ca2*u^2)*u^3
        vmulsd xmm2,xmm2,xmm2
        vmulsd xmm2,xmm2,xmm1                   ; xmm2 <-- u^7
        vfmadd231sd xmm4,xmm5,xmm2              ; + (ca3 + ca4*u^2)*u^7
        vsubsd xmm4,xmm4,xmm3                   ; subtract correction
        vaddsd xmm0,xmm0,xmm4                   ; xmm0 <-- result
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

Llog_fma3_denormal_adjust:
        ; Denormal x: renormalize via (x | 1.0) - 1.0, recompute mantissa
        ; bits (left in xmm0 for the table indexing above) and the corrected
        ; exponent (re-biased by 2045).
        vpor xmm2,xmm2,XMMWORD PTR __real_one
        vsubsd xmm2,xmm2,QWORD PTR __real_one
        vpsrlq xmm5,xmm2,52
        vpand xmm2,xmm2,XMMWORD PTR __real_mant
        vmovapd xmm0,xmm2                       ; xmm0 <-- renormalized mantissa bits
        vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045
        vcvtdq2pd xmm6,xmm5                     ; xmm6 <-- corrected xexp
        jmp Llog_fma3_continue_common

ALIGN 16
Llog_fma3_x_is_zero_or_neg:
        ; flags still hold the vcomisd(x, 0.0) result: ZF set => x == 0
        jne Llog_fma3_x_is_neg
        vmovsd xmm1,QWORD PTR __real_ninf       ; log(0) -> -inf
        mov r8d,DWORD PTR __flag_x_zero
        call fname_special                      ; handler defined in the matching .c file
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3_x_is_neg:
        vmovsd xmm1,QWORD PTR __real_neg_qnan   ; log(x<0) -> qNaN
        mov r8d,DWORD PTR __flag_x_neg
        call fname_special
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3_x_is_inf_or_nan:
        cmp rax,QWORD PTR __real_inf
        je Llog_fma3_finish                     ; log(+inf) = +inf, returned as-is
        cmp rax,QWORD PTR __real_ninf
        je Llog_fma3_x_is_neg                   ; -inf treated as a negative input
        or rax,QWORD PTR __real_qnanbit         ; quiet the NaN before reporting
        vmovq xmm1,rax
        mov r8d,DWORD PTR __flag_x_nan
        call fname_special
        ; falls through (the ALIGN emits padding) into the common epilogue
ALIGN 16
Llog_fma3_finish:
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret
fname endp

END