; ; ; MIT License ; ----------- ; ; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. ; ; Permission is hereby granted, free of charge, to any person obtaining a copy ; of this Software and associated documentaon files (the "Software"), to deal ; in the Software without restriction, including without limitation the rights ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ; copies of the Software, and to permit persons to whom the Software is ; furnished to do so, subject to the following conditions: ; ; The above copyright notice and this permission notice shall be included in ; all copies or substantial portions of the Software. ; ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ; THE SOFTWARE. ; ; An implementation of the tan function. ; ; Prototype: ; ; double tan(double x); ; ; Computes tan(x). ; It will provide proper C99 return values, ; but may not raise floating point status bits properly. ; Based on the NAG C implementation. ; ; If FMA3 hardware is present, it will be used for the calculation. ; .const ALIGN 16 L_signbit DQ 08000000000000000h DQ 08000000000000000h ; duplicate for pd L_sign_mask DQ 07FFFFFFFFFFFFFFFh DQ 07FFFFFFFFFFFFFFFh ; duplicate for pd L_int_one DQ 00000000000000001h DQ 00000000000000001h ; duplicate for pd L_twobypi DQ 03FE45F306DC9C883h DQ 03FE45F306DC9C883h ; duplicate for pd L_point_333 DQ 03FD5555555555555h; 1/3 DQ 03FD5555555555555h ; duplicate for pd L_tan_p0 DQ 03FD7D50F6638564Ah ; 0.372379159759792203640806338901e0 DQ 03FD7D50F6638564Ah ; duplicate for pd L_tan_p2 DQ 0BF977C24C7569ABBh ; -0.229345080057565662883358588111e-1 DQ 0BF977C24C7569ABBh ; duplicate for pd L_tan_p4 DQ 03F2D5DAF289C385Ah ; 0.224044448537022097264602535574e-3 DQ 03F2D5DAF289C385Ah ; duplicate for pd L_tan_q0 DQ 03FF1DFCB8CAA40B8h ; 0.111713747927937668539901657944e1 DQ 03FF1DFCB8CAA40B8h ; duplicate for pd L_tan_q2 DQ 0BFE08046499EB90Fh ; -0.515658515729031149329237816945e0 DQ 0BFE08046499EB90Fh ; duplicate for pd L_tan_q4 DQ 03F9AB0F4F80A0ACFh ; 0.260656620398645407524064091208e-1 DQ 03F9AB0F4F80A0ACFh ; duplicate for pd L_tan_q6 DQ 0BF2E7517EF6D98F8h ; -0.232371494088563558304549252913e-3 DQ 0BF2E7517EF6D98F8h ; duplicate for pd L_half_mask DQ 0ffffffff00000000h DQ 0ffffffff00000000h ; duplicate for pd L_piby4_lead DQ 03FE921FB54442D18h ; pi/4, high part DQ 03FE921FB54442D18h ; duplicate for pd L_piby4_tail DQ 03C81A62633145C06h ; pi/4, low parft DQ 03C81A62633145C06h ; duplicate for pd ; Different parts of argument reduction need different versions of pi/2 L_piby2_1 DQ 03FF921FB54400000h ; pi/2, high 33 bits L_piby2_1tail DQ 03DD0B4611A626331h ; pi/2, second 53 bits, overlaps... L_piby2_2 DQ 03DD0B4611A600000h ; pi/2, second 33 bits L_piby2_2tail DQ 03BA3198A2E037073h ; pi/2, third 53 bits, overlaps... L_piby2_3 DQ 03BA3198A2E000000h ; pi/2, third 33 bits L_piby2_3tail DQ 0397B839A252049C1h ; pi/2, fourth 53 bits ; end of pi/2 versions L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27 L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 L_inf_mask_64 DQ 07FF0000000000000h L_point_five DQ 03FE0000000000000h L_point_68 DQ 03FE5C28F5C28F5C3h ; .68 L_n_point_68 DQ 0BFE5C28F5C28F5C3h ; -.68 L_zero DQ -0000000000000000h ; 0.0 L_one DQ 03FF0000000000000h ; 1.0 L_n_one DQ 0BFF0000000000000h ; -1.0 L_two DQ 04000000000000000h ; 2.0 L_moderate_arg_cw DQ 0411E848000000000h ; 5.e5 L_moderate_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL fname TEXTEQU fname_special TEXTEQU <_tan_special> ; local storage offsets save_xmm6 EQU 020h save_xmm7 EQU 030h store_input EQU 040h save_r10 EQU 050h dummy_space EQU 060h stack_size EQU 088h include fm.inc EXTERN __use_fma3_lib:DWORD EXTERN fname_special : PROC EXTERN __remainder_piby2_fma3 : PROC EXTERN __remainder_piby2_fma3_bdl : PROC EXTERN __remainder_piby2_forAsm : PROC EXTERN _set_statfp : PROC .code ALIGN 16 PUBLIC fname fname PROC FRAME StackAllocate stack_size SaveXmm xmm6, save_xmm6 SaveXmm xmm7, save_xmm7 .ENDPROLOG cmp DWORD PTR __use_fma3_lib, 0 jne Ltan_fma3 Ltan_sse2: movd rdx, xmm0 ; really movq movaps xmm6, xmm0 mov rcx, rdx btr rcx, 63 ; rcx <-- |x| cmp rcx, L_piby4_lead ja Ltan_abs_x_nle_pio4 ; branch if > pi/4 or NaN cmp rcx, L_two_to_neg_13 jae Ltan_abs_x_ge_two_to_neg_13 cmp rcx, L_two_to_neg_27 jae Labs_x_ge_two_to_neg_27 ; At this point tan(x) ~= x; if it's not exact, set the inexact flag test rcx, rcx je Ltan_return mov ecx, 20h ; ecx <-- AMD_F_INEXACT call _set_statfp movaps xmm0, xmm6 ; may be redundant, but xmm0 <-- x RestoreXmm xmm7, save_xmm7 RestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret 0 Labs_x_ge_two_to_neg_27: mulsd xmm0, xmm0 mulsd xmm0, xmm6 mulsd xmm0, QWORD PTR L_point_333 addsd xmm0, xmm6 RestoreXmm xmm7, save_xmm7 RestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret 0 Ltan_abs_x_ge_two_to_neg_13: xorps xmm1, xmm1 ; xmm1 <-- xx = 0 xor r8d, r8d ; r8 <-- recip flag = 0 call _tan_piby4 Ltan_return: RestoreXmm xmm7, save_xmm7 RestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret 0 Ltan_abs_x_nle_pio4: cmp rcx, L_inf_mask_64 ; |x| uint >= +inf as uint ? jnae Ltan_x_is_finite call fname_special RestoreXmm xmm7, save_xmm7 RestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret ALIGN 16 Ltan_x_is_finite: xor r8d, r8d xor r10, r10 cmp rcx, rdx setne r10b ; r10 <-- x was negative flag andpd xmm6, L_sign_mask movsd xmm0, QWORD PTR L_moderate_arg_cw ; currently 5e5 comisd xmm0, xmm6 jbe Ltan_x_is_very_large Ltan_x_is_moderate: ; unused label ; For these arguments we do a Cody-Waite reduction, subtracting the ; appropriate multiple of pi/2, using extra precision where x is close ; to an exact multiple of pi/2 ; We special-case region setting for |x| <= 9pi/4 ; It seems strange that this speeds things up, but it does mov rdx, rcx mov rax, 4616025215990052958 ; 400f6a7a2955385eH (5pi/4) shr rdx, 52 ; rdx <-- xexp cmp rcx, rax ja Labs_x_gt_5pio4 mov rax, 4612488097114038738 ; 4002d97c7f3321d2H (3pi/4) cmp rcx, rax seta r8b inc r8d ; r8d <-- region (1 or 2) jmp Lhave_region Labs_x_gt_5pio4: mov rax, 4619644535898419899 ; 401c463abeccb2bbH (9pi/4) cmp rcx, rax ja Lneed_region_computation mov rax, 4617875976460412789 ; 4015fdbbe9bba775H (7pi/4) cmp rcx, rax seta r8b add r8d, 3 ; r8d <-- region (3 or 4) jmp Lhave_region ALIGN 16 Lneed_region_computation: movaps xmm0, xmm6 mulsd xmm0, QWORD PTR L_twobypi addsd xmm0, QWORD PTR L_point_five cvttsd2si r8d, xmm0 ; r8d <-- region Lhave_region: movd xmm3, r8d cvtdq2pd xmm3, xmm3 movaps xmm2, xmm3 movaps xmm0, xmm3 mulsd xmm0, QWORD PTR L_piby2_1 mulsd xmm2, QWORD PTR L_piby2_1tail ; xmm2 < rtail = npi2 * piby2_1tail subsd xmm6, xmm0 ; xmm6 <-- rhead = x - npi2*piby2_1 ; If x is not too close to multiple of pi/2, ; we're essentially done with reduction ; If the exponent of rhead is not close to that of x, ; then most of x has been subtracted away in computing rhead; ; i.e., x is close to a multiple of pi/2. movd rax, xmm6 shr rax, 52 and eax, 2047 sub rdx, rax ; rdx <-- exp diff of x vs rhead cmp rdx, 15 jbe Ltan_have_rhead_rtail ; Oops, x is almost a multiple of pi/2. Compute more bits of reduced x ; t = rhead; ; rtail = npi2 * piby2_2; ; rhead = t - rtail; ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); movaps xmm1, xmm6 movaps xmm0, xmm3 movaps xmm2, xmm3 mulsd xmm0, QWORD PTR L_piby2_2 mulsd xmm2, QWORD PTR L_piby2_2tail subsd xmm6, xmm0 subsd xmm1, xmm6 subsd xmm1, xmm0 subsd xmm2, xmm1 cmp rdx, 48 jbe Ltan_have_rhead_rtail ; We've done enough ; Wow, x is REALLY close to a multiple of pi/2. Compute more bits. ; t = rhead; ; rtail = npi2 * piby2_3; ; rhead = t - rtail; ; rtail = npi2 * piby2_3tail - ((t - rhead) - rtail); movaps xmm1, xmm6 movaps xmm0, xmm3 movaps xmm2, xmm3 mulsd xmm0, QWORD PTR L_piby2_3 mulsd xmm2, QWORD PTR L_piby2_3tail subsd xmm6, xmm0 ; xmm6 <-- rhead = t - rtail subsd xmm1, xmm6 ; xmm1 <-- t - rhead subsd xmm1, xmm0 ; xmm1 <-- ((t - rhead) - rtail) subsd xmm2, xmm1 ; xmm2 <-- final rtail Ltan_have_rhead_rtail: ; At this point xmm6 has a suitable rhead, xmm2 a suitable rtail movaps xmm0, xmm6 ; xmm0 <-- copy of rhead ; r = rhead - rtail ; rr = (rhead - r) - rtail; ; region = npi2 & 3; and r8d, 3 ; r8d <-- region subsd xmm0, xmm2 ; xmm0 <-- r = rhead - rtail subsd xmm6, xmm0 ; xmm6 <-- rhead - r subsd xmm6, xmm2 ; xmm6 <-- rr = (rhead - r) - rtail Ltan_do_tan_computation: and r8d, 1 ; r8d <-- region & 1 movaps xmm1, xmm6 call _tan_piby4 test r10d, r10d je Ltan_pos_return xorpd xmm0, QWORD PTR L_signbit Ltan_pos_return: RestoreXmm xmm7, save_xmm7 RestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret 0 ALIGN 16 Ltan_x_is_very_large: ; Reduce x into range [-pi/4,pi/4] (general case) movaps xmm0, xmm6 mov QWORD PTR [rsp+save_r10], r10 call __remainder_piby2_forAsm ; this call clobbers r10 mov r10, QWORD PTR [rsp+save_r10] movapd xmm6,xmm1 ; xmm6 <-- rr mov r8d,eax ; r8d <-- region jmp Ltan_do_tan_computation ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; From here on, it is assumed that the hardware supports FMA3 (and AVX). ALIGN 16 Ltan_fma3: vmovq r9,xmm0 mov rdx,r9 ; rdx <-- x btr r9,63 ; r9 <-- |x| cmp r9,L_piby4_lead jae Ltan_fma3_absx_gt_pio4 ; Note that NaN will branch Ltan_fma3_absx_le_pio4: ; no argument reduction is needed, so recip is 0, xx is 0. ; Note that this routine is not special-casing very small |x| vmovsd xmm5,L_piby4_lead vmovsd xmm6,L_piby4_tail vxorpd xmm1,xmm1,xmm1 ; xx <-- 0. vxorpd xmm7,xmm7,xmm7 ; transform <-- 0 comisd xmm0,L_point_68 jbe Ltan_fma3_small_x_le_point_68 Ltan_fma3_x_small_gt_point_68: vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0 vsubsd xmm0,xmm5,xmm0 ; x = piby4_lead - x vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail jmp Ltan_fma3_compute_Remez_for_small_x ALIGN 16 Ltan_fma3_small_x_le_point_68: comisd xmm0,L_n_point_68 jae Ltan_fma3_compute_Remez_for_small_x Ltan_fma3_small_x_lt_neg_point_68: vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0 vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail Ltan_fma3_compute_Remez_for_small_x: ; At this point xmm0 holds x, possibly transformed ; now do core Remez rational approximation for x in [0,0.68] vmovsd xmm4,L_tan_q6 vmovsd xmm3,L_tan_p4 vmulsd xmm2,xmm0,xmm0 ; xx is 0, so xmm2 <-- r = x*x vfmadd213sd xmm4,xmm2,L_tan_q4 vfmadd213sd xmm3,xmm2,L_tan_p2 vfmadd213sd xmm4,xmm2,L_tan_q2 vfmadd213sd xmm3,xmm2,L_tan_p0 ; xmm3 <-- p2 (polynomial) vfmadd213sd xmm4,xmm2,L_tan_q0 ; xmm4 <-- q3 (polynomial) vdivsd xmm3,xmm3,xmm4 ; xmm3 <-- r3 = p2/q3 vmulsd xmm3,xmm3,xmm2 ; xmm3 <-- r * r3 vfmadd132sd xmm0,xmm0,xmm3 ; xx = 0, so xmm0 <-- t = x + x*(r*r3) comisd xmm7,L_zero ; did we transform x? ; if x was transformed, we need to transform t to get answer; ; if not, the answer is just t. je Ltan_fma3_ext_piby4_zero ; x was transformed, so answer is +- (1. - 2.*t/(1.+t)) ; (remember recip is 0 here) vmovsd xmm3,L_one vaddsd xmm4,xmm0,L_one ; xmm4 <-- 1. + t vdivsd xmm6,xmm0,xmm4 ; xmm6 <-- t / (1.+t) vfnmadd231sd xmm3,xmm6,L_two ; xmm3 <-- 1. - 2.*t/(1.+t) vmulsd xmm0,xmm3,xmm7 ; multiply by +- 1. Ltan_fma3_ext_piby4_zero: ; restore volatile registers AVXRestoreXmm xmm7, save_xmm7 AVXRestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret 0 ALIGN 16 Ltan_fma3_absx_gt_pio4: ;;; come here if |x| > pi/4 cmp r9, L_inf_mask_64 jae Ltan_fma3_naninf ;Ltan_fma3_range_reduce: vmovapd [store_input + rsp],xmm0 ; save copy of x vmovq xmm0,r9 ; xmm0l <-- |x| cmp r9,L_moderate_arg_bdl jge Ltan_fma3_remainder_piby2 ; go elsewhere if |x| > 500000. ; Note that __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl ; have calling conventions that differ from the C routine ; on input ; |x| is in xmm0 ; on output ; z is in xmm0 ; zz is in xmm1 ; where z + zz = arg reduced |x| and zz is small compared to z ; region of |x| is in rax Ltan_fma3_remainder_piby2_small: ; Boldo-Daumas-Li reduction for reasonably small |x| call __remainder_piby2_fma3_bdl Ltan_fma3_full_computation: ; we have done argument reduction; recip and xx may be nonzero ; x is in xmm0, xx is in xmm1 ; recip is region & 1, and region is in rax. vmovsd xmm5,L_piby4_lead vmovsd xmm6,L_piby4_tail vxorpd xmm7,xmm7,xmm7 ; transform <-- 0 vcomisd xmm0,L_point_68 jbe Ltan_fma3_full_x_le_point_68 Ltan_fma3_full_x_gt_point_68: vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0 vsubsd xmm0,xmm5,xmm0 ; xmm0 <-- x = piby4_lead - x vsubsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = pibi4_tail - xx vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0 jmp Ltan_fma3_compute_Remez ALIGN 16 Ltan_fma3_full_x_le_point_68: vcomisd xmm0,L_n_point_68 jae Ltan_fma3_compute_Remez Ltan_fma3_full_x_lt_neg_point_68: vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0 vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x vaddsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = piby4_tail + xx vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0 Ltan_fma3_compute_Remez: vmulsd xmm2,xmm0,xmm0 ; xmm2 <-- x*x vmulsd xmm5,xmm1,xmm0 ; xmm5 <-- x*xx vfmadd132sd xmm5,xmm2,L_two ; xmm5 <-- r = x*x + 2.*x*xx vmovsd xmm2,L_tan_p4 vfmadd213sd xmm2,xmm5,L_tan_p2 ; xmm2 <-- p4*r+p2 vfmadd213sd xmm2,xmm5,L_tan_p0 ; xmm2 <-- p = (p4*r+p2)*r+p0 vmovsd xmm4,L_tan_q6 vfmadd213sd xmm4,xmm5,L_tan_q4 ; xmm4 <-- q6*r+q4 vfmadd213sd xmm4,xmm5,L_tan_q2 ; xmm4 <-- (q6*r+q4)*r+q2 vfmadd213sd xmm4,xmm5,L_tan_q0 ; xmm4 <-- q = ((q6*r+q4)*r+q2)*r+q0 vdivsd xmm2,xmm2,xmm4 ; xmm2 <-- p/q vmulsd xmm2,xmm2,xmm5 ; xmm2 <-- r*p/q vfmadd213sd xmm2,xmm0,xmm1 ; xmm2 <-- t2 = xx + x*r*(p/q) vaddsd xmm1,xmm0,xmm2 ; xmm1 <-- t = (t1=x) + t2 ; If |x| > .68 we transformed, and t is an approximation of ; tan(pi/4 +- (x+xx)) ; otherwise, t is just tan(x+xx) vxorpd xmm6,xmm6,xmm6 vcomisd xmm7,xmm6 ; did we transform? (|x| > .68) ? jz Ltan_fma3_if_recip_set ; if not, go check recip Ltan_fma3_if_transfor_set: ; Because we transformed x+xx, we have to transform t before returning ; let transform be 1 for x > .68, -1 for x < -.68, then we return ; transform * (recip ? (2.*t/(t-1.) - 1.) : (1. - 2.*t/(1.+t))) vaddsd xmm6,xmm1,xmm1 ; xmm6 <-- 2.*t vmovsd xmm4,L_one vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1 vsubsd xmm5,xmm1,xmm4 ; xmm5 <-- t-1 bt rax,0 jc Ltan_fma3_transform_and_recip_set ; here recip is not set vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1 vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t+1) vfnmadd132sd xmm2,xmm4,L_two ; xmm2 <-- 1 - 2*t/(t+1) vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(1 - 2*t/(t+1)) jmp Ltan_fma3_exit_piby4 ALIGN 16 Ltan_fma3_transform_and_recip_set: ; here recip is set vsubsd xmm2,xmm1,xmm4 ; xmm2 <-- t-1 vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t-1) vfmsub132sd xmm2,xmm4,L_two ; xmm2 <-- 2*t/(t-1) - 1 vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(2*t/(t-1) - 1) jmp Ltan_fma3_exit_piby4 ALIGN 16 Ltan_fma3_if_recip_set: ; Here we did not transform x and xx, but if we are in an odd quadrant ; we will need to return -1./(t1+t2), computed accurately ; (t=t1 is in xmm1, t2 is in xmm2) bt rax,0 jnc Ltan_fma3_exit_piby4 vandpd xmm7,xmm1,L_half_mask ; xmm7 <-- z1 = high bits of t vsubsd xmm4,xmm7,xmm0 ; xmm4 <-- z1 - t1 vsubsd xmm4,xmm2,xmm4 ; xmm4 <-- z2 = t2 - (z1-t1) vmovsd xmm2,L_n_one vdivsd xmm2,xmm2,xmm1 ; xmm2 <-- trec = -1./t vandpd xmm5,xmm2,L_half_mask ; xmm5 <-- trec_top=high bits of trec vfmadd213sd xmm7,xmm5,L_one ; xmm7 <-- trec_top*z1 + 1. vfmadd231sd xmm7 ,xmm4,xmm5 ; xmm7 <-- z2*trec_top + (trec_top*z1 + 1.) vfmadd213sd xmm7,xmm2,xmm5 ; xmm7 <-- u = trec_top + trec*(z2*trec_top + (trec_top*z1+1.)) vmovapd xmm1,xmm7 ; xmm1 <-- u Ltan_fma3_exit_piby4: vmovapd xmm0,xmm1 ; xmm0 <-- t, u, or v, as needed vmovapd xmm1,[store_input + rsp] vandpd xmm1,xmm1,L_signbit vxorpd xmm0,xmm0,xmm1 ; tan(-x) = -tan(x) ; restore volatile registers AVXRestoreXmm xmm7, save_xmm7 AVXRestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret ALIGN 16 Ltan_fma3_remainder_piby2: ; argument reduction for general x call __remainder_piby2_fma3 jmp Ltan_fma3_full_computation Ltan_fma3_naninf: ; here argument is +-Inf or NaN. Special case. call fname_special AVXRestoreXmm xmm7, save_xmm7 AVXRestoreXmm xmm6, save_xmm6 StackDeallocate stack_size ret fname endp ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; .const tan_piby4_save_xmm6 EQU 030h tan_piby4_stack_size EQU 048h .code ALIGN 16 _tan_piby4 PROC PRIVATE FRAME StackAllocate tan_piby4_stack_size SaveXmm xmm6, tan_piby4_save_xmm6 .ENDPROLOG ; Compute tangent for x+xx in [-pi/4,pi/4]. ; xmm0 has x ; xmm1 has xx ; r8d has recip. If recip is true, return -1/tan(x+xx) else tan(x+xx) xor eax, eax comisd xmm0, QWORD PTR L_point_68 movaps xmm3, xmm1 movaps xmm6, xmm0 jbe Ltan_piby4_x_le_point_68 ; Here x > .68, so we transform x using the identity ; tan(pi/4-x) = (1-tan(x))/(1+tan(x)) movsd xmm2, QWORD PTR L_piby4_lead mov eax, 1 ; eax <-- transform = 1 subsd xmm2, xmm0 ; xmm2 <-- x = piby4_lead - x movsd xmm0, QWORD PTR L_piby4_tail subsd xmm0, xmm1 ; xmm0 <-- xl = piby4_tail - xx movaps xmm6, xmm2 addsd xmm6, xmm0 ; xmm6 <-- x = x + xl xorps xmm3,xmm3 ; xmm3 <-- xx = 0. jmp Ltan_piby4_do_remez Ltan_piby4_x_le_point_68: ; 43 : else if (x < -0.68) movsd xmm0, QWORD PTR L_n_point_68 comisd xmm0, xmm6 jbe Ltan_piby4_do_remez ; jump if x >= -.68 ; Here x < -.68, so we transform x using the identity ; tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) addsd xmm6, QWORD PTR L_piby4_lead ; xmm6 <-- x = piby4_lead + x addsd xmm3, QWORD PTR L_piby4_tail ; xmm3 <-- xl = piby4_tail + xx or eax, -1 ; eax <-- transform = -1 addsd xmm6, xmm3 ; xmm6 <-- x = x + xl xorps xmm3, xmm3 ; xmm3 <-- xx = 0 Ltan_piby4_do_remez: ; Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. movaps xmm0, xmm6 movaps xmm2, xmm6; ; An implementation of the tan function. ; ; Prototype: ; ; double tan(double x); ; ; Computes tan(x). ; It will provide proper C99 return values, ; but may not raise floating point status bits properly. ; Based on the NAG C implementation. ; ; mulsd xmm0, xmm6 ; xmm0 <-- x*x addsd xmm2, xmm2 ; xmm2 <-- 2*x mulsd xmm2, xmm3 ; xmm2 <-- 2*x*xx addsd xmm2, xmm0 ; xmm2 <-- r = x*x + 2*x*xx ; Magic Remez approximation movaps xmm0, xmm2 movaps xmm5, xmm2 movaps xmm1, xmm2 mulsd xmm5, QWORD PTR L_tan_p4 mulsd xmm1, QWORD PTR L_tan_q6 mulsd xmm0, xmm6 addsd xmm5, QWORD PTR L_tan_p2 mulsd xmm5, xmm2 addsd xmm5, QWORD PTR L_tan_p0 mulsd xmm5, xmm0 movsd xmm0, QWORD PTR L_tan_q4 addsd xmm0, xmm1 mulsd xmm0, xmm2 addsd xmm0, QWORD PTR L_tan_q2 mulsd xmm0, xmm2 addsd xmm0, QWORD PTR L_tan_q0 divsd xmm5, xmm0 addsd xmm5, xmm3 ; xmm5 <-- t2 test eax, eax je Ltan_piby4_transform_false addsd xmm5, xmm6 ; xmm5 <-- t = t1 + t2 = x + t2 test r8d, r8d je Ltan_piby4_transform_true_recip_false ; Here transform and recip are both true. ; return transform*(2*t/(t-1) - 1.0); movaps xmm0, xmm5 subsd xmm5, QWORD PTR L_one movd xmm1, eax addsd xmm0, xmm0 divsd xmm0, xmm5 cvtdq2pd xmm1, xmm1 subsd xmm0, QWORD PTR L_one mulsd xmm0, xmm1 RestoreXmm xmm6, tan_piby4_save_xmm6 StackDeallocate tan_piby4_stack_size ret 0 Ltan_piby4_transform_true_recip_false: ; Here return transform*(1.0 - 2*t/(1+t)); movsd xmm0, QWORD PTR L_one movaps xmm1, xmm5 addsd xmm5, xmm0 addsd xmm1, xmm1 divsd xmm1, xmm5 subsd xmm0, xmm1 movd xmm1, eax cvtdq2pd xmm1, xmm1 mulsd xmm0, xmm1 RestoreXmm xmm6, tan_piby4_save_xmm6 StackDeallocate tan_piby4_stack_size ret 0 Ltan_piby4_transform_false: test r8d, r8d je Ltan_piby4_atransform_false_recip_false ; Here transform is false but recip is true ; We return an accurate computation of -1.0/(t1 + t2). movsd xmm4, QWORD PTR L_n_one movaps xmm0, xmm5 mov rcx, -4294967296 ; ffffffff00000000H addsd xmm0, xmm6 movd rax, xmm0 ; really movq divsd xmm4, xmm0 and rax, rcx movd xmm3, rax ; really movq movaps xmm1, xmm3 subsd xmm1, xmm6 movd rax, xmm4 ; really movq subsd xmm5, xmm1 and rax, rcx movd xmm2, rax ; really movq ; return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2); movaps xmm0, xmm2 mulsd xmm5, xmm2 mulsd xmm0, xmm3 addsd xmm0, QWORD PTR L_one addsd xmm0, xmm5 mulsd xmm0, xmm4 addsd xmm0, xmm2 RestoreXmm xmm6, tan_piby4_save_xmm6 StackDeallocate tan_piby4_stack_size ret 0 Ltan_piby4_atransform_false_recip_false: ; Here both transform and recip are false; we just return t1 + t2 addsd xmm5, xmm6 movaps xmm0, xmm5 RestoreXmm xmm6, tan_piby4_save_xmm6 StackDeallocate tan_piby4_stack_size ret 0 _tan_piby4 endp END