reactos/sdk/lib/crt/math/libm_sse2/tan.asm
2022-12-01 15:21:59 +02:00

763 lines
26 KiB
NASM

;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the tan function.
;
; Prototype:
;
; double tan(double x);
;
; Computes tan(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
; If FMA3 hardware is present, it will be used for the calculation.
;
.const
ALIGN 16
L_signbit DQ 08000000000000000h
DQ 08000000000000000h ; duplicate for pd
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
DQ 07FFFFFFFFFFFFFFFh ; duplicate for pd
L_int_one DQ 00000000000000001h
DQ 00000000000000001h ; duplicate for pd
L_twobypi DQ 03FE45F306DC9C883h
DQ 03FE45F306DC9C883h ; duplicate for pd
L_point_333 DQ 03FD5555555555555h; 1/3
DQ 03FD5555555555555h ; duplicate for pd
L_tan_p0 DQ 03FD7D50F6638564Ah ; 0.372379159759792203640806338901e0
DQ 03FD7D50F6638564Ah ; duplicate for pd
L_tan_p2 DQ 0BF977C24C7569ABBh ; -0.229345080057565662883358588111e-1
DQ 0BF977C24C7569ABBh ; duplicate for pd
L_tan_p4 DQ 03F2D5DAF289C385Ah ; 0.224044448537022097264602535574e-3
DQ 03F2D5DAF289C385Ah ; duplicate for pd
L_tan_q0 DQ 03FF1DFCB8CAA40B8h ; 0.111713747927937668539901657944e1
DQ 03FF1DFCB8CAA40B8h ; duplicate for pd
L_tan_q2 DQ 0BFE08046499EB90Fh ; -0.515658515729031149329237816945e0
DQ 0BFE08046499EB90Fh ; duplicate for pd
L_tan_q4 DQ 03F9AB0F4F80A0ACFh ; 0.260656620398645407524064091208e-1
DQ 03F9AB0F4F80A0ACFh ; duplicate for pd
L_tan_q6 DQ 0BF2E7517EF6D98F8h ; -0.232371494088563558304549252913e-3
DQ 0BF2E7517EF6D98F8h ; duplicate for pd
L_half_mask DQ 0ffffffff00000000h
DQ 0ffffffff00000000h ; duplicate for pd
L_piby4_lead DQ 03FE921FB54442D18h ; pi/4, high part
DQ 03FE921FB54442D18h ; duplicate for pd
L_piby4_tail DQ 03C81A62633145C06h ; pi/4, low parft
DQ 03C81A62633145C06h ; duplicate for pd
; Different parts of argument reduction need different versions of pi/2
L_piby2_1 DQ 03FF921FB54400000h ; pi/2, high 33 bits
L_piby2_1tail DQ 03DD0B4611A626331h ; pi/2, second 53 bits, overlaps...
L_piby2_2 DQ 03DD0B4611A600000h ; pi/2, second 33 bits
L_piby2_2tail DQ 03BA3198A2E037073h ; pi/2, third 53 bits, overlaps...
L_piby2_3 DQ 03BA3198A2E000000h ; pi/2, third 33 bits
L_piby2_3tail DQ 0397B839A252049C1h ; pi/2, fourth 53 bits
; end of pi/2 versions
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
L_inf_mask_64 DQ 07FF0000000000000h
L_point_five DQ 03FE0000000000000h
L_point_68 DQ 03FE5C28F5C28F5C3h ; .68
L_n_point_68 DQ 0BFE5C28F5C28F5C3h ; -.68
L_zero DQ -0000000000000000h ; 0.0
L_one DQ 03FF0000000000000h ; 1.0
L_n_one DQ 0BFF0000000000000h ; -1.0
L_two DQ 04000000000000000h ; 2.0
L_moderate_arg_cw DQ 0411E848000000000h ; 5.e5
L_moderate_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
fname TEXTEQU <tan>
fname_special TEXTEQU <_tan_special>
; local storage offsets
save_xmm6 EQU 020h
save_xmm7 EQU 030h
store_input EQU 040h
save_r10 EQU 050h
dummy_space EQU 060h
stack_size EQU 088h
include fm.inc
EXTERN __use_fma3_lib:DWORD
EXTERN fname_special : PROC
EXTERN __remainder_piby2_fma3 : PROC
EXTERN __remainder_piby2_fma3_bdl : PROC
EXTERN __remainder_piby2_forAsm : PROC
EXTERN _set_statfp : PROC
.code
ALIGN 16
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
SaveXmm xmm6, save_xmm6
SaveXmm xmm7, save_xmm7
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Ltan_fma3
Ltan_sse2:
movd rdx, xmm0 ; really movq
movaps xmm6, xmm0
mov rcx, rdx
btr rcx, 63 ; rcx <-- |x|
cmp rcx, L_piby4_lead
ja Ltan_abs_x_nle_pio4 ; branch if > pi/4 or NaN
cmp rcx, L_two_to_neg_13
jae Ltan_abs_x_ge_two_to_neg_13
cmp rcx, L_two_to_neg_27
jae Labs_x_ge_two_to_neg_27
; At this point tan(x) ~= x; if it's not exact, set the inexact flag
test rcx, rcx
je Ltan_return
mov ecx, 20h ; ecx <-- AMD_F_INEXACT
call _set_statfp
movaps xmm0, xmm6 ; may be redundant, but xmm0 <-- x
RestoreXmm xmm7, save_xmm7
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret 0
Labs_x_ge_two_to_neg_27:
mulsd xmm0, xmm0
mulsd xmm0, xmm6
mulsd xmm0, QWORD PTR L_point_333
addsd xmm0, xmm6
RestoreXmm xmm7, save_xmm7
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret 0
Ltan_abs_x_ge_two_to_neg_13:
xorps xmm1, xmm1 ; xmm1 <-- xx = 0
xor r8d, r8d ; r8 <-- recip flag = 0
call _tan_piby4
Ltan_return:
RestoreXmm xmm7, save_xmm7
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret 0
Ltan_abs_x_nle_pio4:
cmp rcx, L_inf_mask_64 ; |x| uint >= +inf as uint ?
jnae Ltan_x_is_finite
call fname_special
RestoreXmm xmm7, save_xmm7
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Ltan_x_is_finite:
xor r8d, r8d
xor r10, r10
cmp rcx, rdx
setne r10b ; r10 <-- x was negative flag
andpd xmm6, L_sign_mask
movsd xmm0, QWORD PTR L_moderate_arg_cw ; currently 5e5
comisd xmm0, xmm6
jbe Ltan_x_is_very_large
Ltan_x_is_moderate: ; unused label
; For these arguments we do a Cody-Waite reduction, subtracting the
; appropriate multiple of pi/2, using extra precision where x is close
; to an exact multiple of pi/2
; We special-case region setting for |x| <= 9pi/4
; It seems strange that this speeds things up, but it does
mov rdx, rcx
mov rax, 4616025215990052958 ; 400f6a7a2955385eH (5pi/4)
shr rdx, 52 ; rdx <-- xexp
cmp rcx, rax
ja Labs_x_gt_5pio4
mov rax, 4612488097114038738 ; 4002d97c7f3321d2H (3pi/4)
cmp rcx, rax
seta r8b
inc r8d ; r8d <-- region (1 or 2)
jmp Lhave_region
Labs_x_gt_5pio4:
mov rax, 4619644535898419899 ; 401c463abeccb2bbH (9pi/4)
cmp rcx, rax
ja Lneed_region_computation
mov rax, 4617875976460412789 ; 4015fdbbe9bba775H (7pi/4)
cmp rcx, rax
seta r8b
add r8d, 3 ; r8d <-- region (3 or 4)
jmp Lhave_region
ALIGN 16
Lneed_region_computation:
movaps xmm0, xmm6
mulsd xmm0, QWORD PTR L_twobypi
addsd xmm0, QWORD PTR L_point_five
cvttsd2si r8d, xmm0 ; r8d <-- region
Lhave_region:
movd xmm3, r8d
cvtdq2pd xmm3, xmm3
movaps xmm2, xmm3
movaps xmm0, xmm3
mulsd xmm0, QWORD PTR L_piby2_1
mulsd xmm2, QWORD PTR L_piby2_1tail ; xmm2 < rtail = npi2 * piby2_1tail
subsd xmm6, xmm0 ; xmm6 <-- rhead = x - npi2*piby2_1
; If x is not too close to multiple of pi/2,
; we're essentially done with reduction
; If the exponent of rhead is not close to that of x,
; then most of x has been subtracted away in computing rhead;
; i.e., x is close to a multiple of pi/2.
movd rax, xmm6
shr rax, 52
and eax, 2047
sub rdx, rax ; rdx <-- exp diff of x vs rhead
cmp rdx, 15
jbe Ltan_have_rhead_rtail
; Oops, x is almost a multiple of pi/2. Compute more bits of reduced x
; t = rhead;
; rtail = npi2 * piby2_2;
; rhead = t - rtail;
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
movaps xmm1, xmm6
movaps xmm0, xmm3
movaps xmm2, xmm3
mulsd xmm0, QWORD PTR L_piby2_2
mulsd xmm2, QWORD PTR L_piby2_2tail
subsd xmm6, xmm0
subsd xmm1, xmm6
subsd xmm1, xmm0
subsd xmm2, xmm1
cmp rdx, 48
jbe Ltan_have_rhead_rtail ; We've done enough
; Wow, x is REALLY close to a multiple of pi/2. Compute more bits.
; t = rhead;
; rtail = npi2 * piby2_3;
; rhead = t - rtail;
; rtail = npi2 * piby2_3tail - ((t - rhead) - rtail);
movaps xmm1, xmm6
movaps xmm0, xmm3
movaps xmm2, xmm3
mulsd xmm0, QWORD PTR L_piby2_3
mulsd xmm2, QWORD PTR L_piby2_3tail
subsd xmm6, xmm0 ; xmm6 <-- rhead = t - rtail
subsd xmm1, xmm6 ; xmm1 <-- t - rhead
subsd xmm1, xmm0 ; xmm1 <-- ((t - rhead) - rtail)
subsd xmm2, xmm1 ; xmm2 <-- final rtail
Ltan_have_rhead_rtail:
; At this point xmm6 has a suitable rhead, xmm2 a suitable rtail
movaps xmm0, xmm6 ; xmm0 <-- copy of rhead
; r = rhead - rtail
; rr = (rhead - r) - rtail;
; region = npi2 & 3;
and r8d, 3 ; r8d <-- region
subsd xmm0, xmm2 ; xmm0 <-- r = rhead - rtail
subsd xmm6, xmm0 ; xmm6 <-- rhead - r
subsd xmm6, xmm2 ; xmm6 <-- rr = (rhead - r) - rtail
Ltan_do_tan_computation:
and r8d, 1 ; r8d <-- region & 1
movaps xmm1, xmm6
call _tan_piby4
test r10d, r10d
je Ltan_pos_return
xorpd xmm0, QWORD PTR L_signbit
Ltan_pos_return:
RestoreXmm xmm7, save_xmm7
RestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret 0
ALIGN 16
Ltan_x_is_very_large:
; Reduce x into range [-pi/4,pi/4] (general case)
movaps xmm0, xmm6
mov QWORD PTR [rsp+save_r10], r10
call __remainder_piby2_forAsm ; this call clobbers r10
mov r10, QWORD PTR [rsp+save_r10]
movapd xmm6,xmm1 ; xmm6 <-- rr
mov r8d,eax ; r8d <-- region
jmp Ltan_do_tan_computation
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; From here on, it is assumed that the hardware supports FMA3 (and AVX).
ALIGN 16
Ltan_fma3:
vmovq r9,xmm0
mov rdx,r9 ; rdx <-- x
btr r9,63 ; r9 <-- |x|
cmp r9,L_piby4_lead
jae Ltan_fma3_absx_gt_pio4 ; Note that NaN will branch
Ltan_fma3_absx_le_pio4:
; no argument reduction is needed, so recip is 0, xx is 0.
; Note that this routine is not special-casing very small |x|
vmovsd xmm5,L_piby4_lead
vmovsd xmm6,L_piby4_tail
vxorpd xmm1,xmm1,xmm1 ; xx <-- 0.
vxorpd xmm7,xmm7,xmm7 ; transform <-- 0
comisd xmm0,L_point_68
jbe Ltan_fma3_small_x_le_point_68
Ltan_fma3_x_small_gt_point_68:
vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0
vsubsd xmm0,xmm5,xmm0 ; x = piby4_lead - x
vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail
jmp Ltan_fma3_compute_Remez_for_small_x
ALIGN 16
Ltan_fma3_small_x_le_point_68:
comisd xmm0,L_n_point_68
jae Ltan_fma3_compute_Remez_for_small_x
Ltan_fma3_small_x_lt_neg_point_68:
vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0
vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x
vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail
Ltan_fma3_compute_Remez_for_small_x:
; At this point xmm0 holds x, possibly transformed
; now do core Remez rational approximation for x in [0,0.68]
vmovsd xmm4,L_tan_q6
vmovsd xmm3,L_tan_p4
vmulsd xmm2,xmm0,xmm0 ; xx is 0, so xmm2 <-- r = x*x
vfmadd213sd xmm4,xmm2,L_tan_q4
vfmadd213sd xmm3,xmm2,L_tan_p2
vfmadd213sd xmm4,xmm2,L_tan_q2
vfmadd213sd xmm3,xmm2,L_tan_p0 ; xmm3 <-- p2 (polynomial)
vfmadd213sd xmm4,xmm2,L_tan_q0 ; xmm4 <-- q3 (polynomial)
vdivsd xmm3,xmm3,xmm4 ; xmm3 <-- r3 = p2/q3
vmulsd xmm3,xmm3,xmm2 ; xmm3 <-- r * r3
vfmadd132sd xmm0,xmm0,xmm3 ; xx = 0, so xmm0 <-- t = x + x*(r*r3)
comisd xmm7,L_zero ; did we transform x?
; if x was transformed, we need to transform t to get answer;
; if not, the answer is just t.
je Ltan_fma3_ext_piby4_zero
; x was transformed, so answer is +- (1. - 2.*t/(1.+t))
; (remember recip is 0 here)
vmovsd xmm3,L_one
vaddsd xmm4,xmm0,L_one ; xmm4 <-- 1. + t
vdivsd xmm6,xmm0,xmm4 ; xmm6 <-- t / (1.+t)
vfnmadd231sd xmm3,xmm6,L_two ; xmm3 <-- 1. - 2.*t/(1.+t)
vmulsd xmm0,xmm3,xmm7 ; multiply by +- 1.
Ltan_fma3_ext_piby4_zero:
; restore volatile registers
AVXRestoreXmm xmm7, save_xmm7
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret 0
ALIGN 16
Ltan_fma3_absx_gt_pio4: ;;; come here if |x| > pi/4
cmp r9, L_inf_mask_64
jae Ltan_fma3_naninf
;Ltan_fma3_range_reduce:
vmovapd [store_input + rsp],xmm0 ; save copy of x
vmovq xmm0,r9 ; xmm0l <-- |x|
cmp r9,L_moderate_arg_bdl
jge Ltan_fma3_remainder_piby2 ; go elsewhere if |x| > 500000.
; Note that __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
; have calling conventions that differ from the C routine
; on input
; |x| is in xmm0
; on output
; z is in xmm0
; zz is in xmm1
; where z + zz = arg reduced |x| and zz is small compared to z
; region of |x| is in rax
Ltan_fma3_remainder_piby2_small:
; Boldo-Daumas-Li reduction for reasonably small |x|
call __remainder_piby2_fma3_bdl
Ltan_fma3_full_computation:
; we have done argument reduction; recip and xx may be nonzero
; x is in xmm0, xx is in xmm1
; recip is region & 1, and region is in rax.
vmovsd xmm5,L_piby4_lead
vmovsd xmm6,L_piby4_tail
vxorpd xmm7,xmm7,xmm7 ; transform <-- 0
vcomisd xmm0,L_point_68
jbe Ltan_fma3_full_x_le_point_68
Ltan_fma3_full_x_gt_point_68:
vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0
vsubsd xmm0,xmm5,xmm0 ; xmm0 <-- x = piby4_lead - x
vsubsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = pibi4_tail - xx
vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl
vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0
jmp Ltan_fma3_compute_Remez
ALIGN 16
Ltan_fma3_full_x_le_point_68:
vcomisd xmm0,L_n_point_68
jae Ltan_fma3_compute_Remez
Ltan_fma3_full_x_lt_neg_point_68:
vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0
vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x
vaddsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = piby4_tail + xx
vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl
vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0
Ltan_fma3_compute_Remez:
vmulsd xmm2,xmm0,xmm0 ; xmm2 <-- x*x
vmulsd xmm5,xmm1,xmm0 ; xmm5 <-- x*xx
vfmadd132sd xmm5,xmm2,L_two ; xmm5 <-- r = x*x + 2.*x*xx
vmovsd xmm2,L_tan_p4
vfmadd213sd xmm2,xmm5,L_tan_p2 ; xmm2 <-- p4*r+p2
vfmadd213sd xmm2,xmm5,L_tan_p0 ; xmm2 <-- p = (p4*r+p2)*r+p0
vmovsd xmm4,L_tan_q6
vfmadd213sd xmm4,xmm5,L_tan_q4 ; xmm4 <-- q6*r+q4
vfmadd213sd xmm4,xmm5,L_tan_q2 ; xmm4 <-- (q6*r+q4)*r+q2
vfmadd213sd xmm4,xmm5,L_tan_q0 ; xmm4 <-- q = ((q6*r+q4)*r+q2)*r+q0
vdivsd xmm2,xmm2,xmm4 ; xmm2 <-- p/q
vmulsd xmm2,xmm2,xmm5 ; xmm2 <-- r*p/q
vfmadd213sd xmm2,xmm0,xmm1 ; xmm2 <-- t2 = xx + x*r*(p/q)
vaddsd xmm1,xmm0,xmm2 ; xmm1 <-- t = (t1=x) + t2
; If |x| > .68 we transformed, and t is an approximation of
; tan(pi/4 +- (x+xx))
; otherwise, t is just tan(x+xx)
vxorpd xmm6,xmm6,xmm6
vcomisd xmm7,xmm6 ; did we transform? (|x| > .68) ?
jz Ltan_fma3_if_recip_set ; if not, go check recip
Ltan_fma3_if_transfor_set:
; Because we transformed x+xx, we have to transform t before returning
; let transform be 1 for x > .68, -1 for x < -.68, then we return
; transform * (recip ? (2.*t/(t-1.) - 1.) : (1. - 2.*t/(1.+t)))
vaddsd xmm6,xmm1,xmm1 ; xmm6 <-- 2.*t
vmovsd xmm4,L_one
vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1
vsubsd xmm5,xmm1,xmm4 ; xmm5 <-- t-1
bt rax,0
jc Ltan_fma3_transform_and_recip_set
; here recip is not set
vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1
vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t+1)
vfnmadd132sd xmm2,xmm4,L_two ; xmm2 <-- 1 - 2*t/(t+1)
vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(1 - 2*t/(t+1))
jmp Ltan_fma3_exit_piby4
ALIGN 16
Ltan_fma3_transform_and_recip_set:
; here recip is set
vsubsd xmm2,xmm1,xmm4 ; xmm2 <-- t-1
vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t-1)
vfmsub132sd xmm2,xmm4,L_two ; xmm2 <-- 2*t/(t-1) - 1
vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(2*t/(t-1) - 1)
jmp Ltan_fma3_exit_piby4
ALIGN 16
Ltan_fma3_if_recip_set:
; Here we did not transform x and xx, but if we are in an odd quadrant
; we will need to return -1./(t1+t2), computed accurately
; (t=t1 is in xmm1, t2 is in xmm2)
bt rax,0
jnc Ltan_fma3_exit_piby4
vandpd xmm7,xmm1,L_half_mask ; xmm7 <-- z1 = high bits of t
vsubsd xmm4,xmm7,xmm0 ; xmm4 <-- z1 - t1
vsubsd xmm4,xmm2,xmm4 ; xmm4 <-- z2 = t2 - (z1-t1)
vmovsd xmm2,L_n_one
vdivsd xmm2,xmm2,xmm1 ; xmm2 <-- trec = -1./t
vandpd xmm5,xmm2,L_half_mask ; xmm5 <-- trec_top=high bits of trec
vfmadd213sd xmm7,xmm5,L_one ; xmm7 <-- trec_top*z1 + 1.
vfmadd231sd xmm7 ,xmm4,xmm5 ; xmm7 <-- z2*trec_top + (trec_top*z1 + 1.)
vfmadd213sd xmm7,xmm2,xmm5 ; xmm7 <-- u = trec_top + trec*(z2*trec_top + (trec_top*z1+1.))
vmovapd xmm1,xmm7 ; xmm1 <-- u
Ltan_fma3_exit_piby4:
vmovapd xmm0,xmm1 ; xmm0 <-- t, u, or v, as needed
vmovapd xmm1,[store_input + rsp]
vandpd xmm1,xmm1,L_signbit
vxorpd xmm0,xmm0,xmm1 ; tan(-x) = -tan(x)
; restore volatile registers
AVXRestoreXmm xmm7, save_xmm7
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
ALIGN 16
Ltan_fma3_remainder_piby2:
; argument reduction for general x
call __remainder_piby2_fma3
jmp Ltan_fma3_full_computation
Ltan_fma3_naninf: ; here argument is +-Inf or NaN. Special case.
call fname_special
AVXRestoreXmm xmm7, save_xmm7
AVXRestoreXmm xmm6, save_xmm6
StackDeallocate stack_size
ret
fname endp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.const
tan_piby4_save_xmm6 EQU 030h
tan_piby4_stack_size EQU 048h
.code
ALIGN 16
_tan_piby4 PROC PRIVATE FRAME
StackAllocate tan_piby4_stack_size
SaveXmm xmm6, tan_piby4_save_xmm6
.ENDPROLOG
; Compute tangent for x+xx in [-pi/4,pi/4].
; xmm0 has x
; xmm1 has xx
; r8d has recip. If recip is true, return -1/tan(x+xx) else tan(x+xx)
xor eax, eax
comisd xmm0, QWORD PTR L_point_68
movaps xmm3, xmm1
movaps xmm6, xmm0
jbe Ltan_piby4_x_le_point_68
; Here x > .68, so we transform x using the identity
; tan(pi/4-x) = (1-tan(x))/(1+tan(x))
movsd xmm2, QWORD PTR L_piby4_lead
mov eax, 1 ; eax <-- transform = 1
subsd xmm2, xmm0 ; xmm2 <-- x = piby4_lead - x
movsd xmm0, QWORD PTR L_piby4_tail
subsd xmm0, xmm1 ; xmm0 <-- xl = piby4_tail - xx
movaps xmm6, xmm2
addsd xmm6, xmm0 ; xmm6 <-- x = x + xl
xorps xmm3,xmm3 ; xmm3 <-- xx = 0.
jmp Ltan_piby4_do_remez
Ltan_piby4_x_le_point_68:
; 43 : else if (x < -0.68)
movsd xmm0, QWORD PTR L_n_point_68
comisd xmm0, xmm6
jbe Ltan_piby4_do_remez ; jump if x >= -.68
; Here x < -.68, so we transform x using the identity
; tan(x-pi/4) = (tan(x)-1)/(tan(x)+1)
addsd xmm6, QWORD PTR L_piby4_lead ; xmm6 <-- x = piby4_lead + x
addsd xmm3, QWORD PTR L_piby4_tail ; xmm3 <-- xl = piby4_tail + xx
or eax, -1 ; eax <-- transform = -1
addsd xmm6, xmm3 ; xmm6 <-- x = x + xl
xorps xmm3, xmm3 ; xmm3 <-- xx = 0
Ltan_piby4_do_remez:
; Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
movaps xmm0, xmm6
movaps xmm2, xmm6;
; An implementation of the tan function.
;
; Prototype:
;
; double tan(double x);
;
; Computes tan(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
;
mulsd xmm0, xmm6 ; xmm0 <-- x*x
addsd xmm2, xmm2 ; xmm2 <-- 2*x
mulsd xmm2, xmm3 ; xmm2 <-- 2*x*xx
addsd xmm2, xmm0 ; xmm2 <-- r = x*x + 2*x*xx
; Magic Remez approximation
movaps xmm0, xmm2
movaps xmm5, xmm2
movaps xmm1, xmm2
mulsd xmm5, QWORD PTR L_tan_p4
mulsd xmm1, QWORD PTR L_tan_q6
mulsd xmm0, xmm6
addsd xmm5, QWORD PTR L_tan_p2
mulsd xmm5, xmm2
addsd xmm5, QWORD PTR L_tan_p0
mulsd xmm5, xmm0
movsd xmm0, QWORD PTR L_tan_q4
addsd xmm0, xmm1
mulsd xmm0, xmm2
addsd xmm0, QWORD PTR L_tan_q2
mulsd xmm0, xmm2
addsd xmm0, QWORD PTR L_tan_q0
divsd xmm5, xmm0
addsd xmm5, xmm3 ; xmm5 <-- t2
test eax, eax
je Ltan_piby4_transform_false
addsd xmm5, xmm6 ; xmm5 <-- t = t1 + t2 = x + t2
test r8d, r8d
je Ltan_piby4_transform_true_recip_false
; Here transform and recip are both true.
; return transform*(2*t/(t-1) - 1.0);
movaps xmm0, xmm5
subsd xmm5, QWORD PTR L_one
movd xmm1, eax
addsd xmm0, xmm0
divsd xmm0, xmm5
cvtdq2pd xmm1, xmm1
subsd xmm0, QWORD PTR L_one
mulsd xmm0, xmm1
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
Ltan_piby4_transform_true_recip_false:
; Here return transform*(1.0 - 2*t/(1+t));
movsd xmm0, QWORD PTR L_one
movaps xmm1, xmm5
addsd xmm5, xmm0
addsd xmm1, xmm1
divsd xmm1, xmm5
subsd xmm0, xmm1
movd xmm1, eax
cvtdq2pd xmm1, xmm1
mulsd xmm0, xmm1
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
Ltan_piby4_transform_false:
test r8d, r8d
je Ltan_piby4_atransform_false_recip_false
; Here transform is false but recip is true
; We return an accurate computation of -1.0/(t1 + t2).
movsd xmm4, QWORD PTR L_n_one
movaps xmm0, xmm5
mov rcx, -4294967296 ; ffffffff00000000H
addsd xmm0, xmm6
movd rax, xmm0 ; really movq
divsd xmm4, xmm0
and rax, rcx
movd xmm3, rax ; really movq
movaps xmm1, xmm3
subsd xmm1, xmm6
movd rax, xmm4 ; really movq
subsd xmm5, xmm1
and rax, rcx
movd xmm2, rax ; really movq
; return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2);
movaps xmm0, xmm2
mulsd xmm5, xmm2
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR L_one
addsd xmm0, xmm5
mulsd xmm0, xmm4
addsd xmm0, xmm2
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
Ltan_piby4_atransform_false_recip_false:
; Here both transform and recip are false; we just return t1 + t2
addsd xmm5, xmm6
movaps xmm0, xmm5
RestoreXmm xmm6, tan_piby4_save_xmm6
StackDeallocate tan_piby4_stack_size
ret 0
_tan_piby4 endp
END