mirror of
https://github.com/reactos/reactos.git
synced 2024-12-28 18:15:11 +00:00
551 lines
19 KiB
NASM
551 lines
19 KiB
NASM
;
|
|
;
|
|
; MIT License
|
|
; -----------
|
|
;
|
|
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
|
;
|
|
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
; of this Software and associated documentation files (the "Software"), to deal
|
|
; in the Software without restriction, including without limitation the rights
|
|
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
; copies of the Software, and to permit persons to whom the Software is
|
|
; furnished to do so, subject to the following conditions:
|
|
;
|
|
; The above copyright notice and this permission notice shall be included in
|
|
; all copies or substantial portions of the Software.
|
|
;
|
|
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
; THE SOFTWARE.
|
|
;
|
|
; An implementation of the tanf function using the fma3 instruction.
|
|
;
|
|
; Prototype:
|
|
;
|
|
; float tanf(float x);
|
|
;
|
|
; Computes tanf(x).
|
|
; It will provide proper C99 return values,
|
|
; but may not raise floating point status bits properly.
|
|
; Based on the NAG C implementation.
|
|
;
|
|
.const

; Read-only constants for tanf.  The ORDER below is significant:
;   * the first five pairs are used as 128-bit memory operands of
;     andpd/movapd and therefore rely on the ALIGN 16 that precedes them;
;   * L_tanf is addressed by byte offset (L_tanf+008h ... L_tanf+020h).

ALIGN 16
L_sign_mask     DQ 07FFFFFFFFFFFFFFFh       ; all bits except the sign bit (used to form |x|)
                DQ 07FFFFFFFFFFFFFFFh
L_twobypi       DQ 03FE45F306DC9C883h       ; 2/pi
                DQ 03FE45F306DC9C883h
L_int_three     DQ 00000000000000003h       ; mask: region number modulo 4
                DQ 00000000000000003h
L_int_one       DQ 00000000000000001h       ; mask: region parity (odd/even)
                DQ 00000000000000001h
L_signbit       DQ 08000000000000000h       ; sign bit only
                DQ 08000000000000000h

; Coefficients of the rational approximation
;   tan(x) ~= x + x^3 * (c0 + c1*x^2) / (d0 + d1*x^2 + d2*x^4)
L_tanf          DQ 03FD8A8B0DA56CB17h       ; +000h  c0
                DQ 0BF919DBA6EFD6AADh       ; +008h  c1
                DQ 03FF27E84A3E73A2Eh       ; +010h  d0
                DQ 0BFE07266D7B3511Bh       ; +018h  d1
                DQ 03F92E29003C692D9h       ; +020h  d2

L_large_x_sse2  DQ 04160000000000000h       ; 8388608. (2^23): SSE2 path switches to the 2/pi bit table here
L_large_x_fma3  DQ 041E921FB40000000h       ; 3.373259264e9: same threshold for the FMA3 path
L_point_333     DQ 03FD5555555555555h       ; 1/3
L_mask_3e4      DQ 03e40000000000000h       ; 2^-27 = 7.4505805969238281e-009
L_mask_3f2      DQ 03f20000000000000h       ; 2^-13 = 0.0001220703125
L_point_five    DQ 03FE0000000000000h       ; 0.5
L_piby2_1       DQ 03FF921FB54400000h       ; leading part of pi/2 (low mantissa bits zero)
L_piby2_1tail   DQ 03DD0B4611A626331h       ; pi/2 - L_piby2_1
L_piby2_lead    DQ 03ff921fb54442d18h       ; pi/2 rounded to double
L_n_one         DQ 0BFF0000000000000h       ; -1.0
L_piby4         DQ 03fe921fb54442d18h       ; pi/4
L_min_norm      DQ 00010000000000000h       ; smallest positive normal double

L_inf_mask_32   DD 07F800000h               ; single-precision exponent field (all ones = Inf/NaN)
                DD 07F800000h
|
|
|
|
; Data defined in other modules of the math library.
EXTRN           __use_fma3_lib:DWORD        ; tanf takes its FMA3 path when this is nonzero
EXTRN           __L_2_by_pi_bits:BYTE       ; table read byte-wise by the large-argument reduction

; Symbolic names so the body below can be written in terms of "fname".
fname           TEXTEQU <tanf>
fname_special   TEXTEQU <_tanf_special>

; Stack frame sizes.
; tanf keeps no locals of its own; the frame exists only so that the call to
; _tanf_special has room to work with.
; NOTE(review): dummy_space is not referenced in this file's visible code;
; presumably it names the 20h-byte area reserved for the callee -- confirm.
dummy_space     EQU 20h
stack_size      EQU 38h                     ; passed to StackAllocate / StackDeallocate

include fm.inc                              ; supplies the StackAllocate / StackDeallocate macros

; Helper invoked by tanf when the input is NaN or infinity.
EXTERN          fname_special : PROC
|
|
|
|
.code

;-----------------------------------------------------------------------
; float tanf(float x)
;
; Syntax: MASM (ml64), x86-64.  Uses PROC FRAME / .ENDPROLOG for unwind data.
; In:     xmm0 = x (single precision)
; Out:    xmm0 = tanf(x) (single precision)
; Clobb:  rax, rcx, rdx, r8, r9, r10, r11, xmm0-xmm5, flags
; Stack:  stack_size bytes via StackAllocate; one outgoing call
;         (fname_special) on the NaN/Inf path only.
;
; Two functionally parallel implementations are selected at run time:
;   __use_fma3_lib == 0  -> Ltanf_sse2  (legacy SSE2 encodings ONLY)
;   __use_fma3_lib != 0  -> Ltanf_fma3  (VEX / FMA3 encodings)
;
; Algorithm (both paths), with X = (double)x:
;   exponent all ones          -> fname_special (NaN / infinity)
;   |X| <  2^-27               -> return x unchanged
;   2^-27 <= |X| < 2^-13       -> X + X^3/3
;   2^-13 <= |X| <= pi/4       -> X + X^3 * c(X^2)/d(X^2)
;   |X| >  pi/4                -> reduce |X| to r and a region number:
;        |X| <  L_large_x_*    -> r = |X| - n*pi/2 using a two-part pi/2
;        |X| >= L_large_x_*    -> multiply the mantissa by bits of 2/pi
;                                 taken from __L_2_by_pi_bits
;      t = r + r^3 * c(r^2)/d(r^2); if the region is odd, t = -1/t;
;      finally the sign of x is applied.
;
; Register roles that stay live across the whole function:
;   xmm5 = X (double, keeps the original sign)
;   xmm4 = region number (low dword) once range reduction has run
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Ltanf_fma3

;=======================================================================
; SSE2 path.  Must not contain any VEX-encoded (v-prefixed) instruction.
;=======================================================================
Ltanf_sse2:
    movd        eax,xmm0
    mov         r8d,L_inf_mask_32
    and         eax,r8d
    cmp         eax,r8d
    jz          Ltanf_sse2_naninf           ; exponent field all ones: NaN or Inf

    cvtss2sd    xmm5,xmm0                   ; xmm5 <-- X = (double)x
    movd        r9,xmm5
    btr         r9,63                       ; r9 <-- |X| (bit pattern)

    ; The bit patterns of non-negative doubles order like integers, so the
    ; magnitude tests below are plain integer compares.
    cmp         r9,L_piby4
    jg          Ltanf_sse2_range_reduce
    cmp         r9,L_mask_3f2               ; compare to 2^-13 = 0.0001220703125
    jge         Ltanf_sse2_compute_tanf_piby_4
    cmp         r9,L_mask_3e4               ; compare to 2^-27 = 7.4505805969238281e-009
    jge         Ltanf_sse2_compute_x_xxx_0_333

    ; At this point tan(x) ~= x; if it's not exact, set the inexact flag.
    test        r9,r9
    je          Ltanf_sse2_exact_return     ; x == +/-0: result is exact
    movsd       xmm1,L_n_one
    addsd       xmm1,L_min_norm             ; -1.0 + tiny: result discarded, only raises inexact

Ltanf_sse2_exact_return:
    ; xmm0 still holds the caller's x, which is the return value.
    StackDeallocate stack_size
    ret

ALIGN 16
Ltanf_sse2_compute_x_xxx_0_333:
    movapd      xmm2,xmm5
    mulsd       xmm2,xmm2                   ; xmm2 <-- x^2
    movapd      xmm0,xmm2
    mulsd       xmm0,xmm5                   ; xmm0 <-- x^3
    mulsd       xmm0,L_point_333
    addsd       xmm0,xmm5                   ; x + x*x*x*0.3333333333333333
    jmp         Ltanf_sse2_return_s

ALIGN 16
Ltanf_sse2_compute_tanf_piby_4:
    movapd      xmm0,xmm5                   ; xmm0 <-- x (as double)

    movapd      xmm1,xmm0
    mulsd       xmm1,xmm0                   ; xmm1 <-- x*x

    movsd       xmm3,L_tanf+008h            ; xmm3 <-- c1
    mulsd       xmm3,xmm1                   ; xmm3 <-- c1*x^2
    addsd       xmm3,L_tanf                 ; xmm3 <-- c = c1*x^2 + c0

    movsd       xmm2,L_tanf+020h            ; xmm2 <-- d2
    mulsd       xmm2,xmm1                   ; xmm2 <-- d2*x^2
    addsd       xmm2,L_tanf+018h            ; xmm2 <-- d2*x^2 + d1
    mulsd       xmm2,xmm1                   ; xmm2 <-- (d2*x^2 + d1)*x^2
    addsd       xmm2,L_tanf+010h            ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    divsd       xmm3,xmm2                   ; xmm3 <-- c/d
    mulsd       xmm1,xmm0                   ; xmm1 <-- x^3
    mulsd       xmm1,xmm3                   ; xmm1 <-- x^3 * c/d
    addsd       xmm0,xmm1                   ; xmm0 <-- x + x^3 * c/d
    jmp         Ltanf_sse2_return_s

Ltanf_sse2_range_reduce:
    movd        xmm0,r9                     ; xmm0 <-- |X|
    cmp         r9,L_large_x_sse2
    jge         Ltanf_sse2_tanf_reduce_large

Ltanf_sse2_tanf_reduce_moderate:
    movapd      xmm1,xmm0
    andpd       xmm1,L_sign_mask
    movapd      xmm2,L_twobypi
    mulsd       xmm2,xmm1                   ; |X| * 2/pi
    addsd       xmm2,L_point_five
    cvttpd2dq   xmm4,xmm2                   ; n = (int)(|X|*2/pi + 0.5)
    cvtdq2pd    xmm1,xmm4                   ; xmm1 <-- (double)n
    andpd       xmm4,L_int_three            ; xmm4 <-- region
    movapd      xmm2,xmm0

    movapd      xmm3,xmm1
    mulsd       xmm1,L_piby2_1
    subsd       xmm2,xmm1                   ; xmm2 <-- rhead = |X| - n*piby2_1
    mulsd       xmm3,L_piby2_1tail          ; xmm3 <-- rtail = n*piby2_1tail
    movapd      xmm0,xmm2
    subsd       xmm0,xmm3                   ; xmm0 <-- r = rhead - rtail
    subsd       xmm2,xmm0
    movapd      xmm1,xmm2
    subsd       xmm1,xmm3                   ; xmm1 <-- (rhead - r) - rtail (not read afterwards)
    jmp         Ltanf_sse2_exit_s

Ltanf_sse2_tanf_reduce_large:
    lea         r9,__L_2_by_pi_bits
    ; xexp = (x >> 52) - 1023
    movd        r11,xmm0
    mov         rcx,r11
    shr         r11,52
    sub         r11,1023                    ; r11 <-- xexp = exponent of input x
    ; calculate the last byte from which to start multiplication
    ; last = 134 - (xexp >> 3)
    mov         r10,r11
    shr         r10,3
    sub         r10,134                     ; r10 <-- -last
    neg         r10                         ; r10 <-- last
    ; load 64 bits of 2_by_pi
    mov         rax,[r9+r10]
    ; mantissa of x = ((x << 12) >> 12) | implied bit
    shl         rcx,12
    shr         rcx,12                      ; rcx <-- mantissa part of input x
    bts         rcx,52                      ; add the implied bit as well
    ; load next 128 bits of 2_by_pi
    add         r10,8                       ; increment to next 8 bytes of 2_by_pi
    movdqu      xmm0,XMMWORD PTR [r9+r10]
    ; do three 64bit multiplications with mant of x
    mul         rcx
    mov         r8,rax                      ; r8 = last 64 bits of mul = res1[2]
    mov         r10,rdx                     ; r10 = carry
    ; FIX: was "vmovq rax,xmm0".  This is the SSE2 fallback, reached only
    ; when __use_fma3_lib == 0, so a VEX-encoded move must not be used here.
    movd        rax,xmm0
    mul         rcx
    ; resexp = xexp & 7
    and         r11,7                       ; r11 <-- resexp = last 3 bits of xexp
    psrldq      xmm0,8
    add         rax,r10                     ; add the previous carry
    adc         rdx,0
    mov         r9,rax                      ; r9 <-- next 64 bits of mul = res1[1]
    mov         r10,rdx                     ; r10 <-- carry
    movd        rax,xmm0
    mul         rcx
    add         r10,rax                     ; r10 = most sig 64 bits = res1[0]
    ; find the region
    ; last three bits ltb = most sig bits >> (54 - resexp)
    ;   decimal point in last 18 bits == 8 lsb's in first 64 bits
    ;   and 8 msb's in next 64 bits
    ; point_five = ltb & 01h;
    ; region = ((ltb >> 1) + point_five) & 3;
    mov         rcx,54
    mov         rax,r10
    sub         rcx,r11
    xor         rdx,rdx                     ; rdx <-- sign of reduced value (0 = positive)
    shr         rax,cl                      ; CF <-- last bit shifted out (the "point five" bit)
    jnc         Ltanf_sse2_no_point_five_f
    ; if there is carry.. then negate the result of multiplication
    ; (not/mov leave CF untouched, so the adc below still sees it)
    not         r10
    not         r9
    not         r8
    mov         rdx,08000000000000000h
ALIGN 16
Ltanf_sse2_no_point_five_f:
    adc         rax,0                       ; round: add the carried-out half bit
    and         rax,3
    movd        xmm4,eax                    ; xmm4 <-- region
    ; calculate the number of integer bits and zero them out
    mov         rcx,r11
    add         rcx,10                      ; rcx = no. of integer bits
    shl         r10,cl
    shr         r10,cl                      ; r10 contains only mant bits
    sub         rcx,64                      ; form the exponent
    mov         r11,rcx
    ; find the highest set bit
    bsr         rcx,r10
    jnz         Ltanf_sse2_form_mantissa_f
    ; top word is zero: shift the 192-bit product up by one 64-bit word
    mov         r10,r9
    mov         r9,r8
    mov         r8,0
    bsr         rcx,r10                     ; rcx = hsb
    sub         r11,64
ALIGN 16
Ltanf_sse2_form_mantissa_f:
    add         r11,rcx                     ; for exp of x
    sub         rcx,52                      ; rcx = no. of bits to shift in r10
    cmp         rcx,0
    jl          Ltanf_sse2_hsb_below_52_f
    je          Ltanf_sse2_form_numbers_f
    ; hsb above 52
    mov         r8,r10
    shr         r10,cl                      ; r10 = mantissa of x with hsb at 52
    shr         r9,cl                       ; make space for bits from r10
    sub         rcx,64
    neg         rcx                         ; rcx = no of bits to shift r10
    shl         r8,cl
    or          r9,r8                       ; r9 = mantissa bits of xx
    jmp         Ltanf_sse2_form_numbers_f

ALIGN 16
Ltanf_sse2_hsb_below_52_f:
    neg         rcx                         ; rcx = left-shift count that puts hsb at bit 52
    mov         rax,r9
    shl         r10,cl
    shl         r9,cl
    sub         rcx,64
    neg         rcx                         ; rcx = 64 - shift count
    shr         rax,cl
    or          r10,rax                     ; bring the top bits of the next word into r10
    shr         r8,cl
    or          r9,r8
ALIGN 16
Ltanf_sse2_form_numbers_f:
    add         r11,1023                    ; biased exponent
    btr         r10,52                      ; remove the implied bit
    mov         rcx,r11
    or          r10,rdx                     ; put the sign
    shl         rcx,52
    or          r10,rcx                     ; x is in r10
    movd        xmm0,r10                    ; xmm0 <-- x
    mulsd       xmm0,L_piby2_lead           ; scale the reduced fraction by pi/2

Ltanf_sse2_exit_s:
    ; In: xmm0 = reduced argument r, xmm4 = region, xmm5 = X (for its sign).
    movd        eax,xmm4
    and         eax,1                       ; eax <-- region & 1
    movapd      xmm1,xmm0
    mulsd       xmm1,xmm0                   ; xmm1 <-- x*x

    movsd       xmm3,L_tanf+008h            ; xmm3 <-- c1
    mulsd       xmm3,xmm1                   ; xmm3 <-- c1*x^2
    addsd       xmm3,L_tanf                 ; xmm3 <-- c = c1*x^2 + c0

    movsd       xmm2,L_tanf+020h            ; xmm2 <-- d2
    mulsd       xmm2,xmm1                   ; xmm2 <-- d2*x^2
    addsd       xmm2,L_tanf+018h            ; xmm2 <-- d2*x^2 + d1
    mulsd       xmm2,xmm1                   ; xmm2 <-- (d2*x^2 + d1)*x^2
    addsd       xmm2,L_tanf+010h            ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    divsd       xmm3,xmm2                   ; xmm3 <-- c/d
    mulsd       xmm1,xmm0                   ; xmm1 <-- x^3
    mulsd       xmm1,xmm3                   ; xmm1 <-- x^3 * c/d
    addsd       xmm0,xmm1                   ; xmm0 <-- x + x^3 * c/d
    cmp         eax,01h
    jne         Ltanf_sse2_exit_tanpiby4
Ltanf_sse2_recip:
    ; odd region: result is -1/t
    movsd       xmm3,L_n_one                ; 64-bit load of -1.0 (same form as the inexact path above)
    divsd       xmm3,xmm0
    movsd       xmm0,xmm3
Ltanf_sse2_exit_tanpiby4:
    andpd       xmm5,L_signbit              ; isolate the sign of the original x
    xorpd       xmm0,xmm5                   ; apply it to the result

Ltanf_sse2_return_s:
    cvtsd2ss    xmm0,xmm0                   ; round the double result to single
Ltanf_sse2_return_c:
    StackDeallocate stack_size
    ret

Ltanf_sse2_naninf:
    ; xmm0 still holds the caller's x; whatever fname_special leaves in
    ; xmm0 is returned as-is.
    call        fname_special
    StackDeallocate stack_size
    ret

;=======================================================================
; FMA3 path.  Same algorithm as above using VEX / FMA3 encodings.
;=======================================================================
ALIGN 16
Ltanf_fma3:
    vmovd       eax,xmm0
    mov         r8d,L_inf_mask_32
    and         eax,r8d
    cmp         eax,r8d
    jz          Ltanf_fma3_naninf           ; exponent field all ones: NaN or Inf

    vcvtss2sd   xmm5,xmm0,xmm0              ; xmm5 <-- X = (double)x
    vmovq       r9,xmm5
    btr         r9,63                       ; r9 <-- |X| (bit pattern)

    cmp         r9,L_piby4
    jg          Ltanf_fma3_range_reduce
    cmp         r9,L_mask_3f2               ; compare to 2^-13
    jge         Ltanf_fma3_compute_tanf_piby_4
    cmp         r9,L_mask_3e4               ; compare to 2^-27
    jge         Ltanf_fma3_compute_x_xxx_0_333
    jmp         Ltanf_fma3_return_c         ; tan(x) ~= x: return x unchanged

Ltanf_fma3_compute_x_xxx_0_333:
    vmulsd      xmm2,xmm5,xmm5              ; xmm2 <-- x^2
    vmulsd      xmm0,xmm2,xmm5              ; xmm0 <-- x^3
    vfmadd132sd xmm0,xmm5,L_point_333       ; x + x*x*x*0.3333333333333333
    jmp         Ltanf_fma3_return_s

Ltanf_fma3_compute_tanf_piby_4:
    vmovsd      xmm0,xmm5,xmm5              ; xmm0 <-- x (as double)
    vmulsd      xmm1,xmm0,xmm0              ; xmm1 <-- x*x
    vmovsd      xmm3,L_tanf+008h            ; xmm3 <-- c1
    vfmadd213sd xmm3,xmm1,L_tanf            ; xmm3 <-- c = c1*x^2 + c0
    vmovsd      xmm2,L_tanf+020h            ; xmm2 <-- d2
    vfmadd213sd xmm2,xmm1,L_tanf+018h       ; xmm2 <-- d2*x^2 + d1
    vfmadd213sd xmm2,xmm1,L_tanf+010h       ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    vdivsd      xmm3,xmm3,xmm2              ; xmm3 <-- c/d
    vmulsd      xmm1,xmm1,xmm0              ; xmm1 <-- x^3
    vfmadd231sd xmm0,xmm1,xmm3              ; xmm0 <-- x + x^3 * c/d
    jmp         Ltanf_fma3_return_s

Ltanf_fma3_range_reduce:
    vmovq       xmm0,r9                     ; xmm0 <-- |X|
    cmp         r9,L_large_x_fma3
    jge         Ltanf_fma3_tanf_reduce_large

Ltanf_fma3_tanf_reduce_moderate:
    vandpd      xmm1,xmm0,L_sign_mask
    vmovapd     xmm2,L_twobypi
    vfmadd213sd xmm2,xmm1,L_point_five      ; |X|*2/pi + 0.5
    vcvttpd2dq  xmm2,xmm2                   ; n (as 32-bit ints)
    vpmovsxdq   xmm1,xmm2                   ; sign-extend to 64-bit lanes
    vandpd      xmm4,xmm1,L_int_three       ; xmm4 <-- region
    vshufps     xmm1,xmm1,xmm1,8            ; repack low dwords of the qword lanes
    vcvtdq2pd   xmm1,xmm1                   ; xmm1 <-- (double)n
    vmovdqa     xmm2,xmm0
    vfnmadd231sd xmm2,xmm1,L_piby2_1        ; xmm2 <-- rhead = |X| - n*piby2_1
    vmulsd      xmm3,xmm1,L_piby2_1tail     ; xmm3 <-- rtail = n*piby2_1tail
    vsubsd      xmm0,xmm2,xmm3              ; xmm0 <-- r = rhead - rtail
    vsubsd      xmm2,xmm2,xmm0
    vsubsd      xmm1,xmm2,xmm3              ; xmm1 <-- (rhead - r) - rtail (not read afterwards)
    jmp         Ltanf_fma3_exit_s

Ltanf_fma3_tanf_reduce_large:
    lea         r9,__L_2_by_pi_bits
    ; xexp = (x >> 52) - 1023
    vmovq       r11,xmm0
    mov         rcx,r11
    shr         r11,52
    sub         r11,1023                    ; r11 <-- xexp = exponent of input x
    ; calculate the last byte from which to start multiplication
    ; last = 134 - (xexp >> 3)
    mov         r10,r11
    shr         r10,3
    sub         r10,134                     ; r10 <-- -last
    neg         r10                         ; r10 <-- last
    ; load 64 bits of 2_by_pi
    mov         rax,[r9+r10]
    ; mantissa of x = ((x << 12) >> 12) | implied bit
    shl         rcx,12
    shr         rcx,12                      ; rcx <-- mantissa part of input x
    bts         rcx,52                      ; add the implied bit as well
    ; load next 128 bits of 2_by_pi
    add         r10,8                       ; increment to next 8 bytes of 2_by_pi
    vmovdqu     xmm0,XMMWORD PTR [r9+r10]
    ; do three 64bit multiplications with mant of x
    mul         rcx
    mov         r8,rax                      ; r8 = last 64 bits of mul = res1[2]
    mov         r10,rdx                     ; r10 = carry
    vmovq       rax,xmm0
    mul         rcx
    ; resexp = xexp & 7
    and         r11,7                       ; r11 <-- resexp = last 3 bits of xexp
    vpsrldq     xmm0,xmm0,8
    add         rax,r10                     ; add the previous carry
    adc         rdx,0
    mov         r9,rax                      ; r9 <-- next 64 bits of mul = res1[1]
    mov         r10,rdx                     ; r10 <-- carry
    vmovq       rax,xmm0
    mul         rcx
    add         r10,rax                     ; r10 = most sig 64 bits = res1[0]
    ; find the region
    ; last three bits ltb = most sig bits >> (54 - resexp)
    ;   decimal point in last 18 bits == 8 lsb's in first 64 bits
    ;   and 8 msb's in next 64 bits
    ; point_five = ltb & 01h;
    ; region = ((ltb >> 1) + point_five) & 3;
    mov         rcx,54
    mov         rax,r10
    sub         rcx,r11
    xor         rdx,rdx                     ; rdx <-- sign of reduced value (0 = positive)
    shr         rax,cl                      ; CF <-- last bit shifted out (the "point five" bit)
    jnc         Ltanf_fma3_no_point_five_f
    ; if there is carry.. then negate the result of multiplication
    ; (not/mov leave CF untouched, so the adc below still sees it)
    not         r10
    not         r9
    not         r8
    mov         rdx,08000000000000000h
ALIGN 16
Ltanf_fma3_no_point_five_f:
    adc         rax,0                       ; round: add the carried-out half bit
    and         rax,3
    vmovd       xmm4,eax                    ; xmm4 <-- region
    ; calculate the number of integer bits and zero them out
    mov         rcx,r11
    add         rcx,10                      ; rcx = no. of integer bits
    shl         r10,cl
    shr         r10,cl                      ; r10 contains only mant bits
    sub         rcx,64                      ; form the exponent
    mov         r11,rcx
    ; find the highest set bit
    bsr         rcx,r10
    jnz         Ltanf_fma3_form_mantissa_f
    ; top word is zero: shift the 192-bit product up by one 64-bit word
    mov         r10,r9
    mov         r9,r8
    mov         r8,0
    bsr         rcx,r10                     ; rcx = hsb
    sub         r11,64
ALIGN 16
Ltanf_fma3_form_mantissa_f:
    add         r11,rcx                     ; for exp of x
    sub         rcx,52                      ; rcx = no. of bits to shift in r10
    cmp         rcx,0
    jl          Ltanf_fma3_hsb_below_52_f
    je          Ltanf_fma3_form_numbers_f
    ; hsb above 52
    mov         r8,r10
    shr         r10,cl                      ; r10 = mantissa of x with hsb at 52
    shr         r9,cl                       ; make space for bits from r10
    sub         rcx,64
    neg         rcx                         ; rcx = no of bits to shift r10
    shl         r8,cl
    or          r9,r8                       ; r9 = mantissa bits of xx
    jmp         Ltanf_fma3_form_numbers_f

ALIGN 16
Ltanf_fma3_hsb_below_52_f:
    neg         rcx                         ; rcx = left-shift count that puts hsb at bit 52
    mov         rax,r9
    shl         r10,cl
    shl         r9,cl
    sub         rcx,64
    neg         rcx                         ; rcx = 64 - shift count
    shr         rax,cl
    or          r10,rax                     ; bring the top bits of the next word into r10
    shr         r8,cl
    or          r9,r8
ALIGN 16
Ltanf_fma3_form_numbers_f:
    add         r11,1023                    ; biased exponent
    btr         r10,52                      ; remove the implied bit
    mov         rcx,r11
    or          r10,rdx                     ; put the sign
    shl         rcx,52
    or          r10,rcx                     ; x is in r10
    vmovq       xmm0,r10                    ; xmm0 <-- x
    vmulsd      xmm0,xmm0,L_piby2_lead      ; scale the reduced fraction by pi/2

Ltanf_fma3_exit_s:
    ; In: xmm0 = reduced argument r, xmm4 = region, xmm5 = X (for its sign).
    vandpd      xmm2,xmm4,XMMWORD PTR L_int_one
    vmovd       eax,xmm2                    ; eax <-- region & 1
    vmulsd      xmm1,xmm0,xmm0              ; xmm1 <-- x*x
    vmovsd      xmm3,L_tanf+008h            ; xmm3 <-- c1
    vfmadd213sd xmm3,xmm1,L_tanf            ; xmm3 <-- c = c1*x^2 + c0
    vmovsd      xmm2,L_tanf+020h            ; xmm2 <-- d2
    vfmadd213sd xmm2,xmm1,L_tanf+018h       ; xmm2 <-- d2*x^2 + d1
    vfmadd213sd xmm2,xmm1,L_tanf+010h       ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    vdivsd      xmm3,xmm3,xmm2              ; xmm3 <-- c/d
    vmulsd      xmm1,xmm1,xmm0              ; xmm1 <-- x^3
    vfmadd231sd xmm0,xmm1,xmm3              ; xmm0 <-- x + x^3 * c/d
    cmp         eax,01h
    jne         Ltanf_fma3_exit_tanpiby4    ; even region: keep t

Ltanf_fma3_recip:
    ; odd region: result is -1/t
    vmovq       xmm3,L_n_one
    vdivsd      xmm0,xmm3,xmm0

Ltanf_fma3_exit_tanpiby4:
    vandpd      xmm5,xmm5,L_signbit         ; isolate the sign of the original x
    vxorpd      xmm0,xmm0,xmm5              ; apply it to the result

Ltanf_fma3_return_s:
    vcvtsd2ss   xmm0,xmm0,xmm0              ; round the double result to single
Ltanf_fma3_return_c:
    StackDeallocate stack_size
    ret

Ltanf_fma3_naninf:
    ; xmm0 still holds the caller's x; whatever fname_special leaves in
    ; xmm0 is returned as-is.
    call        fname_special
    StackDeallocate stack_size
    ret

fname endp
|
|
END
|