mirror of
https://github.com/reactos/reactos.git
synced 2024-10-31 03:48:17 +00:00
512 lines
19 KiB
NASM
512 lines
19 KiB
NASM
;
|
|
; MIT License
|
|
; -----------
|
|
;
|
|
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
|
;
|
|
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
; of this Software and associated documentaon files (the "Software"), to deal
|
|
; in the Software without restriction, including without limitation the rights
|
|
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
; copies of the Software, and to permit persons to whom the Software is
|
|
; furnished to do so, subject to the following conditions:
|
|
;
|
|
; The above copyright notice and this permission notice shall be included in
|
|
; all copies or substantial portions of the Software.
|
|
;
|
|
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
; THE SOFTWARE.
|
|
;
|
|
;
|
|
; An implementation of the sin function.
|
|
;
|
|
; Prototype:
|
|
;
|
|
; double sin(double x);
|
|
;
|
|
; Computes sin(x).
|
|
; It will provide proper C99 return values,
|
|
; but may not raise floating point status bits properly.
|
|
; Based on the NAG C implementation.
|
|
;
|
|
; If FMA3 hardware is available, an FMA3 implementation of sin will be used.
|
|
|
|
|
|
.const
|
|
ALIGN 16
|
|
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
|
|
DQ 0
|
|
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
|
|
DQ 0
|
|
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
|
|
DQ 0
|
|
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
|
|
DQ 0
|
|
ALIGN 16
|
|
|
|
L_one DQ 03FF0000000000000h, 03FF0000000000000h
|
|
L_signbit DQ 08000000000000000h, 00000000000000000h
|
|
L_int_one DQ 00000000000000001h, 00000000000000000h
|
|
L_int_two DQ 00000000000000002h, 00000000000000000h
|
|
L_int_three DQ 00000000000000003h, 00000000000000000h
|
|
|
|
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
|
|
L_one_half DQ 03FE0000000000000h ; .5
|
|
L_one_sixth DQ 03FC5555555555555h ; .1666...
|
|
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
|
|
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
|
|
L_piby4 DQ 03FE921FB54442D18h ; pi/4
|
|
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
|
|
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
|
|
|
|
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
|
|
|
|
EXTRN __Lcosarray:QWORD
|
|
EXTRN __Lsinarray:QWORD
|
|
EXTRN __use_fma3_lib:DWORD
|
|
|
|
; define local variable storage offsets
|
|
p_temp EQU 030h
|
|
p_temp1 EQU 040h
|
|
save_r10 EQU 050h
|
|
dummy_space EQU 060h
|
|
stack_size EQU 078h
|
|
|
|
include fm.inc
|
|
|
|
fname TEXTEQU <sin>
|
|
fname_special TEXTEQU <_sin_special>
|
|
|
|
;Define name and any external functions being called
|
|
EXTERN __remainder_piby2_forAsm : PROC
|
|
EXTERN __remainder_piby2_fma3 : PROC
|
|
EXTERN __remainder_piby2_fma3_bdl : PROC
|
|
EXTERN fname_special : PROC
|
|
|
|
.code
|
|
|
|
PUBLIC fname
|
|
fname PROC FRAME
|
|
StackAllocate stack_size
|
|
.ENDPROLOG
|
|
cmp DWORD PTR __use_fma3_lib, 0
|
|
jne Lsin_fma3
|
|
|
|
Lsin_sse2:
|
|
movd rdx, xmm0
|
|
xorpd xmm2, xmm2 ; zeroed out for later use
|
|
|
|
mov r10,rdx
|
|
mov r8d, 1 ; for determining region later on
|
|
btr r10,63 ; r10 <-- |x|
|
|
cmp r10,L_piby4
|
|
jb Lsin_sse2_absx_lt_piby4
|
|
|
|
Lsin_sse2_absx_nlt_piby4: ; common case
|
|
mov r11,rdx
|
|
shr r11,63
|
|
movd xmm0,r10 ; xmm0 <-- |x|
|
|
cmp r10, QWORD PTR L_small_arg_cw
|
|
jae Lsin_reduce_precise ; Note NaN/Inf will branch
|
|
|
|
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
|
|
; Note that if |x| were too large, conversion of npi2 to integer would fail.
|
|
; We reduce the argument to be in a range from -pi/4 to +pi/4
|
|
; by subtracting multiples of pi/2
|
|
movapd xmm2, xmm0
|
|
mulsd xmm2, L_2_by_pi
|
|
movapd xmm4, xmm0
|
|
|
|
; xexp = ax >> EXPSHIFTBITS_DP64;
|
|
mov r9, r10
|
|
shr r9, 52 ; >>EXPSHIFTBITS_DP64
|
|
|
|
; How many pi/2 is |x| a multiple of?
|
|
; npi2 = (int)(x * twobypi + 0.5);
|
|
addsd xmm2, L_one_half ; npi2
|
|
|
|
movsd xmm3, L_real_piby2_1
|
|
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer
|
|
movsd xmm1, L_real_piby2_1tail
|
|
cvtdq2pd xmm2, xmm0 ; npi2 back to double
|
|
|
|
; Subtract the multiple from x to get an extra-precision remainder
|
|
; rhead = x - npi2 * piby2_1;
|
|
mulsd xmm3, xmm2
|
|
subsd xmm4, xmm3 ; rhead
|
|
|
|
; rtail = npi2 * piby2_1tail;
|
|
mulsd xmm1, xmm2 ; rtail
|
|
movd eax, xmm0 ; eax <-- npi2
|
|
|
|
; GET_BITS_DP64(rhead-rtail, uy);
|
|
; originally only rhead
|
|
movapd xmm0, xmm4
|
|
subsd xmm0, xmm1
|
|
|
|
movsd xmm3, L_real_piby2_2
|
|
movd rcx, xmm0 ; rcx <-- rhead - rtail
|
|
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
|
|
|
|
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
|
|
; xmm4=rhead, xmm5= temp for calc
|
|
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
|
; expdiff measures how close rhead - rtail is to |x|
|
|
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
|
|
shl rcx, 1 ; strip any sign bit
|
|
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
|
|
sub r9, rcx ; expdiff
|
|
|
|
;; if (expdiff > 15)
|
|
cmp r9, 15
|
|
jle Lsin_sse2_cw_reduction_done
|
|
|
|
; Here the remainder is pretty small compared with x, which
|
|
; implies that x is a near multiple of pi/2
|
|
; (x matches the multiple to at least 15 bits)
|
|
; So we do another stage of argument reduction.
|
|
|
|
; t = rhead;
|
|
movapd xmm1, xmm4
|
|
|
|
; rtail = npi2 * piby2_2;
|
|
mulsd xmm3, xmm2
|
|
|
|
; rhead = t - rtail;
|
|
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
|
subsd xmm4, xmm3 ; rhead
|
|
|
|
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
|
subsd xmm1, xmm4 ; t - rhead
|
|
subsd xmm1, xmm3 ; -rtail
|
|
subsd xmm5, xmm1 ; rtail
|
|
|
|
; r = rhead - rtail;
|
|
movapd xmm0, xmm4
|
|
|
|
;HARSHA
|
|
;xmm1=rtail
|
|
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
|
|
subsd xmm0, xmm5
|
|
|
|
|
|
; xmm0=r, xmm4=rhead, xmm1=rtail
|
|
Lsin_sse2_cw_reduction_done:
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; if the input was close to a pi/2 multiple
|
|
; The original NAG code missed this trick.
|
|
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
|
|
; then the sin is either ~ 1.0 or ~r, to within 53 bits.
|
|
|
|
; Note: Unfortunately this introduces two jcc instructions close to each
|
|
; other and to other branches. As r < 2^-13 should be rather uncommon, it
|
|
; almost certainly costs more than it saves. - WAT
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region = npi2 & 3;
|
|
|
|
subsd xmm4, xmm0 ; rhead-r
|
|
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
|
|
|
|
Lsin_piby4:
|
|
; perform taylor series to calc sinx, sinx for |x| <= pi/4
|
|
; x2 = r * r;
|
|
|
|
;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path
|
|
;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
|
|
movapd xmm3, xmm0
|
|
movapd xmm2, xmm0
|
|
mulsd xmm2, xmm0 ;x2
|
|
|
|
bt eax,0
|
|
jc Lsin_sse2_calc_cos
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region 0 or 2 do a sin calculation
|
|
movsd xmm3, __Lsinarray+50h ; s6
|
|
mulsd xmm3, xmm2 ; x2s6
|
|
movsd xmm5, __Lsinarray+20h ; s3
|
|
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
|
|
movapd xmm1, xmm2 ; move for x4
|
|
mulsd xmm1, xmm2 ; x4
|
|
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
|
|
mulsd xmm5, xmm2 ; x2s3
|
|
movapd xmm4, xmm0 ; move for x3
|
|
addsd xmm3, __Lsinarray+40h ; s5+x2s6
|
|
mulsd xmm1, xmm2 ; x6
|
|
mulsd xmm3, xmm2 ; x2(s5+x2s6)
|
|
mulsd xmm4, xmm2 ; x3
|
|
addsd xmm5, __Lsinarray+10h ; s2+x2s3
|
|
mulsd xmm5, xmm2 ; x2(s2+x2s3)
|
|
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
|
|
mulsd xmm2, L_one_half ; 0.5 *x2
|
|
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
|
|
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
|
|
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
|
|
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
|
|
addsd xmm3, xmm5 ; zs
|
|
mulsd xmm4, xmm3 ; *x3
|
|
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
|
|
addsd xmm0, xmm4 ; +xx
|
|
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
|
|
|
|
jmp Lsin_sse2_adjust_region
|
|
|
|
ALIGN 16
|
|
Lsin_sse2_calc_cos:
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region 1 or 3 - do a cos calculation
|
|
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
|
|
mulsd xmm4, xmm0 ; x*xx
|
|
movsd xmm5, L_one_half
|
|
movsd xmm1, __Lcosarray+50h ; c6
|
|
movsd xmm0, __Lcosarray+20h ; c3
|
|
mulsd xmm5, xmm2 ; r = 0.5 *x2
|
|
movapd xmm3, xmm2 ; copy of x2
|
|
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
|
|
mulsd xmm1, xmm2 ; c6*x2
|
|
mulsd xmm0, xmm2 ; c3*x2
|
|
subsd xmm5, L_one ; -t=r-1.0, trash r
|
|
mulsd xmm3, xmm2 ; x4
|
|
addsd xmm1, __Lcosarray+40h ; c5+x2c6
|
|
addsd xmm0, __Lcosarray+10h ; c2+x2C3
|
|
addsd xmm5, L_one ; 1 + (-t), trash t
|
|
mulsd xmm3, xmm2 ; x6
|
|
mulsd xmm1, xmm2 ; x2(c5+x2c6)
|
|
mulsd xmm0, xmm2 ; x2(c2+x2C3)
|
|
movapd xmm4, xmm2 ; copy of x2
|
|
mulsd xmm4, L_one_half ; r recalculate
|
|
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
|
|
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
|
|
mulsd xmm2, xmm2 ; x4 recalculate
|
|
subsd xmm5, xmm4 ; (1 + (-t)) - r
|
|
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
|
|
addsd xmm0, xmm1 ; zc
|
|
subsd xmm4, L_one ; t relaculate
|
|
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
|
|
mulsd xmm0, xmm2 ; x4 * zc
|
|
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
|
|
subsd xmm0, xmm4 ; result - (-t)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
Lsin_sse2_adjust_region:
|
|
; positive or negative
|
|
; switch (region)
|
|
shr eax, 1
|
|
mov ecx, eax
|
|
and eax, r11d
|
|
|
|
not ecx
|
|
not r11d
|
|
and ecx, r11d
|
|
|
|
or eax, ecx
|
|
and eax, 1
|
|
jnz Lsin_sse2_cleanup
|
|
|
|
;; if the original region 0, 1 and arg is negative, then we negate the result.
|
|
;; if the original region 2, 3 and arg is positive, then we negate the result.
|
|
movapd xmm2, xmm0
|
|
xorpd xmm0, xmm0
|
|
subsd xmm0, xmm2
|
|
|
|
ALIGN 16
|
|
Lsin_sse2_cleanup:
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
Lsin_sse2_absx_lt_piby4:
|
|
; sin = sin_piby4(x, 0.0);
|
|
|
|
; x2 = r * r;
|
|
movapd xmm2, xmm0
|
|
mulsd xmm2, xmm0 ; x2
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region 0 - do a sin calculation
|
|
; zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6))));
|
|
movsd xmm3, __Lsinarray+50h ; s6
|
|
mulsd xmm3, xmm2 ; x2s6
|
|
movsd xmm5, __Lsinarray+20h ; s3
|
|
movapd xmm1, xmm2 ; move for x4
|
|
mulsd xmm1, xmm2 ; x4
|
|
mulsd xmm5, xmm2 ; x2s3
|
|
movapd xmm4, xmm0 ; move for x3
|
|
addsd xmm3, __Lsinarray+40h ; s5+x2s6
|
|
mulsd xmm1, xmm2 ; x6
|
|
mulsd xmm3, xmm2 ; x2(s5+x2s6)
|
|
mulsd xmm4, xmm2 ; x3
|
|
addsd xmm5, __Lsinarray+10h ; s2+x2s3
|
|
mulsd xmm5, xmm2 ; x2(s2+x2s3)
|
|
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
|
|
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
|
|
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
|
|
addsd xmm3, xmm5 ; zs
|
|
mulsd xmm4, xmm3 ; *x3
|
|
addsd xmm0, xmm4 ; +x
|
|
jmp Lsin_sse2_cleanup
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
ALIGN 16
|
|
Lsin_reduce_precise:
|
|
; Reduce x into range [-pi/4, pih/4]
|
|
cmp r10,L__inf_mask_64
|
|
jae Lsin_x_naninf
|
|
mov QWORD PTR p_temp[rsp], r11
|
|
call __remainder_piby2_forAsm
|
|
mov r11, QWORD PTR p_temp[rsp]
|
|
|
|
; At this point xmm0 has r, xmm1 has rr, rax has region
|
|
|
|
movapd xmm4, xmm1 ; xmm4 <-- rr
|
|
jmp Lsin_piby4
|
|
|
|
; xmm0 = x, xmm4 = xx, eax= region
|
|
|
|
|
|
ALIGN 16
|
|
Lsin_x_naninf:
|
|
call fname_special
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; From this point we assume that FMA3 and AVX hardware are present.
|
|
|
|
ALIGN 16
|
|
Lsin_fma3:
|
|
vmovq r9,xmm0
|
|
mov r10,r9 ; save x to get sign later
|
|
btr r9,63 ; r9 <-- |x|
|
|
cmp r9,L_piby4
|
|
jae Lsin_fma3_absx_nlt_piby4 ; Note that NaN will branch
|
|
cmp r9,L_two_to_neg_13
|
|
jae Lsin_fma3_calc_sin_for_absx_lt_piby4
|
|
cmp r9,L_two_to_neg_27
|
|
jae Lsin_fma3_compute_x_xxx_0_1666
|
|
StackDeallocate stack_size
|
|
ret ; sin x ~= x for |x| < 2^-27
|
|
|
|
ALIGN 16
|
|
Lsin_fma3_compute_x_xxx_0_1666: ; |x| in [2^-27,2^-13]
|
|
vmulsd xmm1,xmm0,xmm0 ; xmm1l <-- x*x
|
|
vmulsd xmm1,xmm1,xmm0 ; xmm1l <-- x*x*x
|
|
vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6)
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4]
|
|
vmovsd xmm5,__Lsinarray+050h
|
|
vmulsd xmm3,xmm0,xmm0 ; xmm3l <-- x^2
|
|
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+040h
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+010h
|
|
|
|
vmulsd xmm4,xmm0,xmm3 ; xmm4l <-- x^3
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray
|
|
vfmadd231sd xmm0,xmm4,xmm5 ; xmm0l <-- x + x^3 p(x^2)
|
|
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
Lsin_fma3_absx_nlt_piby4: ; !(|x| < pi/4)
|
|
; here r9 has |x|
|
|
cmp r9,L__inf_mask_64
|
|
jae Lsin_x_naninf
|
|
;Lrange_reduce: ;; unused label
|
|
|
|
vmovq xmm0,r9 ; xmm0 <-- |x|
|
|
cmp r9,L_small_arg_bdl
|
|
jae Lsin_fma3_do_general_arg_reduction
|
|
|
|
; Note that __remainder_piby2_fma3 conventions are
|
|
; on input
|
|
; |x| is in xmm0
|
|
; on output
|
|
; r is in xmm0
|
|
; rr is in xmm1
|
|
; region of |x| is in rax
|
|
|
|
; Boldo-Daumas-Li reduction for reasonably small |x|
|
|
call __remainder_piby2_fma3_bdl
|
|
Lsin_fma3_exit_s:
|
|
bt rax,0
|
|
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
|
|
jc Lsin_fma3_calc_cos
|
|
|
|
Lsin_fma3_calc_sin: ;; unused label
|
|
; region 0 or 2
|
|
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
|
|
vmovsd xmm5,__Lsinarray+050h
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+040h
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
|
|
|
|
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
|
|
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r
|
|
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x
|
|
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r
|
|
vmulsd xmm2,xmm3,xmm2
|
|
vsubsd xmm2,xmm2,xmm1
|
|
vfnmadd231sd xmm2, xmm4,__Lsinarray
|
|
vsubsd xmm0,xmm0,xmm2
|
|
jmp Lsin_fma3_exit_s_1
|
|
|
|
ALIGN 16
|
|
Lsin_fma3_calc_cos:
|
|
; region 1 or 3
|
|
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
|
|
vmovapd xmm2,L_one
|
|
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
|
|
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
|
|
vsubsd xmm2,xmm2,xmm4 ; 1-t
|
|
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
|
|
vmovsd xmm5,__Lcosarray+050h
|
|
vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2
|
|
vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray+040h
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray+030h
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray+020h
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray+010h
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray
|
|
vfmadd213sd xmm5,xmm1,xmm2
|
|
vaddsd xmm0,xmm5,xmm4
|
|
|
|
Lsin_fma3_exit_s_1:
|
|
xor r8,r8 ; prepare r8 for cmov
|
|
and r10,L_signbit ; isolate original sign of x
|
|
bt eax,1
|
|
cmovc r8,L_signbit
|
|
xor r8,r10
|
|
vmovq xmm3,r8
|
|
vxorpd xmm0,xmm0,xmm3
|
|
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
Lsin_fma3_do_general_arg_reduction:
|
|
; argument reduction for general x
|
|
|
|
; NOTE: the BDL argument reduction routine does not touch r10,
|
|
; but the general-purpose reduction does.
|
|
mov QWORD PTR [save_r10+rsp], r10
|
|
call __remainder_piby2_fma3
|
|
mov r10, QWORD PTR [save_r10+rsp]
|
|
jmp Lsin_fma3_exit_s
|
|
|
|
fname endp
|
|
END
|
|
|