mirror of
https://github.com/reactos/reactos.git
synced 2024-11-20 06:15:26 +00:00
533 lines
20 KiB
NASM
533 lines
20 KiB
NASM
;
|
|
; MIT License
|
|
; -----------
|
|
;
|
|
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
|
;
|
|
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
; of this Software and associated documentation files (the "Software"), to deal
|
|
; in the Software without restriction, including without limitation the rights
|
|
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
; copies of the Software, and to permit persons to whom the Software is
|
|
; furnished to do so, subject to the following conditions:
|
|
;
|
|
; The above copyright notice and this permission notice shall be included in
|
|
; all copies or substantial portions of the Software.
|
|
;
|
|
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
; THE SOFTWARE.
|
|
;
|
|
;
|
|
; An implementation of the cos function.
|
|
;
|
|
; Prototype:
|
|
;
|
|
; double cos(double x);
|
|
;
|
|
; Computes cos(x).
|
|
; It will provide proper C99 return values,
|
|
; but may not raise floating point status bits properly.
|
|
; Based on the NAG C implementation.
|
|
;
|
|
; If FMA3 hardware is available, an FMA3 implementation of cos will be used.
|
|
|
|
.const
|
|
ALIGN 16
|
|
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
|
|
DQ 0
|
|
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
|
|
DQ 0
|
|
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
|
|
DQ 0
|
|
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
|
|
DQ 0
|
|
|
|
ALIGN 16
|
|
L_one DQ 03FF0000000000000h, 03FF0000000000000h
|
|
L_signbit DQ 08000000000000000h, 00000000000000000h
|
|
L_int_one DQ 00000000000000001h, 00000000000000000h
|
|
L_int_two DQ 00000000000000002h, 00000000000000000h
|
|
|
|
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
|
|
L_one_half DQ 03FE0000000000000h ; .5
|
|
L_neg_one_half DQ 0bfe0000000000000h ; - 0.5
|
|
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
|
|
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
|
|
L_piby4 DQ 03FE921FB54442D18h ; pi/4
|
|
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
|
|
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
|
|
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
|
|
|
|
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
|
|
|
|
|
|
|
|
EXTRN __Lcosarray:QWORD
|
|
EXTRN __Lsinarray:QWORD
|
|
EXTRN __use_fma3_lib:DWORD
|
|
|
|
; local storage offsets
|
|
p_temp EQU 020h ; temporary for get/put bits operation
|
|
p_temp1 EQU 030h ; temporary for get/put bits operation
|
|
dummy_space EQU 040h
|
|
stack_size EQU 068h
|
|
|
|
include fm.inc
|
|
|
|
fname TEXTEQU <cos>
|
|
fname_special TEXTEQU <_cos_special>
|
|
|
|
;Define name and any external functions being called
|
|
EXTERN __remainder_piby2_forAsm : PROC
|
|
EXTERN __remainder_piby2_fma3 : PROC
|
|
EXTERN __remainder_piby2_fma3_bdl : PROC
|
|
EXTERN fname_special : PROC
|
|
|
|
.code
|
|
|
|
PUBLIC fname
|
|
; double cos(double x)
; The argument is read from xmm0 and every exit path leaves the result in xmm0.
; stack_size bytes are reserved with the StackAllocate macro (presumably
; defined in fm.inc - not visible here); p_temp and p_temp1 are offsets into
; that area used as scratch slots by the SSE2 path.
fname PROC FRAME
|
|
StackAllocate stack_size
|
|
.ENDPROLOG
|
|
|
|
; Run-time dispatch: a nonzero __use_fma3_lib selects the FMA3/AVX code at
; L_cos_fma3; otherwise execution falls through into the SSE2 code.
cmp DWORD PTR __use_fma3_lib, 0
|
|
jne L_cos_fma3
|
|
|
|
; SSE2 entry: classify |x| using integer compares on its bit pattern.
Lcos_sse2:
|
|
movd rdx, xmm0 ; rdx <-- bit pattern of x
|
|
xorpd xmm2, xmm2 ; zeroed out for later use
|
|
|
|
mov r10, rdx
|
|
btr r10, 63 ; r10 <-- |x| (sign bit cleared)
|
|
cmp r10, L_piby4 ; unsigned compare against the bits of pi/4
|
|
jb Lcos_sse2_absx_lt_piby4 ; |x| < pi/4: handled without range reduction
|
|
|
|
; |x| >= pi/4. For |x| below L_small_arg_cw the argument is reduced inline
; by subtracting npi2 multiples of pi/2 (pi/2 is split into the pieces
; piby2_1, piby2_1tail, piby2_2, piby2_2tail); larger arguments, NaN and Inf
; branch to Lcos_reduce_precise.
Lcos_absx_nlt_piby4: ; common case
|
|
|
|
; Here rdx has x, r10 has |x|
|
|
movd xmm0, r10 ; xmm0 <-- |x|
|
|
|
|
cmp r10, QWORD PTR L_small_arg_cw
|
|
jae Lcos_reduce_precise ; Note NaN/Inf will branch
|
|
|
|
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
|
|
; Note that if |x| were too large, conversion of npi2 to integer would fail.
|
|
; We reduce the argument to be in a range from -pi/4 to +pi/4
|
|
; by subtracting multiples of pi/2
|
|
movapd xmm2, xmm0
|
|
mulsd xmm2, L_2_by_pi ; xmm2 <-- |x| * 2/pi
|
|
movapd xmm4, xmm0 ; xmm4 <-- |x|, becomes rhead below
|
|
|
|
|
|
; xexp = ax >> EXPSHIFTBITS_DP64;
|
|
mov r9, r10
|
|
shr r9, 52 ; >>EXPSHIFTBITS_DP64
|
|
|
|
|
|
; How many pi/2 is |x| a multiple of?
|
|
; npi2 = (int)(x * twobypi + 0.5);
|
|
addsd xmm2, L_one_half ; npi2 (before truncation)
|
|
|
|
|
|
movsd xmm3, L_real_piby2_1
|
|
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer (truncating)
|
|
movsd xmm1, L_real_piby2_1tail
|
|
cvtdq2pd xmm2, xmm0 ; and back to double.
|
|
|
|
|
|
; Subtract the multiple from x to get an extra-precision remainder
|
|
; rhead = x - npi2 * piby2_1;
|
|
mulsd xmm3, xmm2 ; npi2 * piby2_1
|
|
subsd xmm4, xmm3 ; rhead
|
|
|
|
|
|
; rtail = npi2 * piby2_1tail;
|
|
mulsd xmm1, xmm2 ; rtail
|
|
movd eax, xmm0 ; eax <-- npi2 (integer); kept as the region
|
|
|
|
|
|
; GET_BITS_DP64(rhead-rtail, uy);
|
|
; originally only rhead
|
|
movapd xmm0, xmm4
|
|
subsd xmm0, xmm1 ; r = rhead - rtail
|
|
|
|
|
|
movsd xmm3, L_real_piby2_2
|
|
movd rcx, xmm0 ; rcx <-- bits of (rhead - rtail)
|
|
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
|
|
|
|
|
|
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
|
|
; xmm4=rhead xmm5= temp for calc
|
|
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
|
; expdiff measures how close rhead - rtail is to |x|
|
|
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
|
|
shl rcx, 1 ; strip any sign bit
|
|
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
|
|
sub r9, rcx ; expdiff
|
|
|
|
|
|
;; if (expdiff > 15)
|
|
cmp r9, 15
|
|
jle Lcos_sse2_cw_reduction_done ; expdiff <= 15: first stage is enough
|
|
|
|
|
|
; Here the remainder is pretty small compared with x, which
|
|
; implies that x is a near multiple of pi/2
|
|
; (x matches the multiple to at least 15 bits)
|
|
; So we do another stage of argument reduction.
|
|
|
|
|
|
; t = rhead;
|
|
movapd xmm1, xmm4 ; xmm1 <-- t
|
|
|
|
|
|
; rtail = npi2 * piby2_2;
|
|
mulsd xmm3, xmm2 ; xmm3 <-- rtail = npi2 * piby2_2
|
|
|
|
|
|
; rhead = t - rtail;
|
|
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
|
subsd xmm4, xmm3 ; rhead
|
|
|
|
|
|
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
|
subsd xmm1, xmm4 ; t - rhead
|
|
subsd xmm1, xmm3 ; (t - rhead) - rtail
|
|
subsd xmm5, xmm1 ; new rtail
|
|
|
|
|
|
; r = rhead - rtail;
|
|
movapd xmm0, xmm4
|
|
|
|
|
|
;HARSHA
|
|
;xmm1=rtail
|
|
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
|
|
subsd xmm0, xmm5 ; xmm0 <-- r = rhead - rtail
|
|
|
|
; xmm0=r, xmm4=rhead, xmm1=rtail
|
|
; Inline reduction finished: xmm0 = r, xmm4 = rhead, xmm1 = rtail, eax = npi2.
; Only the correction term rr is computed here before falling into Lcos_piby4.
Lcos_sse2_cw_reduction_done:
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Historical note about inputs close to a pi/2 multiple.
|
|
; The original NAG code missed this trick.
|
|
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
|
|
; then the cos is either ~ 1.0 or ~r, to within 53 bits.
|
|
|
|
|
|
; NOTE: Unfortunately, this introduces two jcc instructions close to each
|
|
; other and to other branches. As r < 2^-13 should be rather uncommon,
|
|
; the problems for branch prediction outweigh the computational savings. - WAT
|
|
; (No such shortcut branch is present in the instructions below.)
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region = npi2 & 3; (stays in eax; the low bits are tested later)
|
|
subsd xmm4, xmm0 ; rhead-r
|
|
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
|
|
|
|
; Polynomial evaluation on the reduced argument.
; On entry: xmm0 = r (called x below), xmm4 = rr (called xx below), eax = region.
; Bit 0 of eax selects the kernel: clear -> Lcos_sse2_calc_cos, set -> the
; sin kernel that follows.
Lcos_piby4:
|
|
; perform taylor series to calc sinx or cosx
|
|
; x2 = r * r;
|
|
|
|
|
|
; xmm4 carries xx into both kernels (stored to p_temp in the sin kernel,
|
|
; multiplied by x in the cos kernel).
; NOTE(review): the copy into xmm3 below looks unused - xmm3 is reloaded in
; both the sin kernel and Lcos_sse2_calc_cos before it is read. Confirm.
|
|
movapd xmm3, xmm0
|
|
movapd xmm2, xmm0
|
|
mulsd xmm2, xmm0 ;x2
|
|
|
|
|
|
bt eax,0 ; CF <-- bit 0 of region
|
|
jnc Lcos_sse2_calc_cos ; even region: cos kernel
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region 1 or 3 do a sin calculation
; result = x + (xx + (x3*zs - 0.5*x2*xx)), with
; zs = s1 + x2*(s2 + x2*s3) + x6*(s4 + x2*(s5 + x2*s6))
|
|
movsd xmm3, __Lsinarray+50h ; s6
|
|
mulsd xmm3, xmm2 ; x2s6
|
|
movsd xmm5, __Lsinarray+20h ; s3
|
|
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
|
|
movapd xmm1, xmm2 ; move for x4
|
|
mulsd xmm1, xmm2 ; x4
|
|
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
|
|
mulsd xmm5, xmm2 ; x2s3
|
|
movapd xmm4, xmm0 ; move for x3
|
|
addsd xmm3, __Lsinarray+40h ; s5+x2s6
|
|
mulsd xmm1, xmm2 ; x6
|
|
mulsd xmm3, xmm2 ; x2(s5+x2s6)
|
|
mulsd xmm4, xmm2 ; x3
|
|
addsd xmm5, __Lsinarray+10h ; s2+x2s3
|
|
mulsd xmm5, xmm2 ; x2(s2+x2s3)
|
|
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
|
|
mulsd xmm2, L_one_half ; 0.5 *x2
|
|
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
|
|
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
|
|
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
|
|
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
|
|
addsd xmm3, xmm5 ; zs
|
|
mulsd xmm4, xmm3 ; x3*zs
|
|
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
|
|
addsd xmm0, xmm4 ; +xx
|
|
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
|
|
|
|
|
|
jmp Lcos_sse2_adjust_region
|
|
|
|
ALIGN 16
|
|
; Cos kernel for the reduced argument.
; On entry: xmm0 = x (reduced r), xmm2 = x2 = x*x, xmm4 = xx (rr).
; With r = 0.5*x2 and t = 1 - r, the result is assembled as
;   t + (((1 - t) - r) - x*xx + x4*zc)
; so that the rounding error of t is compensated.
Lcos_sse2_calc_cos:
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region 0 or 2 - do a cos calculation
|
|
; zc = c1 + x2*(c2 + x2*c3) + x6*(c4 + x2*(c5 + x2*c6))
|
|
mulsd xmm4, xmm0 ; x*xx
|
|
movsd xmm5, L_one_half
|
|
movsd xmm1, __Lcosarray+50h ; c6
|
|
movsd xmm0, __Lcosarray+20h ; c3
|
|
mulsd xmm5, xmm2 ; r = 0.5 *x2
|
|
movapd xmm3, xmm2 ; copy of x2
|
|
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
|
|
mulsd xmm1, xmm2 ; c6*x2
|
|
mulsd xmm0, xmm2 ; c3*x2
|
|
subsd xmm5, L_one ; -t=r-1.0, trash r
|
|
mulsd xmm3, xmm2 ; x4
|
|
addsd xmm1, __Lcosarray+40h ; c5+x2c6
|
|
addsd xmm0, __Lcosarray+10h ; c2+x2C3
|
|
addsd xmm5, L_one ; 1 + (-t), trash t
|
|
mulsd xmm3, xmm2 ; x6
|
|
mulsd xmm1, xmm2 ; x2(c5+x2c6)
|
|
mulsd xmm0, xmm2 ; x2(c2+x2C3)
|
|
movapd xmm4, xmm2 ; copy of x2
|
|
mulsd xmm4, L_one_half ; r recalculated (0.5*x2)
|
|
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
|
|
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
|
|
mulsd xmm2, xmm2 ; x4 recalculated (xmm3 now holds x6)
|
|
subsd xmm5, xmm4 ; (1 + (-t)) - r
|
|
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
|
|
addsd xmm0, xmm1 ; zc
|
|
subsd xmm4, L_one ; -t recalculated (r - 1.0)
|
|
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
|
|
mulsd xmm0, xmm2 ; x4 * zc
|
|
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
|
|
subsd xmm0, xmm4 ; result - (-t)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
; Sign fix-up for the SSE2 path. xmm0 holds the kernel result and eax the
; region. (eax + 1) & 2 is nonzero exactly when region mod 4 is 1 or 2.
Lcos_sse2_adjust_region:
|
|
; switch (region)
|
|
add eax, 1
|
|
and eax, 2
|
|
jz Lcos_sse2_cleanup ; region mod 4 is 0 or 3: keep the sign
|
|
|
|
|
|
;; If the original region was 1 or 2, negate the result.
|
|
movapd xmm2, xmm0
|
|
xorpd xmm0, xmm0
|
|
subsd xmm0, xmm2 ; xmm0 <-- 0.0 - result
|
|
|
|
|
|
ALIGN 16
|
|
; Common exit of the SSE2 reduction path.
Lcos_sse2_cleanup:
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ALIGN 16
|
|
; |x| < pi/4: no range reduction and no tail term.
; On entry: xmm0 = x, r10 = bits of |x|.
; Inputs below 2^-13 branch to Lcos_sse2_x_small.
Lcos_sse2_absx_lt_piby4:
|
|
; cos = cos_piby4(x, 0.0);
|
|
|
|
|
|
; x2 = r * r;
|
|
cmp r10, L_two_to_neg_13
|
|
jb Lcos_sse2_x_small ; |x| < 2^-13
|
|
movapd xmm2, xmm0
|
|
mulsd xmm2, xmm0 ; x2
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; region 0 - do a cos calculation
|
|
; sum = c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14, built from three pairs
|
|
movsd xmm1, __Lcosarray+10h ; c2
|
|
movapd xmm4, xmm2 ; move for x4
|
|
mulsd xmm4, xmm2 ; x4
|
|
movsd xmm3, __Lcosarray+30h ; c4
|
|
mulsd xmm1, xmm2 ; c2x2
|
|
movsd xmm5, __Lcosarray+50h ; c6
|
|
mulsd xmm3, xmm2 ; c4x2
|
|
movapd xmm0, xmm4 ; move for x8
|
|
mulsd xmm5, xmm2 ; c6x2
|
|
mulsd xmm0, xmm4 ; x8
|
|
addsd xmm1, __Lcosarray ; c1 + c2x2
|
|
mulsd xmm1, xmm4 ; c1x4 + c2x6
|
|
addsd xmm3, __Lcosarray+20h ; c3 + c4x2
|
|
mulsd xmm2, L_neg_one_half ; -0.5x2, destroy xmm2
|
|
addsd xmm5, __Lcosarray+40h ; c5 + c6x2
|
|
mulsd xmm3, xmm0 ; c3x8 + c4x10
|
|
mulsd xmm4, xmm0 ; x12
|
|
mulsd xmm4, xmm5 ; c5x12 + c6x14
|
|
|
|
|
|
movsd xmm0, L_one
|
|
addsd xmm1, xmm3 ; c1x4 + c2x6 + c3x8 + c4x10
|
|
movapd xmm3, xmm2 ; preserve -0.5x2
|
|
addsd xmm2, xmm0 ; t = 1 - 0.5x2
|
|
subsd xmm0, xmm2 ; 1-t
|
|
addsd xmm0, xmm3 ; (1-t) - r, where r = 0.5x2 (adds -0.5x2)
|
|
addsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
|
|
addsd xmm0, xmm1 ; (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
|
|
addsd xmm0, xmm2 ; 1 - 0.5x2 + above
|
|
|
|
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
; Reached from Lcos_sse2_absx_lt_piby4 when |x| < 2^-13.
; On entry: xmm0 = x, r10 = bits of |x|.
; Returns 1.0 - 0.5*x*x unless |x| < 2^-27, which goes to Lcos_sse2_x_smaller.
Lcos_sse2_x_small:
|
|
movsd xmm2, xmm0 ; xmm2 <-- x
|
|
movsd xmm0, L_one ; xmm0 <-- 1.0
|
|
cmp r10, L_two_to_neg_27
|
|
jb Lcos_sse2_x_smaller ; |x| < 2^-27
|
|
mulsd xmm2, xmm2 ; x*x
|
|
mulsd xmm2, L_one_half ; 0.5*x*x
|
|
subsd xmm0, xmm2 ; 1.0 - 0.5*x*x
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
; Reached from Lcos_sse2_x_small when |x| < 2^-27.
Lcos_sse2_x_smaller:
|
|
; NOTE(review): xmm0 was already loaded with L_one in Lcos_sse2_x_small, the
; only visible branch to this label, so this reload looks redundant.
movsd xmm0, L_one
|
|
addsd xmm0, L_int_one ; really adding smallest subnormal; set inexact
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
; SSE2 path for |x| >= L_small_arg_cw (including NaN and Inf).
; On entry: xmm0 = |x| (set at Lcos_absx_nlt_piby4), r10 = bits of |x|.
Lcos_reduce_precise:
|
|
; Reduce x into range [-pi/4, pi/4]
|
|
cmp r10, L__inf_mask_64
|
|
; NOTE(review): xmm0 holds |x| here, so fname_special is entered with the
; sign of x removed on this path - confirm that the sign does not matter to it.
jae Lcos_x_naninf ; exponent field all ones: Inf or NaN
|
|
call __remainder_piby2_forAsm
|
|
|
|
|
|
; At this point xmm0 has r, xmm1 has rr, rax has region
|
|
|
|
|
|
movapd xmm4, xmm1 ; xmm4 <-- rr
|
|
jmp Lcos_piby4
|
|
|
|
|
|
; State handed to Lcos_piby4: xmm0 = x (r), xmm4 = xx (rr), eax = region
|
|
|
|
|
|
ALIGN 16
|
|
; NaN/Inf handling shared by the SSE2 and FMA3 paths: the external routine
; fname_special (_cos_special) is called and whatever it leaves in xmm0 is
; returned unchanged.
Lcos_x_naninf:
|
|
call fname_special
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
; From this point we assume that FMA3 and AVX hardware are present.
|
|
|
|
ALIGN 16
|
|
; FMA3 entry. Classifies |x| by integer compares on its bit pattern:
;   |x| >  pi/4            -> Lcos_early_exit_s (NaN/Inf check, then reduction)
;   2^-13 <= |x| <= pi/4   -> Lcompute_cos_pyby_4
;   2^-27 <= |x| <  2^-13  -> Lcompute_1_xx_5
;   |x| <  2^-27           -> return 1.0
; Signed compares are used; r9 has its sign bit cleared first.
L_cos_fma3:
|
|
vmovq r9,xmm0 ; r9 <-- bit pattern of x
|
|
mov rax,r9 ; rax keeps the original bits for Lcos_early_exit_s
|
|
and r9,L_sign_mask ; clear sign
|
|
|
|
|
|
Lcos_early_exit_s_1: ;; unused label
|
|
cmp r9,L_piby4
|
|
jg Lcos_early_exit_s ; Note that NaN will branch
|
|
cmp r9,L_two_to_neg_13
|
|
jge Lcompute_cos_pyby_4
|
|
cmp r9,L_two_to_neg_27
|
|
jge Lcompute_1_xx_5
|
|
vmovq xmm0,L_one ; for tiniest args, cos is 1
|
|
jmp Lreturn_no_restore
|
|
|
|
; FMA3 path for 2^-27 <= |x| < 2^-13: result is 1.0 - 0.5*x*x.
; On entry xmm0 = x.
Lcompute_1_xx_5:
|
|
vmulsd xmm1,xmm0,L_one_half ; xmm1l <-- .5*x
|
|
vfnmadd213sd xmm0,xmm1,L_one ; xmm0l <-- 1.0 - (.5*x)*x
|
|
jmp Lreturn_no_restore
|
|
|
|
; FMA3 path for 2^-13 <= |x| <= pi/4 (no range reduction, no tail term).
; On entry xmm0 = x. Evaluates, by Horner's rule in x2 = x*x,
;   1 + x2*(-0.5 + x2*(c1 + x2*(c2 + x2*(c3 + x2*(c4 + x2*(c5 + x2*c6))))))
; Coefficient names follow the SSE2 path of this file, where
; __Lcosarray+0 is labelled c1 and __Lcosarray+50h is labelled c6.
; Note: "x2" here is x*x; elsewhere in this file "xx" names the tail rr.
Lcompute_cos_pyby_4:
|
|
; make sure this is accurate enough
|
|
; note that x^2 can't be all that close to 1 here
|
|
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x*x
|
|
vmovapd xmm0,__Lcosarray+050h ; xmm0 <-- c6 (low lane is what is used)
|
|
vfmadd213sd xmm0,xmm3,__Lcosarray+040h ; xmm0 <-- c6*x2 + c5
|
|
vfmadd213sd xmm0,xmm3,__Lcosarray+030h ; xmm0 <-- (c6*x2 + c5)*x2 + c4
|
|
vfmadd213sd xmm0,xmm3,__Lcosarray+020h ; xmm0 <-- xmm0*x2 + c3
|
|
vfmadd213sd xmm0,xmm3,__Lcosarray+010h ; xmm0 <-- xmm0*x2 + c2
|
|
vfmadd213sd xmm0,xmm3,__Lcosarray ; xmm0 <-- xmm0*x2 + c1
|
|
vfmsub213sd xmm0,xmm3,L_one_half ; xmm0 <-- xmm0*x2 - 0.5
|
|
vfmadd213sd xmm0,xmm3,L_one ; xmm0 <-- xmm0*x2 + 1.0
|
|
|
|
|
|
jmp Lreturn_no_restore
|
|
|
|
; FMA3 path, |x| > pi/4. rax still holds the original bits of x.
; If all exponent bits are set, x is Inf or NaN and goes to Lcos_x_naninf;
; otherwise execution falls through to Lrange_reduce.
Lcos_early_exit_s:
|
|
mov r8,L__inf_mask_64 ; r8 <-- exponent mask (bit pattern of +Inf)
|
|
and rax,r8 ; isolate the exponent field of x
|
|
cmp rax, r8
|
|
jz Lcos_x_naninf ; exponent all ones: Inf or NaN
|
|
|
|
; FMA3 range reduction for finite |x| > pi/4. r9 holds the bits of |x|.
Lrange_reduce:
|
|
vmovq xmm0,r9 ; xmm0 <-- |x|
|
|
cmp r9,L_small_arg_bdl ; threshold constant is 2e7
|
|
jae Lcos_remainder_piby2 ; |x| >= threshold: general reduction
|
|
|
|
|
|
; For __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
|
|
; on input
|
|
; x is in xmm0
|
|
; on output
|
|
; r is in xmm0
|
|
; rr is in xmm1
|
|
; region is in rax
|
|
|
|
|
|
; Boldo-Daumas-Li reduction for reasonably small |x|
|
|
call __remainder_piby2_fma3_bdl
|
|
|
|
;; After reduction: xmm0 = r, xmm1 = rr, rax = region.
|
|
;; Even region (0 or 2): cos kernel. Odd region (1 or 3): sin kernel.
|
|
Lcos_exit_s:
|
|
bt rax,0 ; CF <-- bit 0 of region
|
|
jc Lsin_piby4_compute ; odd region
|
|
|
|
; FMA3 cos kernel. On entry xmm0 = x (reduced r), xmm1 = xx (tail rr).
; With r = 0.5*x2 and t = 1 - r the result is
;   t + (x4*zc + (((1 - t) - r) - x*xx))
;   zc = c1 + x2*(c2 + x2*(c3 + x2*(c4 + x2*(c5 + x2*c6))))
; (c1..c6 named as in the SSE2 path: __Lcosarray+0 .. __Lcosarray+50h).
Lcos_piby4_compute: ;; unused label
|
|
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
|
|
vmovapd xmm2,L_one ; xmm2 <-- 1.0
|
|
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
|
|
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
|
|
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
|
|
vsubsd xmm2,xmm2,xmm4 ; 1-t
|
|
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
|
|
vmovapd xmm5,__Lcosarray+040h ; xmm5 <-- c5
|
|
vfnmadd231sd xmm2,xmm0,xmm1 ; xmm2 <-- ((1.0 - t) - r) - x * xx
|
|
vmulsd xmm1,xmm3,xmm3 ; xmm1 <-- x4 = x2 * x2 (xx no longer needed)
|
|
vfmadd231sd xmm5,xmm3,__Lcosarray+050h ; xmm5 <-- c5 + x2*c6
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray+030h ; xmm5 <-- xmm5*x2 + c4
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray+020h ; xmm5 <-- xmm5*x2 + c3
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray+010h ; xmm5 <-- xmm5*x2 + c2
|
|
vfmadd213sd xmm5,xmm3,__Lcosarray ; xmm5 <-- zc = xmm5*x2 + c1
|
|
vfmadd213sd xmm5,xmm1,xmm2 ; xmm5 <-- x4*zc + (((1-t) - r) - x*xx)
|
|
vaddsd xmm0,xmm5,xmm4 ; xmm0 <-- xmm5 + t
|
|
|
|
|
|
jmp Lcos_exit_s_1
|
|
|
|
ALIGN 16
|
|
; FMA3 sin kernel. On entry xmm0 = x (reduced r), xmm1 = xx (tail rr).
; Result: x - ((x2*(.5*xx - x3*p) - xx) - s1*x3)
;   p = s2 + x2*(s3 + x2*(s4 + x2*(s5 + x2*s6)))
; (s1..s6 named as in the SSE2 path: __Lsinarray+0 .. __Lsinarray+50h).
; The original comments call p "r"; it is the polynomial value, not the
; reduced argument.
Lsin_piby4_compute:
|
|
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
|
|
vmovapd xmm5,__Lsinarray+040h ; xmm5 <-- s5
|
|
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
|
|
vfmadd231sd xmm5,xmm3,__Lsinarray+050h ; xmm5 <-- s5 + x2*s6
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+030h ; xmm5 <-- xmm5*x2 + s4
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+020h ; xmm5 <-- xmm5*x2 + s3
|
|
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- p = xmm5*x2 + s2
|
|
|
|
|
|
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
|
|
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x3 * p
|
|
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*xx (xmm1 is the tail xx, not x)
|
|
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*xx - x3*p
|
|
vmulsd xmm2,xmm3,xmm2 ; xmm2 <-- x2*(.5*xx - x3*p)
|
|
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- x2*(.5*xx - x3*p) - xx
|
|
vfnmadd231sd xmm2, xmm4,__Lsinarray ; xmm2 <-- xmm2 - s1*x3
|
|
vsubsd xmm0,xmm0,xmm2 ; xmm0 <-- x - xmm2
|
|
|
|
; Sign fix-up for the FMA3 path. xmm0 = kernel result, eax = region.
; (eax + 1) & 2 is nonzero exactly when region mod 4 is 1 or 2; in that case
; the sign bit of the result is flipped, otherwise it is XORed with zero.
Lcos_exit_s_1:
|
|
xor r8,r8 ; r8 <-- 0 (default: no sign change)
|
|
add eax, 1
|
|
and eax, 2 ; ZF clear when the result must be negated
|
|
cmovnz r8, L_signbit ; r8 <-- 8000000000000000h in that case
|
|
vmovq xmm3,r8
|
|
vxorpd xmm0,xmm0,xmm3 ; apply the selected sign mask
|
|
|
|
; Exits of the FMA3 path. Both labels run the same StackDeallocate + ret
; sequence; neither restores any additional register.
Lreturn_restore_regs:
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
; Target of the FMA3 fast paths that skip range reduction.
Lreturn_no_restore:
|
|
StackDeallocate stack_size
|
|
ret
|
|
|
|
ALIGN 16
|
|
; FMA3 path for |x| >= L_small_arg_bdl. xmm0 = |x| on entry (set at
; Lrange_reduce). The same kernels are used afterwards via Lcos_exit_s.
Lcos_remainder_piby2:
|
|
; argument reduction for general x
|
|
call __remainder_piby2_fma3 ; returns r in xmm0, rr in xmm1, region in rax
|
|
jmp Lcos_exit_s
|
|
|
|
|
|
fname endp
|
|
END
|