reactos/sdk/lib/crt/math/libm_sse2/sinf.asm
2022-12-01 15:21:59 +02:00

665 lines
23 KiB
NASM

;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the sinf function.
;
; Prototype
;
; float sinf(float x);
;
; Computes sinf(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
.const
ALIGN 16

; ---------------------------------------------------------------------------
; Read-only constant pool.
; Every entry down to L_mask_3f8 occupies 16 bytes (the same 64-bit pattern
; stored twice, or four identical dwords for L_inf_mask_32), so each label
; stays 16-byte aligned and can be used as a 128-bit memory operand.
; ---------------------------------------------------------------------------
L_signbit       DQ 08000000000000000h, 08000000000000000h   ; sign bit of a double
L_sign_mask     DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh   ; everything but the sign bit
L_one           DQ 03FF0000000000000h, 03FF0000000000000h   ; 1.0
L_int_three     DQ 00000000000000003h, 00000000000000003h   ; integer 3
L_one_half      DQ 03FE0000000000000h, 03FE0000000000000h   ; 0.5
L_twobypi       DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h   ; 2/pi
L_piby2_1       DQ 03FF921FB54400000h, 03FF921FB54400000h   ; pi/2, leading part
L_one_sixth     DQ 03FC5555555555555h, 03FC5555555555555h   ; 1/6
L_piby2_1tail   DQ 03DD0B4611A626331h, 03DD0B4611A626331h   ; pi/2 - piby2_1
L_piby2_2       DQ 03dd0b4611a600000h, 03dd0b4611a600000h   ; pi/2, second part
L_piby2_2tail   DQ 03ba3198a2e037073h, 03ba3198a2e037073h   ; pi/2 - piby2_1 - piby2_2
L_inf_mask_32   DD 07F800000h, 07F800000h, 07F800000h, 07F800000h ; float exponent mask
L_int_two       DQ 00000000000000002h, 00000000000000002h   ; integer 2
L_piby2_lead    DQ 03ff921fb54442d18h, 03ff921fb54442d18h   ; pi/2
L_piby4         DQ 03fe921fb54442d18h, 03fe921fb54442d18h   ; pi/4
L_mask_3f2      DQ 03f20000000000000h, 03f20000000000000h   ; 2^-13
L_mask_3f8      DQ 03f80000000000000h, 03f80000000000000h   ; 2^-7

; Thresholds above which the large-argument reduction is used.
; Do these really need to be different?
L_large_x_fma3  DQ 04170008AC0000000h       ; 16779436
L_large_x_sse2  DQ 0416E848000000000h       ; 16000000

; ---------------------------------------------------------------------------
; Data defined in other modules.
; ---------------------------------------------------------------------------
EXTRN __Lcosfarray:QWORD
EXTRN __Lsinfarray:QWORD
EXTRN __use_fma3_lib:DWORD
EXTRN __L_2_by_pi_bits:BYTE

; ---------------------------------------------------------------------------
; Offsets of local variables, relative to rsp after the prologue.
; NOTE(review): under the Microsoft x64 convention the first 020h bytes above
; rsp are the home area that a callee is allowed to overwrite, so p_temp and
; p_temp1 are not guaranteed to survive a call -- confirm against fm.inc.
; ---------------------------------------------------------------------------
p_temp          EQU 010h        ; temporary for get/put bits operation
p_temp1         EQU 018h        ; temporary for get/put bits operation
region          EQU 020h        ; pointer to region for remainder_piby2
r               EQU 028h        ; pointer to r for remainder_piby2
dummy_space     EQU 040h
stack_size      EQU 058h

include fm.inc

fname           TEXTEQU <sinf>
fname_special   TEXTEQU <_sinf_special>

; Name of this routine and the external functions it calls.
EXTRN __remainder_piby2d2f_forC : PROC      ; NEAR
EXTERN fname_special : PROC
.code
ALIGN 16
PUBLIC fname

; Stack slot used to keep the "x was negative" flag alive across the call to
; __remainder_piby2d2f_forC.  It must sit above the 32-byte home area
; ([rsp+00h .. rsp+1Fh]) because the Microsoft x64 calling convention lets the
; callee overwrite that area; 030h is free (r occupies 028h..02Fh) and lies
; below stack_size.
p_xneg_save EQU 030h

;-----------------------------------------------------------------------
; float sinf(float x)
;
; In:   xmm0 = x (single precision)
; Out:  xmm0 = sinf(x) (single precision)
; ABI:  Microsoft x64 (arguments to helpers are passed in rcx, rdx, r8).
; Uses: rax, rcx, rdx, r8-r11, xmm0-xmm5, flags.
; Stack: stack_size bytes reserved through the StackAllocate macro
;        (defined in fm.inc, not visible here).
;
; Two implementations are selected at run time by __use_fma3_lib:
;   * Lsinf_sse2 - plain SSE2 code
;   * Lsinf_fma3 - AVX/FMA3 code
; Both promote x to double, reduce it to r in [-pi/4, pi/4] together with
; a quadrant number ("region"), evaluate a sine or cosine polynomial in r
; and finally fix up the sign.  NaN/Inf inputs are forwarded to
; fname_special.
;-----------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    cmp     DWORD PTR __use_fma3_lib, 0
    jne     Lsinf_fma3

;=======================================================================
; SSE2 implementation
;   rdx  = ux (bits of x as double)      r10 = ax = ux with sign cleared
;   r11d = xneg (1 if x < 0)             eax = npi2 / region
;   r8d  = 1 (mask to test the low region bit)
;=======================================================================
Lsinf_sse2:
    xorpd   xmm2, xmm2                      ; zeroed out for later use (forming -x)
    ;; if NaN or inf
    movd    edx, xmm0
    mov     eax, 07f800000h
    mov     r10d, eax
    and     r10d, edx
    cmp     r10d, eax                       ; all float exponent bits set?
    jz      Lsinf_sse2_naninf
    ; GET_BITS_DP64(x, ux);
    ; get the input value to an integer register.
    cvtss2sd xmm0, xmm0                     ; convert input to double.
    movd    rdx, xmm0                       ; rdx is ux
    ; ax = (ux & ~SIGNBIT_DP64);
    mov     r10, rdx
    btr     r10, 63                         ; r10 is ax
    mov     r8d, 1                          ; for determining region later on
    ;; if (ax <= 0x3fe921fb54442d18) abs(x) <= pi/4
    mov     rax, 03fe921fb54442d18h
    cmp     r10, rax
    jg      Lsinf_absx_gt_piby4
    ;; if (ax < 0x3f80000000000000) abs(x) < 2.0^(-7)
    mov     rax, 3f80000000000000h
    cmp     r10, rax
    jge     Lsinf_sse2_small
    ;; if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13)
    mov     rax, 3f20000000000000h
    cmp     r10, rax
    jge     Lsinf_sse2_smaller
    ; sinf = x;
    jmp     Lsinf_sse2_cleanup

ALIGN 16
Lsinf_sse2_smaller:
    ; 2^-13 <= |x| < 2^-7
    ; sinf = x - x^3 * 0.1666666666666666666;
    movsd   xmm2, xmm0
    movsd   xmm4, QWORD PTR L_one_sixth     ; 0.1666666666666666666
    mulsd   xmm2, xmm2                      ; x^2
    mulsd   xmm2, xmm0                      ; x^3
    mulsd   xmm2, xmm4                      ; x^3 * 0.1666666666666666666
    subsd   xmm0, xmm2                      ; x - x^3 * 0.1666666666666666666
    jmp     Lsinf_sse2_cleanup

ALIGN 16
Lsinf_sse2_small:
    ; 2^-7 <= |x| <= pi/4: no reduction needed, evaluate the sine polynomial
    movsd   xmm2, xmm0                      ; x2 = r * r;
    mulsd   xmm2, xmm0                      ; x2
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; region 0 or 2 - do a sinf calculation
    ; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
    movsd   xmm1, QWORD PTR __Lsinfarray+18h    ; s4
    mulsd   xmm1, xmm2                      ; s4x2
    movsd   xmm4, xmm2                      ; move for x4
    movsd   xmm5, QWORD PTR __Lsinfarray+8h     ; s2
    mulsd   xmm4, xmm2                      ; x4
    movsd   xmm3, xmm0                      ; move for x3
    mulsd   xmm5, xmm2                      ; s2x2
    mulsd   xmm3, xmm2                      ; x3
    addsd   xmm1, QWORD PTR __Lsinfarray+10h    ; s3+s4x2
    mulsd   xmm1, xmm4                      ; s3x4+s4x6
    addsd   xmm5, QWORD PTR __Lsinfarray    ; s1+s2x2
    addsd   xmm1, xmm5                      ; s1+s2x2+s3x4+s4x6
    mulsd   xmm1, xmm3                      ; x3(s1+s2x2+s3x4+s4x6)
    addsd   xmm0, xmm1                      ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp     Lsinf_sse2_cleanup

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsinf_absx_gt_piby4:
    ; xneg = (ax != ux);
    cmp     rdx, r10
    mov     r11d, 0                         ; mov (not xor) so the flags from cmp survive
    ;; if (xneg) x = -x;
    jz      Lsinf_sse2_reduce_moderate
    mov     r11d, 1
    subsd   xmm2, xmm0                      ; xmm2 was 0.0, now -x
    movsd   xmm0, xmm2

Lsinf_sse2_reduce_moderate:
    ;; if (x < 1.6e7)   (threshold is L_large_x_sse2)
    cmp     r10, QWORD PTR L_large_x_sse2
    jae     Lsinf_sse2_reduce_large
    ; reduce the argument to be in a range from -pi/4 to +pi/4
    ; by subtracting multiples of pi/2
    movsd   xmm2, xmm0
    movsd   xmm3, QWORD PTR L_twobypi
    movsd   xmm4, xmm0
    movsd   xmm5, QWORD PTR L_one_half      ; .5
    mulsd   xmm2, xmm3
    ;/* How many pi/2 is x a multiple of? */
    ; xexp = ax >> EXPSHIFTBITS_DP64;
    mov     r9, r10
    shr     r9, 52                          ; >>EXPSHIFTBITS_DP64
    ; npi2 = (int)(x * twobypi + 0.5);
    addsd   xmm2, xmm5                      ; npi2
    movsd   xmm3, QWORD PTR L_piby2_1
    cvttpd2dq xmm0, xmm2                    ; convert to integer
    movsd   xmm1, QWORD PTR L_piby2_1tail
    cvtdq2pd xmm2, xmm0                     ; and back to double.
    ; /* Subtract the multiple from x to get an extra-precision remainder */
    ; rhead = x - npi2 * piby2_1;
    mulsd   xmm3, xmm2
    subsd   xmm4, xmm3                      ; rhead
    ; rtail = npi2 * piby2_1tail;
    mulsd   xmm1, xmm2
    movd    eax, xmm0                       ; eax = npi2 (low two bits give the region)
    ; GET_BITS_DP64(rhead-rtail, uy);
    ; originally only rhead
    movsd   xmm0, xmm4
    subsd   xmm0, xmm1
    movsd   xmm3, QWORD PTR L_piby2_2
    movd    rcx, xmm0
    movsd   xmm5, QWORD PTR L_piby2_2tail
    ; xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
    ; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    shl     rcx, 1                          ; strip any sign bit
    shr     rcx, 53                         ; >> EXPSHIFTBITS_DP64 +1
    sub     r9, rcx                         ; expdiff
    ;; if (expdiff > 15)
    cmp     r9, 15
    jle     Lsinf_sse2_expdiff_le_15
    ; The remainder is pretty small compared with x, which
    ; implies that x is a near multiple of pi/2
    ; (x matches the multiple to at least 15 bits)
    ; t = rhead;
    movsd   xmm1, xmm4
    ; rtail = npi2 * piby2_2;
    mulsd   xmm3, xmm2
    ; rhead = t - rtail;
    mulsd   xmm5, xmm2                      ; npi2 * piby2_2tail
    subsd   xmm4, xmm3                      ; rhead
    ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
    subsd   xmm1, xmm4                      ; t - rhead
    subsd   xmm1, xmm3                      ; -rtail
    subsd   xmm5, xmm1                      ; rtail
    ; r = rhead - rtail;
    movsd   xmm0, xmm4
    ; xmm1 = rtail
    movsd   xmm1, xmm5
    subsd   xmm0, xmm5
    ; xmm0=r, xmm4=rhead, xmm1=rtail

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsinf_sse2_expdiff_le_15:
    ; rcx still holds the biased exponent of the first-step remainder
    cmp     rcx, 03f2h                      ; is r < 2^-13 ?
    jge     Lsinf_sse2_calc_sincosf_piby4   ; use taylor series if not
    cmp     rcx, 03deh                      ; if r really small.
    jle     Lsinf_sse2_r_very_small         ; then sinf(r) ~ r or 1
    movsd   xmm2, xmm0
    mulsd   xmm2, xmm0                      ; xmm2 <-- r^2
    ;; if region is 0 or 2 do a sinf calc.
    and     r8d, eax
    jnz     Lsinf_sse2_small_calc_sin
    ; region 0 or 2 do a sinf calculation
    ; use simply polynomial
    ; x - x*x*x*0.166666666666666666;
    movsd   xmm3, QWORD PTR L_one_sixth
    mulsd   xmm3, xmm0                      ; * x
    mulsd   xmm3, xmm2                      ; * x^2
    subsd   xmm0, xmm3                      ; xs
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_small_calc_sin:
    ; region 1 or 3 do a cosf calculation
    ; use simply polynomial
    ; 1.0 - x*x*0.5;
    movsd   xmm0, QWORD PTR L_one           ; 1.0
    mulsd   xmm2, QWORD PTR L_one_half      ; 0.5 *x^2
    subsd   xmm0, xmm2                      ; xc
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_r_very_small:
    ;; if region is 0 or 2 do a sinf calc. (sinf ~ x)
    and     r8d, eax
    jz      Lsinf_sse2_adjust_region
    movsd   xmm0, QWORD PTR L_one           ; cosf(r) is a 1
    jmp     Lsinf_sse2_adjust_region

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsinf_sse2_reduce_large:
    ; Reduce x into range [-pi/4, pi/4]
    ; __remainder_piby2d2f_forC(x, &r, &region);
    ; r11 is volatile, so xneg is parked on the stack.  The slot must be
    ; outside the 32-byte home area, which belongs to the callee.
    mov     QWORD PTR p_xneg_save[rsp], r11
    lea     rdx, QWORD PTR r[rsp]           ; arg 2: &r
    lea     r8, QWORD PTR region[rsp]       ; arg 3: &region
    movd    rcx, xmm0                       ; arg 1: bits of |x|
    call    __remainder_piby2d2f_forC
    mov     r11, QWORD PTR p_xneg_save[rsp]
    mov     r8d, 1                          ; for determining region later on
    ; The reduced argument must be in xmm0 for the polynomial code below;
    ; xmm0 is volatile across the call, so reload it from the stack.
    movsd   xmm0, QWORD PTR r[rsp]          ; r
    mov     eax, DWORD PTR region[rsp]      ; region
    ; xmm0 = r, r8d = 1, eax = region, r11d = xneg

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; perform taylor series to calc sinfx, cosfx
Lsinf_sse2_calc_sincosf_piby4:
    ; x2 = r * r;
    movsd   xmm2, xmm0
    mulsd   xmm2, xmm0                      ; x2
    ;; if region is 1 or 3, do a cosf calc.
    and     r8d, eax
    jnz     Lsinf_sse2_do_cosf_calc
    ; region is 0 or 2: do a sinf calc.
    ; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
Lsinf_sse2_do_sinf_calc:
    movsd   xmm1, QWORD PTR __Lsinfarray+18h    ; s4
    mulsd   xmm1, xmm2                      ; s4x2
    movsd   xmm4, xmm2                      ; move for x4
    mulsd   xmm4, xmm2                      ; x4
    movsd   xmm5, QWORD PTR __Lsinfarray+8h     ; s2
    mulsd   xmm5, xmm2                      ; s2x2
    movsd   xmm3, xmm0                      ; move for x3
    mulsd   xmm3, xmm2                      ; x3
    addsd   xmm1, QWORD PTR __Lsinfarray+10h    ; s3+s4x2
    mulsd   xmm1, xmm4                      ; s3x4+s4x6
    addsd   xmm5, QWORD PTR __Lsinfarray    ; s1+s2x2
    addsd   xmm1, xmm5                      ; s1+s2x2+s3x4+s4x6
    mulsd   xmm1, xmm3                      ; x3(s1+s2x2+s3x4+s4x6)
    addsd   xmm0, xmm1                      ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_do_cosf_calc:
    ; region 1 or 3 - do a cosf calculation
    ; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
    ; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
    movsd   xmm1, QWORD PTR __Lcosfarray+20h    ; c4
    movsd   xmm4, xmm2                      ; move for x4
    mulsd   xmm1, xmm2                      ; c4x2
    movsd   xmm3, QWORD PTR __Lcosfarray+10h    ; c2
    mulsd   xmm4, xmm2                      ; x4
    movsd   xmm0, QWORD PTR __Lcosfarray    ; c0
    mulsd   xmm3, xmm2                      ; c2x2
    mulsd   xmm0, xmm2                      ; c0x2 (=-0.5x2)
    addsd   xmm1, QWORD PTR __Lcosfarray+18h    ; c3+c4x2
    mulsd   xmm1, xmm4                      ; c3x4 + c4x6
    addsd   xmm3, QWORD PTR __Lcosfarray+8h     ; c1+c2x2
    addsd   xmm1, xmm3                      ; c1 + c2x2 + c3x4 + c4x6
    mulsd   xmm1, xmm4                      ; c1x4 + c2x6 + c3x8 + c4x10
    addsd   xmm0, QWORD PTR L_one           ; 1 - 0.5x2
    addsd   xmm0, xmm1                      ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsinf_sse2_adjust_region:
    ; positive or negative
    ; switch (region)
    ; eax bit 0 <-- NOT(region bit 1 XOR xneg); when it is set the sign is
    ; already correct and the negation below is skipped.
    shr     eax, 1
    mov     ecx, eax
    and     eax, r11d
    not     ecx
    not     r11d
    and     ecx, r11d
    or      eax, ecx
    and     eax, 1
    jnz     Lsinf_sse2_cleanup
    ;; if the original region 0, 1 and arg is negative, then we negate the result.
    ;; if the original region 2, 3 and arg is positive, then we negate the result.
    movsd   xmm2, xmm0
    xorpd   xmm0, xmm0
    subsd   xmm0, xmm2

Lsinf_sse2_cleanup:
    cvtsd2ss xmm0, xmm0                     ; round the double result back to float
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_sse2_naninf:
    ; xmm0 still holds the original float argument
    call    fname_special
    StackDeallocate stack_size
    ret

;=======================================================================
; FMA3 implementation
;   xmm5 = x converted to double (kept until the final sign fix-up)
;   xmm4 = region (low 64-bit lane)
;=======================================================================
ALIGN 16
Lsinf_fma3:
    vmovd   eax,xmm0
    mov     r8d,L_inf_mask_32
    and     eax,r8d
    cmp     eax, r8d                        ; all float exponent bits set?
    jz      Lsinf_fma3_naninf
    vcvtss2sd xmm5,xmm0,xmm0
    vmovq   r9,xmm5
    btr     r9,63                           ; r9 <-- |x|
    cmp     r9,L_piby4
    jg      Lsinf_fma3_range_reduce
    cmp     r9,L_mask_3f8
    jge     Lsinf_fma3_compute_sinf_piby_4
    cmp     r9,L_mask_3f2
    jge     Lsinf_fma3_compute_x_xxx_0_1666
    ; Here |x| < 2^-13; just return sin x ~ x
    ; (xmm0 still holds the original float; the double copy went to xmm5)
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_fma3_compute_x_xxx_0_1666:
    ; Here 2^-13 <= |x| < 2^-7; return sin x ~ x - 1/6 x^3
    vmulsd  xmm1,xmm5,xmm5                  ; xmm1 <-- x^2
    vmulsd  xmm0,xmm1,xmm5                  ; xmm0 <-- x^3
    vfnmadd132sd xmm0,xmm5,L_one_sixth      ; x - x*x*x*0.166666666666666666
    jmp     Lsinf_fma3_return_sinf_s

ALIGN 16
Lsinf_fma3_compute_sinf_piby_4:
    ; 2^-7 <= |x| <= pi/4: sine polynomial, no reduction
    vmovapd xmm0,xmm5
    vmovsd  xmm1,__Lsinfarray+010h          ; xmm1 <-- s3
    vmulsd  xmm3,xmm0,xmm0                  ; xmm3 <-- x^2
    vfmadd231sd xmm1,xmm3,__Lsinfarray+018h ; xmm1 <-- s4 x^2 + s3
    vfmadd213sd xmm1,xmm3,__Lsinfarray+08h  ; xmm1 <-- (s4 x^2 + s3)x^2 + s2
    vfmadd213sd xmm1,xmm3,__Lsinfarray      ; xmm1 <-- ((s4 x^2 + s3)x^2 + s2)x^2 + s1
    vmulsd  xmm3,xmm0,xmm3                  ; xmm3 <-- x^3
    vfmadd231sd xmm0,xmm1,xmm3              ; xmm0 <-- x + x^3 * poly
    jmp     Lsinf_fma3_return_sinf_s

ALIGN 16
Lsinf_fma3_range_reduce:
    vmovq   xmm0,r9                         ; xmm0 <-- |x|
    cmp     r9,L_large_x_fma3
    jge     Lsinf_fma3_reduce_large

Lsinf_fma3_sinf_reduce_moderate:
    vandpd  xmm1,xmm0,L_sign_mask           ; xmm1 <-- |x| (xmm0 is already |x|; a plain move would suffice)
    vmovapd xmm2,L_twobypi
    vfmadd213sd xmm2,xmm1,L_one_half        ; xmm2 <-- |x| * 2/pi + 0.5
    vcvttpd2dq xmm2,xmm2                    ; truncate to int32 (npi2)
    vpmovsxdq xmm1,xmm2                     ; widen to int64
    vandpd  xmm4,xmm1,L_int_three           ; xmm4 <-- region
    vshufps xmm1 ,xmm1,xmm1,8               ; repack the low dwords for the conversion below
    vcvtdq2pd xmm1,xmm1                     ; xmm1 <-- (double)npi2
    vmovdqa xmm2,xmm0
    vfnmadd231sd xmm2,xmm1,L_piby2_1        ; xmm2 <-- rhead
    vmulsd  xmm3,xmm1,L_piby2_1tail         ; xmm3 <-- rtail
    vsubsd  xmm0,xmm2,xmm3                  ; xmm0 <-- r_1
    vsubsd  xmm2,xmm2,xmm0
    vsubsd  xmm1,xmm2,xmm3                  ; xmm1 <-- rr_1 (overwritten below before any use)
    jmp     Lsinf_fma3_exit_s

ALIGN 16
Lsinf_fma3_reduce_large:
    ; Large-argument reduction: multiply the mantissa of |x| by a window
    ; of the bits of 2/pi (table __L_2_by_pi_bits) using integer arithmetic.
    lea     r9,__L_2_by_pi_bits
    ;xexp = (x >> 52) - 1023
    vmovq   r11,xmm0
    mov     rcx,r11
    shr     r11,52
    sub     r11,1023                        ; r11 <-- xexp = exponent of input x
    ;calculate the last byte from which to start multiplication
    ;last = 134 - (xexp >> 3)
    mov     r10,r11
    shr     r10,3
    sub     r10,134                         ; r10 = (xexp >> 3) - 134 = -last
    neg     r10                             ; r10 = last
    ;load 64 bits of 2_by_pi
    mov     rax,[r9+r10]
    ;mantissa of x = ((x << 12) >> 12) | implied bit
    shl     rcx,12
    shr     rcx,12                          ;rcx = mantissa part of input x
    bts     rcx,52                          ;add the implied bit as well
    ;load next 128 bits of 2_by_pi
    add     r10,8                           ;increment to next 8 bytes of 2_by_pi
    vmovdqu xmm0,XMMWORD PTR[r9+r10]
    ;do three 64bit multiplications with mant of x
    mul     rcx
    mov     r8,rax                          ; r8 <-- last 64 bits of mul = res1[2]
    mov     r10,rdx                         ; r10 <-- carry
    vmovq   rax,xmm0
    mul     rcx
    ;resexp = xexp & 7
    and     r11,7                           ; r11 <-- resexp = last 3 bits
    vpsrldq xmm0,xmm0,8                     ; bring the upper 64 table bits down (VEX form, like the rest of this path)
    add     rax,r10                         ; add the previous carry
    adc     rdx,0
    mov     r9,rax                          ; r9 <-- next 64 bits of mul = res1[1]
    mov     r10,rdx                         ; r10 <-- carry
    vmovq   rax,xmm0
    mul     rcx
    add     r10,rax                         ; r10 = most sig 64 bits = res1[0]
    ;find the region
    ;last three bits ltb = most sig bits >> (54 - resexp))
    ; decimal point in last 18 bits == 8 lsb's in first 64 bits
    ; and 8 msb's in next 64 bits
    ;point_five = ltb & 01h;
    ;region = ((ltb >> 1) + point_five) & 3;
    mov     rcx,54
    mov     rax,r10
    sub     rcx,r11
    xor     rdx,rdx                         ;rdx = sign of x(i.e first part of x * 2bypi)
    shr     rax,cl                          ; CF <-- point_five (last bit shifted out)
    jnc     Lsinf_fma3_no_point_five_f
    ;;if there is carry.. then negate the result of multiplication
    ; (not and mov leave CF untouched, so the adc below still sees it)
    not     r10
    not     r9
    not     r8
    mov     rdx,08000000000000000h
Lsinf_fma3_no_point_five_f:
    adc     rax,0
    and     rax,3
    vmovd   xmm4,eax                        ;store region to xmm4
    ;calculate the number of integer bits and zero them out
    mov     rcx,r11
    add     rcx,10                          ; rcx <-- no. of integer bits
    shl     r10,cl
    shr     r10,cl                          ; r10 contains only mant bits
    sub     rcx,64                          ; form the exponent
    mov     r11,rcx
    ;find the highest set bit
    bsr     rcx,r10
    jnz     Lsinf_fma3_form_mantissa_f
    ; top word is all zero: shift the three words up by 64 bits
    mov     r10,r9
    mov     r9,r8
    mov     r8,0
    bsr     rcx,r10                         ; rcx <-- hsb
    sub     r11,64
Lsinf_fma3_form_mantissa_f:
    add     r11,rcx                         ;for exp of x
    sub     rcx,52                          ;rcx = no. of bits to shift in r10
    cmp     rcx,0
    jl      Lsinf_fma3_hsb_below_52_f
    je      Lsinf_fma3_form_numbers_f
    ;hsb above 52
    mov     r8,r10                          ; previous contents of r8 not required
    shr     r10,cl                          ; r10 = mantissa of x with hsb at 52
    shr     r9,cl                           ; make space for bits from r10
    sub     rcx,64
    neg     rcx
    shl     r8,cl
    or      r9,r8                           ; r9 = mantissa bits of xx
    jmp     Lsinf_fma3_form_numbers_f

ALIGN 16
Lsinf_fma3_hsb_below_52_f:
    neg     rcx                             ; rcx = left shift needed to bring hsb to bit 52
    mov     rax,r9
    shl     r10,cl
    shl     r9,cl
    sub     rcx,64
    neg     rcx
    shr     rax,cl
    or      r10,rax
    shr     r8,cl
    or      r9,r8

ALIGN 16
Lsinf_fma3_form_numbers_f:
    add     r11,1023                        ; re-bias the exponent
    btr     r10,52                          ; remove the implied bit
    mov     rcx,r11
    or      r10,rdx                         ; put the sign
    shl     rcx,52
    or      r10,rcx                         ; r10 <-- x
    vmovq   xmm0,r10                        ; xmm0 <-- x
    vmulsd  xmm0,xmm0,L_piby2_lead          ; scale the fraction by pi/2 to obtain r

Lsinf_fma3_exit_s:
    ; xmm0 = r, xmm4 = region; odd regions take the cosine polynomial
    vmovq   rax,xmm4
    and     rax,01h
    cmp     rax,01h
    jz      Lsinf_fma3_cos_piby4_compute

Lsinf_fma3_sin_piby4_compute:
    ;; vmovapd xmm1,__Lsinfarray+010h
    vmovsd  xmm1,__Lsinfarray+010h          ; xmm1 <-- s3
    vmulsd  xmm3,xmm0,xmm0                  ; xmm3 <-- x^2
    vfmadd231sd xmm1,xmm3,__Lsinfarray+018h ; xmm1 <-- s4 x^2 + s3
    vfmadd213sd xmm1,xmm3,__Lsinfarray+008h ; xmm1 <-- (s4 x^2 + s3)x^2 + s2
    vfmadd213sd xmm1,xmm3,__Lsinfarray      ; xmm1 <-- ((s4 x^2 + s3)x^2 + s2)x^2 + s1
    vmulsd  xmm3,xmm0,xmm3                  ; xmm3 <-- x^3
    vfmadd231sd xmm0,xmm1,xmm3              ; xmm0 <-- x + x^3 * poly
    jmp     Lsinf_fma3_exit_s_1

ALIGN 16
Lsinf_fma3_cos_piby4_compute:
    vmovapd xmm2,L_one
    vmulsd  xmm3,xmm0,xmm0                  ; xmm3 <-- x^2
    vfmadd231sd xmm2,xmm3,__Lcosfarray      ; xmm2 <-- 1 + c0 x^2
    ; would simple Horner's be slower?
    vmovsd  xmm1,__Lcosfarray+018h          ; xmm1 <-- c3
    vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; xmm1 <-- c4 x^2+ c3
    vfmadd213sd xmm1,xmm3,__Lcosfarray+010h ; xmm1 <-- (c4 x^2+ c3)x^2 + c2
    vfmadd213sd xmm1,xmm3,__Lcosfarray+008h ; xmm1 <-- ((c4 x^2+ c3)x^2 + c2)x^2 + c1
    vmulsd  xmm3,xmm3,xmm3                  ; xmm3 <-- x^4
    vmovdqa xmm0,xmm2
    vfmadd231sd xmm0,xmm1,xmm3              ; xmm0 <-- 1 + c0 x^2 + x^4 * poly

Lsinf_fma3_exit_s_1:
    ; Sign fix-up: the result is flipped once when region is 0 or 1 and once
    ; more when the original x is non-negative; the two flips cancel for
    ; positive x in regions 0/1.
    ; assuming FMA3 ==> AVX ==> SSE4.1
    vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two
    vpcmpeqq xmm3,xmm4,XMMWORD PTR L_int_three
    vorpd   xmm3,xmm2,xmm3                  ; all ones when region is 2 or 3
    vandnpd xmm3,xmm3,L_signbit             ; sign bit when region is 0 or 1
    vxorpd  xmm0,xmm0,xmm3
    vandnpd xmm1,xmm5,L_signbit             ; sign bit when x is non-negative
    vxorpd  xmm0,xmm1,xmm0

Lsinf_fma3_return_sinf_s:
    vcvtsd2ss xmm0,xmm0,xmm0                ; round the double result back to float
    StackDeallocate stack_size
    ret

Lsinf_fma3_naninf:
    ; xmm0 still holds the original float argument
    call    fname_special
    StackDeallocate stack_size
    ret
fname endp
END