mirror of
https://github.com/reactos/reactos.git
synced 2025-07-31 18:42:14 +00:00
[LIBM] Import win-libm from AMD
Source: https://github.com/amd/win-libm
This commit is contained in:
parent
081c637c06
commit
4afb647c78
82 changed files with 22392 additions and 0 deletions
525
sdk/lib/crt/math/libm_sse2/cosf.asm
Normal file
525
sdk/lib/crt/math/libm_sse2/cosf.asm
Normal file
|
@ -0,0 +1,525 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the cosf function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; float cosf(float x);
|
||||
;
|
||||
; Computes cosf(x).
|
||||
; Based on the NAG C implementation.
|
||||
; It will provide proper C99 return values,
|
||||
; but may not raise floating point status bits properly.
|
||||
; Original Author: Harsha Jagasia
|
||||
|
||||
; ---------------------------------------------------------------------------
; Read-only constants used by cosf.
; The table starts on a 16-byte boundary and every named entry up to
; L_two_to_neg_13 occupies a full 16-byte slot (value + padding or value
; repeated), so the aligned 128-bit loads the code performs on some of
; them (vmovapd / vandpd with a memory operand) are legal.
; All DQ values are IEEE-754 binary64 bit patterns unless noted otherwise.
; ---------------------------------------------------------------------------
.const
|
||||
ALIGN 16
|
||||
L_real_one DQ 03ff0000000000000h ; 1.0
|
||||
DQ 0 ; for alignment
|
||||
L_one_half DQ 03fe0000000000000h ; 0.5
|
||||
DQ 0
|
||||
L_2bypi DQ 03fe45f306dc9c883h ; 2./pi
|
||||
DQ 0
|
||||
L_one_sixth DQ 03fc5555555555555h ; 0.166666666666
|
||||
DQ 0
|
||||
; NOTE: despite its name, L_piby2 holds 03fe921fb54442d18h, which is the
; same bit pattern the SSE2 path loads as an immediate for its
; "abs(x) <= pi/4" test. The FMA3 path compares |x| against it.
L_piby2 DQ 03fe921fb54442d18h
|
||||
DQ 0
|
||||
L_piby2_1 DQ 03ff921fb54400000h ; piby2_1
|
||||
DQ 0
|
||||
L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
|
||||
DQ 0
|
||||
L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
|
||||
DQ 0
|
||||
L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
|
||||
DQ 0
|
||||
; Thresholds above which each path hands |x| to an external reduction routine.
L_large_x_sse2 DQ 0411E848000000000h ; 5e5
|
||||
DQ 0
|
||||
L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9
|
||||
DQ 0
|
||||
; Every bit set except bit 63: ANDing with this clears the sign of a double.
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
|
||||
DQ 07FFFFFFFFFFFFFFFh
|
||||
; Integer 3 in both lanes: mask that keeps the low two bits (region = npi2 & 3).
L__int_three DQ 00000000000000003h
|
||||
DQ 00000000000000003h
|
||||
; Exponent field 001h, zero mantissa. The FMA3 path adds this to 1.0 purely
; to raise the inexact flag.
L__min_norm_double DQ 00010000000000000h
|
||||
DQ 00010000000000000h
|
||||
L_two_to_neg_7 DQ 03f80000000000000h ; 2^-7 (biased exponent 03f8h)
|
||||
DQ 0
|
||||
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 (biased exponent 03f2h)
|
||||
DQ 0
|
||||
; 32-bit value: the eight exponent bits of a single-precision float.
; (x & mask) == mask identifies NaN or infinity.
L_inf_mask_32 DD 07F800000h
|
||||
DQ 0
|
||||
|
||||
; The public entry point and the NaN/Inf handler are referred to through
; these text macros throughout the file.
fname TEXTEQU <cosf>
|
||||
fname_special TEXTEQU <_cosf_special>
|
||||
|
||||
;Define name and any external functions being called
|
||||
; Argument-reduction helpers. As used by the callers in this file: input
; in xmm0, reduced argument returned in xmm0 and region returned in rax.
EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR
|
||||
EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR
|
||||
EXTERN __remainder_piby2_fma3 : PROC ; NEAR
|
||||
; Called with x still in xmm0 when x is NaN or infinity.
EXTERN fname_special : PROC
|
||||
; Declared but not referenced by any instruction in this file.
EXTERN _set_statfp : PROC
|
||||
|
||||
|
||||
; Polynomial coefficient tables (defined elsewhere), indexed in 8-byte steps.
EXTRN __Lcosfarray:QWORD
|
||||
EXTRN __Lsinfarray:QWORD
|
||||
; Non-zero selects the FMA3 code path at run time.
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
; define local variable storage offsets
|
||||
; Offsets are relative to rsp after StackAllocate. The first 020h bytes are
; left free below p_temp (presumably the callee's shadow space - TODO confirm
; against fm.inc / the ABI notes).
p_temp equ 020h ; temporary for get/put bits operation
|
||||
p_temp1 equ 030h ; temporary for get/put bits operation
|
||||
dummy_space EQU 040h
|
||||
; 068h + the 8-byte return address = 070h, a multiple of 16.
stack_size EQU 068h
|
||||
|
||||
; fm.inc is expected to supply the StackAllocate / StackDeallocate macros
; used below (not visible here).
include fm.inc
|
||||
|
||||
.code
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC fname
|
||||
; ---------------------------------------------------------------------------
; float cosf(float x)
;
; In:    xmm0 = x (single precision)
; Out:   xmm0 = cosf(x) (single precision)
; Stack: allocates stack_size bytes; p_temp / p_temp1 hold rdx / r10 across
;        the call to __remainder_piby2d2f_forAsm.
;
; Two independent implementations live in this procedure:
;   * Lcosf_sse2  - used when __use_fma3_lib == 0
;   * Lcosf_fma3  - used otherwise (VEX / FMA3 encoded instructions)
; Both widen x to double, reduce |x| to r in [-pi/4, pi/4] together with a
; "region" (number of pi/2 multiples removed; only its low two bits matter),
; evaluate a cos polynomial for even regions or a sin polynomial for odd
; regions, negate the result for regions 1 and 2, and narrow to float.
; NaN and infinity are forwarded to fname_special.
; ---------------------------------------------------------------------------
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
.ENDPROLOG
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Lcosf_fma3
|
||||
|
||||
Lcosf_sse2:
|
||||
|
||||
xorpd xmm2, xmm2 ; zeroed out for later use
|
||||
|
||||
;; if NaN or inf (all eight exponent bits of the float are set)
|
||||
movd edx, xmm0 ; edx <-- raw bits of x
|
||||
mov eax, 07f800000h ; float exponent mask
|
||||
mov r10d, eax
|
||||
and r10d, edx
|
||||
cmp r10d, eax
|
||||
jz Lcosf_sse2_naninf
|
||||
|
||||
cvtss2sd xmm0, xmm0 ; xmm0 <-- (double)x
|
||||
movd rdx, xmm0 ; rdx <-- ux, the bits of x as a double
|
||||
|
||||
; ax = (ux & ~SIGNBIT_DP64);
|
||||
mov r10, rdx
|
||||
btr r10, 63 ; r10 <-- |x|
|
||||
mov r8d, 1 ; for determining region later on
|
||||
|
||||
movapd xmm1, xmm0 ; xmm1 <-- copy of x
|
||||
|
||||
|
||||
;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
|
||||
mov rax, 03fe921fb54442d18h
|
||||
cmp r10, rax
|
||||
jg Lcosf_sse2_absx_gt_piby4 ; signed compare is fine: bit 63 of r10 is clear
|
||||
|
||||
; *c = cos_piby4(x, 0.0);
|
||||
; No reduction needed: fall into the polynomial with region 0 (eax = 0).
; The target label recomputes x^2 into xmm2 from xmm0.
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm2 ;x^2
|
||||
xor eax, eax ; region <-- 0
|
||||
mov rdx, r10
|
||||
movsd xmm5, QWORD PTR L_one_half
|
||||
jmp Lcosf_sse2_calc_sincosf_piby4 ; done
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_absx_gt_piby4:
|
||||
; reduce the argument to be in a range from -pi/4 to +pi/4
|
||||
; by subtracting multiples of pi/2
|
||||
; xneg = (ax != ux);   (from the reference C code; no instruction here computes xneg)
|
||||
movd xmm0, r10 ; xmm0 <-- |x|
|
||||
cmp r10, QWORD PTR L_large_x_sse2
|
||||
jae Lcosf_sse2_reduce_precise ; |x| >= 5e5: use the external reduction routine
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; xmm0=abs(x), xmm1=x
|
||||
;/* How many pi/2 is x a multiple of? */
|
||||
|
||||
movapd xmm2, xmm0
|
||||
movsd xmm3, QWORD PTR L_2bypi
|
||||
movapd xmm4, xmm0 ; xmm4 <-- |x|, becomes rhead below
|
||||
movsd xmm5, QWORD PTR L_one_half
|
||||
mulsd xmm2, xmm3 ; xmm2 <-- |x| * 2/pi
|
||||
|
||||
; movsd xmm5, QWORD PTR L_one_half
|
||||
; movapd xmm2, xmm0
|
||||
; mulsd xmm2, QWORD PTR L_2bypi
|
||||
; movapd xmm4, xmm0
|
||||
|
||||
mov r9, r10
|
||||
shr r9, 52 ; r9 <-- biased exponent of x
|
||||
|
||||
; npi2 = (int)(x * twobypi + 0.5);
|
||||
addsd xmm2, xmm5 ; npi2
|
||||
|
||||
movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1
|
||||
cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2
|
||||
movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail
|
||||
cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2
|
||||
|
||||
; Subtract the multiple from x to get an extra-precision remainder
|
||||
; rhead = x - npi2 * piby2_1;
|
||||
|
||||
mulsd xmm3, xmm2 ; use piby2_1
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_1tail;
|
||||
mulsd xmm1, xmm2 ; rtail
|
||||
movd eax, xmm0 ; eax <-- npi2; later code only looks at its low two bits
|
||||
|
||||
; GET_BITS_DP64(rhead-rtail, uy);
|
||||
; originally only rhead
|
||||
movapd xmm0, xmm4
|
||||
subsd xmm0, xmm1 ; xmm0 <-- r = rhead - rtail
|
||||
|
||||
movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2
|
||||
movd rcx, xmm0 ; rcx <-- rhead-rtail
|
||||
movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail
|
||||
|
||||
; region = npi2 & 3;
|
||||
; and eax, 3
|
||||
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
shl rcx, 1 ; strip any sign bit
|
||||
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 ; rcx <-- biased exponent of r
|
||||
sub r9, rcx ; expdiff
|
||||
|
||||
;; if (expdiff > 15)
|
||||
cmp r9, 15
|
||||
jle Lcosf_sse2_expdiff_le_15
|
||||
|
||||
; The remainder is pretty small compared with x, which
|
||||
; implies that x is a near multiple of pi/2
|
||||
; (x matches the multiple to at least 15 bits)
|
||||
; Redo the subtraction with the next piece of pi/2 (piby2_2 / piby2_2tail).
; t = rhead;
|
||||
movapd xmm1, xmm4
|
||||
|
||||
; rtail = npi2 * piby2_2;
|
||||
mulsd xmm3, xmm2
|
||||
|
||||
; rhead = t - rtail;
|
||||
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
subsd xmm1, xmm4 ; t - rhead
|
||||
subsd xmm1, xmm3 ; -rtail
|
||||
subsd xmm5, xmm1 ; rtail
|
||||
|
||||
; r = rhead - rtail;
|
||||
movapd xmm0, xmm4
|
||||
|
||||
;HARSHA
|
||||
;xmm1=rtail
|
||||
movapd xmm1, xmm5
|
||||
subsd xmm0, xmm5
|
||||
|
||||
; xmm0=r, xmm4=rhead, xmm1=rtail
|
||||
; Note: rcx still holds the exponent of the first (less precise) r.
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
Lcosf_sse2_expdiff_le_15:
|
||||
; rcx = biased exponent of r. Pick the cheapest adequate approximation.
cmp rcx, 03f2h ; is r < 2^-13 ?
|
||||
jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not
|
||||
cmp rcx, 03deh ; is r < 2^-33 ?
|
||||
jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r
|
||||
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; xmm2 <-- x^2
|
||||
|
||||
;; r8d is 1, so this tests bit 0 of the region.
;; Even region (ZF set)  --> cos approximation at Lcosf_sse2_r_small_calc_sin.
;; Odd region (fall through) --> sin approximation at Lcosf_sse2_r_small_calc_cos.
;; NOTE: the two label names are swapped relative to what the code computes.
|
||||
and r8d, eax
|
||||
jz Lcosf_sse2_r_small_calc_sin
|
||||
|
||||
Lcosf_sse2_r_small_calc_cos:
|
||||
; region 1 or 3
|
||||
; use simple polynomial
|
||||
; *s = x - x*x*x*0.166666666666666666;
|
||||
movsd xmm3, QWORD PTR L_one_sixth
|
||||
mulsd xmm3, xmm0 ; * x
|
||||
mulsd xmm3, xmm2 ; * x^2
|
||||
subsd xmm0, xmm3 ; xs
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_r_small_calc_sin:
|
||||
; region 0 or 2
|
||||
; cos = 1.0 - x*x*0.5;
|
||||
movsd xmm0, QWORD PTR L_real_one ; 1.0
|
||||
mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2
|
||||
subsd xmm0, xmm2
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_r_very_small:
|
||||
; then sin(r) = r
|
||||
; Odd region (1 or 3): the sin value is r itself, already in xmm0.
|
||||
and r8d, eax
|
||||
jnz Lcosf_sse2_adjust_region
|
||||
|
||||
; Even region (0 or 2):
movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1
|
||||
; By this point, calculations should already have set inexact
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
ALIGN 16
|
||||
Lcosf_sse2_reduce_precise:
|
||||
; Reduce abs(x) into range [-pi/4, pi/4]
|
||||
; remainder_piby2d2f(ax, &r, &region);
|
||||
; xmm0 = |x| on entry to the call. rdx and r10 are saved on the stack around it.
mov QWORD PTR p_temp[rsp], rdx ; save ux for use later
|
||||
mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later
|
||||
|
||||
call __remainder_piby2d2f_forAsm
|
||||
mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later
|
||||
mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later
|
||||
mov r8d, 1 ; for determining region later
|
||||
|
||||
; Reduced argument is in xmm0. No second word; after all, we started in
|
||||
; single precision. Region is in rax.
|
||||
movapd xmm1, xmm0
|
||||
movsd xmm5, QWORD PTR L_one_half
|
||||
|
||||
jmp Lcosf_sse2_calc_sincosf_piby4
|
||||
|
||||
|
||||
; done with reducing the argument. Now perform the sin/cos calculations.
|
||||
; Expects: xmm0 = r, eax = region, r8d = 1.
ALIGN 16
|
||||
Lcosf_sse2_calc_sincosf_piby4:
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; x^2
|
||||
|
||||
;; if region is 0 or 2, do a cosf calc
|
||||
and r8d, eax ; isolates bit 0 of the region
|
||||
jz Lcosf_sse2_do_cosf_calc
|
||||
; region is 1 or 3: do a sinf calc.
|
||||
Lcosf_sse2_do_sinf_calc:
|
||||
; sin(x) ~ x + x^3*(s1 + s2*x^2 + s3*x^4 + s4*x^6), s1..s4 = __Lsinfarray[0..3]
movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4
|
||||
mulsd xmm1, xmm2 ; s4x2
|
||||
movsd xmm4, xmm2 ; move for x4
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2
|
||||
mulsd xmm5, xmm2 ; s2x2
|
||||
movsd xmm3, xmm0 ; move for x3
|
||||
mulsd xmm3, xmm2 ; x3
|
||||
addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
|
||||
mulsd xmm1, xmm4 ; s3x4+s4x6
|
||||
addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
|
||||
addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6
|
||||
mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)
|
||||
addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_do_cosf_calc:
|
||||
; region 0 or 2 - do a cos calculation
|
||||
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
|
||||
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
|
||||
; c0..c4 = __Lcosfarray[0..4]
movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4
|
||||
movsd xmm4, xmm2 ; move for x4
|
||||
mulsd xmm1, xmm2 ; c4x2
|
||||
movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm0, QWORD PTR __Lcosfarray ; c0
|
||||
mulsd xmm3, xmm2 ; c2x2
|
||||
mulsd xmm0, xmm2 ; c0x2 (=-0.5x2)
|
||||
addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2
|
||||
mulsd xmm1, xmm4 ; c3x4 + c4x6
|
||||
addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2
|
||||
addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6
|
||||
mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10
|
||||
addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2
|
||||
addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
|
||||
|
||||
Lcosf_sse2_adjust_region:
|
||||
; xmm0 holds the cos or sin approximation chosen by the previous sections,
; eax holds the region.
|
||||
; switch (region)
|
||||
; ((region + 1) & 2) is non-zero exactly when the low two bits of the
; region are 01 or 10.
add eax, 1
|
||||
and eax, 2
|
||||
jz Lcosf_sse2_cleanup
|
||||
;; if region 1 or 2 then we negate the result.
|
||||
xorpd xmm2, xmm2
|
||||
subsd xmm2, xmm0 ; xmm2 <-- 0.0 - result
|
||||
movapd xmm0, xmm2
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_cleanup:
|
||||
cvtsd2ss xmm0, xmm0 ; narrow the double result to float
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
Lcosf_sse2_naninf:
|
||||
; x (still single precision, untouched in xmm0) is NaN or infinity.
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
; ---------------------------------------------------------------------------
; FMA3 implementation. Register roles on this path:
;   xmm5 = (double)x, r9 = bits of |x|, rax = region after reduction,
;   xmm0 = r (reduced argument) and finally the result.
; ---------------------------------------------------------------------------
ALIGN 16
|
||||
Lcosf_fma3:
|
||||
; NaN / infinity test: (bits & 07F800000h) == 07F800000h
vmovd eax,xmm0
|
||||
mov r8d,L_inf_mask_32
|
||||
and eax,r8d
|
||||
cmp eax, r8d
|
||||
jz Lcosf_fma3_naninf
|
||||
|
||||
vcvtss2sd xmm5,xmm0,xmm0 ; xmm5 <-- (double)x
|
||||
vmovq r9,xmm5
|
||||
btr r9,63 ;clear sign ; r9 <-- |x|
|
||||
|
||||
; L_piby2 holds the pi/4 bit pattern (03fe921fb54442d18h) despite its name.
cmp r9,L_piby2
|
||||
jg Lcosf_fma3_range_reduce ; |x| > pi/4: needs argument reduction
|
||||
cmp r9,L_two_to_neg_7
|
||||
jge Lcosf_fma3_compute_cosf_piby_4 ; 2^-7 <= |x| <= pi/4: full polynomial
|
||||
cmp r9,L_two_to_neg_13
|
||||
jge Lcosf_fma3_compute_1_xx_5 ; 2^-13 <= |x| < 2^-7: 1 - x*x/2
|
||||
|
||||
; |x| < 2^-13: the result is 1.0
vmovq xmm0,QWORD PTR L_real_one
|
||||
; Here we need to set inexact
|
||||
vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact
|
||||
jmp Lcosf_fma3_return
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_fma3_compute_1_xx_5:
|
||||
vmulsd xmm0,xmm5,QWORD PTR L_one_half ; xmm0 <-- 0.5*x
|
||||
vfnmadd213sd xmm0,xmm5,L_real_one ; xmm0 <-- 1.0 - x*(0.5*x)
|
||||
jmp Lcosf_fma3_return
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_fma3_compute_cosf_piby_4:
|
||||
; Same cos polynomial as Lcosf_piby4_compute, applied directly to x.
; Note: the next instruction is a legacy-encoded (non-VEX) movsd.
movsd xmm0,xmm5
|
||||
vmovapd xmm2,L_real_one
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
|
||||
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
|
||||
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- 1 - 0.5*x^2
|
||||
vmovsd xmm1,__Lcosfarray+018h ; c3
|
||||
vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; c3 + c4*x^2
|
||||
vfmadd213sd xmm1,xmm3,__Lcosfarray+010h ; c2 + x^2*(...)
|
||||
vfmadd213sd xmm1,xmm3,__Lcosfarray+008h ; c1 + x^2*(...)
|
||||
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
|
||||
vmovdqa xmm0,xmm2
|
||||
vfmadd231sd xmm0,xmm1,xmm3 ; (1 - 0.5*x^2) + x^4*(c1 + c2*x^2 + c3*x^4 + c4*x^6)
|
||||
jmp Lcosf_fma3_return
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_fma3_range_reduce:
|
||||
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||||
cmp r9,L_large_x_fma3
|
||||
jge Lcosf_reduce_precise
|
||||
|
||||
;cosff_range_e_5_s:
|
||||
; In-line reduction for pi/4 < |x| < L_large_x_fma3.
vandpd xmm1,xmm0,L_sign_mask ; xmm1 <-- |x| (sign already clear)
|
||||
vmovapd xmm2,L_2bypi
|
||||
vfmadd213sd xmm2,xmm1,L_one_half ; xmm2 <-- |x|*2/pi + 0.5
|
||||
vcvttpd2dq xmm2,xmm2 ; truncate to int32: npi2
|
||||
vpmovsxdq xmm1,xmm2 ; sign-extend npi2 to 64 bits
|
||||
vandpd xmm4,xmm1,L__int_three ; region xmm4
|
||||
vshufps xmm1 ,xmm1,xmm1,8 ; pack the low dwords of the qwords back into 32-bit lanes
|
||||
vcvtdq2pd xmm1,xmm1 ; xmm1 <-- (double)npi2
|
||||
vmovdqa xmm2,xmm0
|
||||
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
|
||||
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
|
||||
vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0
|
||||
vsubsd xmm2,xmm2,xmm0 ; rhead - r
|
||||
vsubsd xmm1,xmm2,xmm3 ; xmm1 <-- (rhead - r) - rtail
|
||||
vmovq rax,xmm4 ; rax <-- region
|
||||
jmp Lcosf_exit_s
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_reduce_precise:
|
||||
|
||||
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||||
; The only jump to this label in this procedure follows an identical
; compare with jge, so the branch below is always taken and the _bdl call
; that follows it is never reached from within this procedure.
cmp r9,L_large_x_fma3
|
||||
jge Lcos_remainder_piby2
|
||||
|
||||
; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
|
||||
; have the following conventions:
|
||||
; on input
|
||||
; x is in xmm0
|
||||
; on output
|
||||
; r is in xmm0
|
||||
; rr is in xmm1
|
||||
; region is in rax
|
||||
; The _bdl routine is guaranteed not to touch r10
|
||||
|
||||
Lcos_remainder_piby2_small: ;; unused label
|
||||
; Boldo-Daumas-Li reduction for reasonably small |x|
|
||||
call __remainder_piby2_fma3_bdl
|
||||
jmp Lcosf_exit_s
|
||||
|
||||
ALIGN 16
|
||||
Lcos_remainder_piby2:
|
||||
; argument reduction for general x
|
||||
call __remainder_piby2_fma3
|
||||
Lcosf_exit_s:
|
||||
; xmm0 = r, rax = region. Bit 0 of the region selects sin (odd) or cos (even).
bt rax,0
|
||||
jnc Lcosf_piby4_compute
|
||||
|
||||
;sinf_piby4_compute:
|
||||
; sin(r) ~ r + r^3*(s1 + s2*r^2 + s3*r^4 + s4*r^6)
; vmovapd xmm1,__Lsinfarray+010h
|
||||
vmovsd xmm1,__Lsinfarray+010h ; s3
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- r^2
|
||||
vfmadd231sd xmm1,xmm3,__Lsinfarray+018h ; s3 + s4*r^2
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray+008h ; s2 + r^2*(...)
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray ; s1 + r^2*(...)
|
||||
vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3
|
||||
vfmadd231sd xmm0,xmm1,xmm3 ; r + r^3*(polynomial)
|
||||
jmp Lcosf_fma3_adjust_sign
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_piby4_compute:
|
||||
; cos(r) ~ (1 - 0.5*r^2) + r^4*(c1 + c2*r^2 + c3*r^4 + c4*r^6)
vmovapd xmm2,L_real_one
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- r^2
|
||||
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
|
||||
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- 1 - 0.5*r^2
|
||||
vmovsd xmm1,__Lcosfarray+018h ; c3
|
||||
vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h ; c3 + c4*r^2
|
||||
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h ; c2 + r^2*(...)
|
||||
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h ; c1 + r^2*(...)
|
||||
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
|
||||
vmovdqa xmm0, xmm2
|
||||
vfmadd231sd xmm0 ,xmm1,xmm3
|
||||
|
||||
Lcosf_fma3_adjust_sign:
|
||||
; assuming FMA3 ==> AVX ==> SSE4.1
|
||||
; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one
|
||||
; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two
|
||||
; vorpd xmm3,xmm2,xmm1
|
||||
|
||||
; vandpd xmm3,xmm3,L_signbit
|
||||
|
||||
; Build a sign-bit mask from the region: bit 1 of (region + 1) ends up in
; bit 63; every other bit is shifted out. Regions 1 and 2 give 8000...0h,
; regions 0 and 3 give 0.
add rax,1 ; 1,2 --> 2,3
|
||||
shr rax,1 ; 2,3 --> 1
|
||||
shl rax,63 ; 1 --> sign bit
|
||||
vmovq xmm3,rax
|
||||
|
||||
vxorpd xmm0,xmm0,xmm3 ; flip the sign of the result for regions 1 and 2
|
||||
|
||||
Lcosf_fma3_return:
|
||||
vcvtsd2ss xmm0,xmm0,xmm0 ; narrow the double result to float
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Lcosf_fma3_naninf:
|
||||
; x (still single precision, untouched in xmm0) is NaN or infinity.
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
fname endp
|
||||
END
|
Loading…
Add table
Add a link
Reference in a new issue