mirror of
https://github.com/reactos/reactos.git
synced 2024-10-29 19:13:58 +00:00
526 lines
18 KiB
NASM
526 lines
18 KiB
NASM
|
;
|
||
|
; MIT License
|
||
|
; -----------
|
||
|
;
|
||
|
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||
|
;
|
||
|
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
|
; of this Software and associated documentaon files (the "Software"), to deal
|
||
|
; in the Software without restriction, including without limitation the rights
|
||
|
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
|
; copies of the Software, and to permit persons to whom the Software is
|
||
|
; furnished to do so, subject to the following conditions:
|
||
|
;
|
||
|
; The above copyright notice and this permission notice shall be included in
|
||
|
; all copies or substantial portions of the Software.
|
||
|
;
|
||
|
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||
|
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||
|
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||
|
; THE SOFTWARE.
|
||
|
;
|
||
|
; An implementation of the cosf function.
|
||
|
;
|
||
|
; Prototype:
|
||
|
;
|
||
|
; float cosf(float x);
|
||
|
;
|
||
|
; Computes cosf(x).
|
||
|
; Based on the NAG C implementation.
|
||
|
; It will provide proper C99 return values,
|
||
|
; but may not raise floating point status bits properly.
|
||
|
; Original Author: Harsha Jagasia
|
||
|
|
||
|
.const
|
||
|
ALIGN 16
|
||
|
L_real_one DQ 03ff0000000000000h ; 1.0
|
||
|
DQ 0 ; for alignment
|
||
|
L_one_half DQ 03fe0000000000000h ; 0.5
|
||
|
DQ 0
|
||
|
L_2bypi DQ 03fe45f306dc9c883h ; 2./pi
|
||
|
DQ 0
|
||
|
L_one_sixth DQ 03fc5555555555555h ; 0.166666666666
|
||
|
DQ 0
|
||
|
L_piby2 DQ 03fe921fb54442d18h
|
||
|
DQ 0
|
||
|
L_piby2_1 DQ 03ff921fb54400000h ; piby2_1
|
||
|
DQ 0
|
||
|
L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
|
||
|
DQ 0
|
||
|
L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
|
||
|
DQ 0
|
||
|
L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
|
||
|
DQ 0
|
||
|
L_large_x_sse2 DQ 0411E848000000000h ; 5e5
|
||
|
DQ 0
|
||
|
L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9
|
||
|
DQ 0
|
||
|
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
|
||
|
DQ 07FFFFFFFFFFFFFFFh
|
||
|
L__int_three DQ 00000000000000003h
|
||
|
DQ 00000000000000003h
|
||
|
L__min_norm_double DQ 00010000000000000h
|
||
|
DQ 00010000000000000h
|
||
|
L_two_to_neg_7 DQ 03f80000000000000h
|
||
|
DQ 0
|
||
|
L_two_to_neg_13 DQ 03f20000000000000h
|
||
|
DQ 0
|
||
|
L_inf_mask_32 DD 07F800000h
|
||
|
DQ 0
|
||
|
|
||
|
fname TEXTEQU <cosf>
|
||
|
fname_special TEXTEQU <_cosf_special>
|
||
|
|
||
|
;Define name and any external functions being called
|
||
|
EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR
|
||
|
EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR
|
||
|
EXTERN __remainder_piby2_fma3 : PROC ; NEAR
|
||
|
EXTERN fname_special : PROC
|
||
|
EXTERN _set_statfp : PROC
|
||
|
|
||
|
|
||
|
EXTRN __Lcosfarray:QWORD
|
||
|
EXTRN __Lsinfarray:QWORD
|
||
|
EXTRN __use_fma3_lib:DWORD
|
||
|
|
||
|
; define local variable storage offsets
|
||
|
p_temp equ 020h ; temporary for get/put bits operation
|
||
|
p_temp1 equ 030h ; temporary for get/put bits operation
|
||
|
dummy_space EQU 040h
|
||
|
stack_size EQU 068h
|
||
|
|
||
|
include fm.inc
|
||
|
|
||
|
.code
|
||
|
|
||
|
ALIGN 16
|
||
|
PUBLIC fname
|
||
|
fname PROC FRAME
|
||
|
StackAllocate stack_size
|
||
|
.ENDPROLOG
|
||
|
cmp DWORD PTR __use_fma3_lib, 0
|
||
|
jne Lcosf_fma3
|
||
|
|
||
|
Lcosf_sse2:
|
||
|
|
||
|
xorpd xmm2, xmm2 ; zeroed out for later use
|
||
|
|
||
|
;; if NaN or inf
|
||
|
movd edx, xmm0
|
||
|
mov eax, 07f800000h
|
||
|
mov r10d, eax
|
||
|
and r10d, edx
|
||
|
cmp r10d, eax
|
||
|
jz Lcosf_sse2_naninf
|
||
|
|
||
|
cvtss2sd xmm0, xmm0
|
||
|
movd rdx, xmm0
|
||
|
|
||
|
; ax = (ux & ~SIGNBIT_DP64);
|
||
|
mov r10, rdx
|
||
|
btr r10, 63 ; r10 <-- |x|
|
||
|
mov r8d, 1 ; for determining region later on
|
||
|
|
||
|
movapd xmm1, xmm0 ; xmm1 <-- copy of x
|
||
|
|
||
|
|
||
|
;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
|
||
|
mov rax, 03fe921fb54442d18h
|
||
|
cmp r10, rax
|
||
|
jg Lcosf_sse2_absx_gt_piby4
|
||
|
|
||
|
; *c = cos_piby4(x, 0.0);
|
||
|
movapd xmm2, xmm0
|
||
|
mulsd xmm2, xmm2 ;x^2
|
||
|
xor eax, eax
|
||
|
mov rdx, r10
|
||
|
movsd xmm5, QWORD PTR L_one_half
|
||
|
jmp Lcosf_sse2_calc_sincosf_piby4 ; done
|
||
|
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_sse2_absx_gt_piby4:
|
||
|
; reduce the argument to be in a range from -pi/4 to +pi/4
|
||
|
; by subtracting multiples of pi/2
|
||
|
; xneg = (ax != ux);
|
||
|
movd xmm0, r10 ; xmm0 <-- |x|
|
||
|
cmp r10, QWORD PTR L_large_x_sse2
|
||
|
jae Lcosf_sse2_reduce_precise
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
; xmm0=abs(x), xmm1=x
|
||
|
;/* How many pi/2 is x a multiple of? */
|
||
|
|
||
|
movapd xmm2, xmm0
|
||
|
movsd xmm3, QWORD PTR L_2bypi
|
||
|
movapd xmm4, xmm0
|
||
|
movsd xmm5, QWORD PTR L_one_half
|
||
|
mulsd xmm2, xmm3
|
||
|
|
||
|
; movsd xmm5, QWORD PTR L_one_half
|
||
|
; movapd xmm2, xmm0
|
||
|
; mulsd xmm2, QWORD PTR L_2bypi
|
||
|
; movapd xmm4, xmm0
|
||
|
|
||
|
mov r9, r10
|
||
|
shr r9, 52 ; r9 <-- biased exponent of x
|
||
|
|
||
|
; npi2 = (int)(x * twobypi + 0.5);
|
||
|
addsd xmm2, xmm5 ; npi2
|
||
|
|
||
|
movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1
|
||
|
cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2
|
||
|
movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail
|
||
|
cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2
|
||
|
|
||
|
; Subtract the multiple from x to get an extra-precision remainder
|
||
|
; rhead = x - npi2 * piby2_1;
|
||
|
|
||
|
mulsd xmm3, xmm2 ; use piby2_1
|
||
|
subsd xmm4, xmm3 ; rhead
|
||
|
|
||
|
; rtail = npi2 * piby2_1tail;
|
||
|
mulsd xmm1, xmm2 ; rtail
|
||
|
movd eax, xmm0
|
||
|
|
||
|
; GET_BITS_DP64(rhead-rtail, uy);
|
||
|
; originally only rhead
|
||
|
movapd xmm0, xmm4
|
||
|
subsd xmm0, xmm1
|
||
|
|
||
|
movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2
|
||
|
movd rcx, xmm0 ; rcx <-- rhead-rtail
|
||
|
movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail
|
||
|
|
||
|
; region = npi2 & 3;
|
||
|
; and eax, 3
|
||
|
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||
|
shl rcx, 1 ; strip any sign bit
|
||
|
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
|
||
|
sub r9, rcx ; expdiff
|
||
|
|
||
|
;; if (expdiff > 15)
|
||
|
cmp r9, 15
|
||
|
jle Lcosf_sse2_expdiff_le_15
|
||
|
|
||
|
; The remainder is pretty small compared with x, which
|
||
|
; implies that x is a near multiple of pi/2
|
||
|
; (x matches the multiple to at least 15 bits)
|
||
|
; t = rhead;
|
||
|
movapd xmm1, xmm4
|
||
|
|
||
|
; rtail = npi2 * piby2_2;
|
||
|
mulsd xmm3, xmm2
|
||
|
|
||
|
; rhead = t - rtail;
|
||
|
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
||
|
subsd xmm4, xmm3 ; rhead
|
||
|
|
||
|
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||
|
subsd xmm1, xmm4 ; t - rhead
|
||
|
subsd xmm1, xmm3 ; -rtail
|
||
|
subsd xmm5, xmm1 ; rtail
|
||
|
|
||
|
; r = rhead - rtail;
|
||
|
movapd xmm0, xmm4
|
||
|
|
||
|
;HARSHA
|
||
|
;xmm1=rtail
|
||
|
movapd xmm1, xmm5
|
||
|
subsd xmm0, xmm5
|
||
|
|
||
|
; xmm0=r, xmm4=rhead, xmm1=rtail
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
Lcosf_sse2_expdiff_le_15:
|
||
|
cmp rcx, 03f2h ; is r < 2^-13 ?
|
||
|
jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not
|
||
|
cmp rcx, 03deh ; is r < 2^-33 ?
|
||
|
jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r
|
||
|
|
||
|
movapd xmm2, xmm0
|
||
|
mulsd xmm2, xmm0 ; xmm2 <-- x^2
|
||
|
|
||
|
;; if region is 1 or 3 do a sinf calc.
|
||
|
and r8d, eax
|
||
|
jz Lcosf_sse2_r_small_calc_sin
|
||
|
|
||
|
Lcosf_sse2_r_small_calc_cos:
|
||
|
; region 1 or 3
|
||
|
; use simply polynomial
|
||
|
; *s = x - x*x*x*0.166666666666666666;
|
||
|
movsd xmm3, QWORD PTR L_one_sixth
|
||
|
mulsd xmm3, xmm0 ; * x
|
||
|
mulsd xmm3, xmm2 ; * x^2
|
||
|
subsd xmm0, xmm3 ; xs
|
||
|
jmp Lcosf_sse2_adjust_region
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_sse2_r_small_calc_sin:
|
||
|
; region 0 or 2
|
||
|
; cos = 1.0 - x*x*0.5;
|
||
|
movsd xmm0, QWORD PTR L_real_one ; 1.0
|
||
|
mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2
|
||
|
subsd xmm0, xmm2
|
||
|
jmp Lcosf_sse2_adjust_region
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_sse2_r_very_small:
|
||
|
; then sin(r) = r
|
||
|
; if region is 1 or 3 do a sin calc.
|
||
|
and r8d, eax
|
||
|
jnz Lcosf_sse2_adjust_region
|
||
|
|
||
|
movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1
|
||
|
; By this point, calculations should already have set inexact
|
||
|
jmp Lcosf_sse2_adjust_region
|
||
|
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
ALIGN 16
|
||
|
Lcosf_sse2_reduce_precise:
|
||
|
; Reduce abs(x) into range [-pi/4, pi/4]
|
||
|
; remainder_piby2d2f(ax, &r, ®ion);
|
||
|
mov QWORD PTR p_temp[rsp], rdx ; save ux for use later
|
||
|
mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later
|
||
|
|
||
|
call __remainder_piby2d2f_forAsm
|
||
|
mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later
|
||
|
mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later
|
||
|
mov r8d, 1 ; for determining region later
|
||
|
|
||
|
; Reduced argument is in xmm0. No second word; after all, we started in
|
||
|
; single precision. Region is in rax.
|
||
|
movapd xmm1, xmm0
|
||
|
movsd xmm5, QWORD PTR L_one_half
|
||
|
|
||
|
jmp Lcosf_sse2_calc_sincosf_piby4
|
||
|
|
||
|
|
||
|
; done with reducing the argument. Now perform the sin/cos calculations.
|
||
|
ALIGN 16
|
||
|
Lcosf_sse2_calc_sincosf_piby4:
|
||
|
movapd xmm2, xmm0
|
||
|
mulsd xmm2, xmm0 ; x^2
|
||
|
|
||
|
;; if region is 0 or 2, do a cosf calc
|
||
|
and r8d, eax
|
||
|
jz Lcosf_sse2_do_cosf_calc
|
||
|
; region is 1 or 3: do a sinf calc.
|
||
|
Lcosf_sse2_do_sinf_calc:
|
||
|
movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4
|
||
|
mulsd xmm1, xmm2 ; s4x2
|
||
|
movsd xmm4, xmm2 ; move for x4
|
||
|
mulsd xmm4, xmm2 ; x4
|
||
|
movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2
|
||
|
mulsd xmm5, xmm2 ; s2x2
|
||
|
movsd xmm3, xmm0 ; move for x3
|
||
|
mulsd xmm3, xmm2 ; x3
|
||
|
addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
|
||
|
mulsd xmm1, xmm4 ; s3x4+s4x6
|
||
|
addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
|
||
|
addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6
|
||
|
mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)
|
||
|
addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)
|
||
|
jmp Lcosf_sse2_adjust_region
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_sse2_do_cosf_calc:
|
||
|
; region 0 or 2 - do a cos calculation
|
||
|
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
|
||
|
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
|
||
|
movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4
|
||
|
movsd xmm4, xmm2 ; move for x4
|
||
|
mulsd xmm1, xmm2 ; c4x2
|
||
|
movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2
|
||
|
mulsd xmm4, xmm2 ; x4
|
||
|
movsd xmm0, QWORD PTR __Lcosfarray ; c0
|
||
|
mulsd xmm3, xmm2 ; c2x2
|
||
|
mulsd xmm0, xmm2 ; c0x2 (=-0.5x2)
|
||
|
addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2
|
||
|
mulsd xmm1, xmm4 ; c3x4 + c4x6
|
||
|
addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2
|
||
|
addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6
|
||
|
mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10
|
||
|
addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2
|
||
|
addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
|
||
|
|
||
|
Lcosf_sse2_adjust_region:
|
||
|
; xmm1 is cos or sin, relies on previous sections to
|
||
|
; switch (region)
|
||
|
add eax, 1
|
||
|
and eax, 2
|
||
|
jz Lcosf_sse2_cleanup
|
||
|
;; if region 1 or 2 then we negate the result.
|
||
|
xorpd xmm2, xmm2
|
||
|
subsd xmm2, xmm0
|
||
|
movapd xmm0, xmm2
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_sse2_cleanup:
|
||
|
cvtsd2ss xmm0, xmm0
|
||
|
StackDeallocate stack_size
|
||
|
ret
|
||
|
|
||
|
|
||
|
Lcosf_sse2_naninf:
|
||
|
call fname_special
|
||
|
StackDeallocate stack_size
|
||
|
ret
|
||
|
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_fma3:
|
||
|
vmovd eax,xmm0
|
||
|
mov r8d,L_inf_mask_32
|
||
|
and eax,r8d
|
||
|
cmp eax, r8d
|
||
|
jz Lcosf_fma3_naninf
|
||
|
|
||
|
vcvtss2sd xmm5,xmm0,xmm0
|
||
|
vmovq r9,xmm5
|
||
|
btr r9,63 ;clear sign
|
||
|
|
||
|
cmp r9,L_piby2
|
||
|
jg Lcosf_fma3_range_reduce
|
||
|
cmp r9,L_two_to_neg_7
|
||
|
jge Lcosf_fma3_compute_cosf_piby_4
|
||
|
cmp r9,L_two_to_neg_13
|
||
|
jge Lcosf_fma3_compute_1_xx_5
|
||
|
|
||
|
vmovq xmm0,QWORD PTR L_real_one
|
||
|
; Here we need to set inexact
|
||
|
vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact
|
||
|
jmp Lcosf_fma3_return
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_fma3_compute_1_xx_5:
|
||
|
vmulsd xmm0,xmm5,QWORD PTR L_one_half
|
||
|
vfnmadd213sd xmm0,xmm5,L_real_one ; xmm9 1.0 - x*x*(double2)0.5
|
||
|
jmp Lcosf_fma3_return
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_fma3_compute_cosf_piby_4:
|
||
|
movsd xmm0,xmm5
|
||
|
vmovapd xmm2,L_real_one
|
||
|
vmulsd xmm3,xmm0,xmm0
|
||
|
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
|
||
|
vsubsd xmm2,xmm2,xmm1
|
||
|
vmovsd xmm1,__Lcosfarray+018h
|
||
|
vfmadd231sd xmm1,xmm3,__Lcosfarray+020h
|
||
|
vfmadd213sd xmm1,xmm3,__Lcosfarray+010h
|
||
|
vfmadd213sd xmm1,xmm3,__Lcosfarray+008h
|
||
|
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
|
||
|
vmovdqa xmm0,xmm2
|
||
|
vfmadd231sd xmm0,xmm1,xmm3
|
||
|
jmp Lcosf_fma3_return
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_fma3_range_reduce:
|
||
|
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||
|
cmp r9,L_large_x_fma3
|
||
|
jge Lcosf_reduce_precise
|
||
|
|
||
|
;cosff_range_e_5_s:
|
||
|
vandpd xmm1,xmm0,L_sign_mask
|
||
|
vmovapd xmm2,L_2bypi
|
||
|
vfmadd213sd xmm2,xmm1,L_one_half
|
||
|
vcvttpd2dq xmm2,xmm2
|
||
|
vpmovsxdq xmm1,xmm2
|
||
|
vandpd xmm4,xmm1,L__int_three ; region xmm4
|
||
|
vshufps xmm1 ,xmm1,xmm1,8
|
||
|
vcvtdq2pd xmm1,xmm1
|
||
|
vmovdqa xmm2,xmm0
|
||
|
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
|
||
|
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
|
||
|
vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0
|
||
|
vsubsd xmm2,xmm2,xmm0
|
||
|
vsubsd xmm1,xmm2,xmm3
|
||
|
vmovq rax,xmm4
|
||
|
jmp Lcosf_exit_s
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_reduce_precise:
|
||
|
|
||
|
vmovq xmm0,r9 ; r9 <-- |x|
|
||
|
cmp r9,L_large_x_fma3
|
||
|
jge Lcos_remainder_piby2
|
||
|
|
||
|
; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
|
||
|
; have the following conventions:
|
||
|
; on input
|
||
|
; x is in xmm0
|
||
|
; on output
|
||
|
; r is in xmm0
|
||
|
; rr is in xmm1
|
||
|
; region is in rax
|
||
|
; The _bdl routine is guaranteed not to touch r10
|
||
|
|
||
|
Lcos_remainder_piby2_small: ;; unused label
|
||
|
; Boldo-Daumas-Li reduction for reasonably small |x|
|
||
|
call __remainder_piby2_fma3_bdl
|
||
|
jmp Lcosf_exit_s
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcos_remainder_piby2:
|
||
|
; argument reduction for general x
|
||
|
call __remainder_piby2_fma3
|
||
|
Lcosf_exit_s:
|
||
|
bt rax,0
|
||
|
jnc Lcosf_piby4_compute
|
||
|
|
||
|
;sinf_piby4_compute:
|
||
|
; vmovapd xmm1,__Lsinfarray+010h
|
||
|
vmovsd xmm1,__Lsinfarray+010h
|
||
|
vmulsd xmm3,xmm0,xmm0
|
||
|
vfmadd231sd xmm1,xmm3,__Lsinfarray+018h
|
||
|
vfmadd213sd xmm1,xmm3,__Lsinfarray+008h
|
||
|
vfmadd213sd xmm1,xmm3,__Lsinfarray
|
||
|
vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3
|
||
|
vfmadd231sd xmm0,xmm1,xmm3
|
||
|
jmp Lcosf_fma3_adjust_sign
|
||
|
|
||
|
ALIGN 16
|
||
|
Lcosf_piby4_compute:
|
||
|
vmovapd xmm2,L_real_one
|
||
|
vmulsd xmm3,xmm0,xmm0
|
||
|
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
|
||
|
vsubsd xmm2,xmm2,xmm1
|
||
|
vmovsd xmm1,__Lcosfarray+018h
|
||
|
vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h
|
||
|
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h
|
||
|
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h
|
||
|
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
|
||
|
vmovdqa xmm0, xmm2
|
||
|
vfmadd231sd xmm0 ,xmm1,xmm3
|
||
|
|
||
|
Lcosf_fma3_adjust_sign:
|
||
|
; assuming FMA3 ==> AVX ==> SSE4.1
|
||
|
; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one
|
||
|
; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two
|
||
|
; vorpd xmm3,xmm2,xmm1
|
||
|
|
||
|
; vandpd xmm3,xmm3,L_signbit
|
||
|
|
||
|
add rax,1 ; 1,2 --> 2,3
|
||
|
shr rax,1 ; 2,3 --> 1
|
||
|
shl rax,63 ; 1 --> sign bit
|
||
|
vmovq xmm3,rax
|
||
|
|
||
|
vxorpd xmm0,xmm0,xmm3
|
||
|
|
||
|
Lcosf_fma3_return:
|
||
|
vcvtsd2ss xmm0,xmm0,xmm0
|
||
|
StackDeallocate stack_size
|
||
|
ret
|
||
|
|
||
|
Lcosf_fma3_naninf:
|
||
|
call fname_special
|
||
|
StackDeallocate stack_size
|
||
|
ret
|
||
|
|
||
|
fname endp
|
||
|
END
|