reactos/sdk/lib/crt/math/libm_sse2/cos.asm
2022-12-01 15:21:59 +02:00

533 lines
20 KiB
NASM

;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the cos function.
;
; Prototype:
;
; double cos(double x);
;
; Computes cos(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
; If FMA3 hardware is available, an FMA3 implementation of cos will be used.
.const
ALIGN 16
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
DQ 0
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
DQ 0
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
DQ 0
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
DQ 0
ALIGN 16
L_one DQ 03FF0000000000000h, 03FF0000000000000h
L_signbit DQ 08000000000000000h, 00000000000000000h
L_int_one DQ 00000000000000001h, 00000000000000000h
L_int_two DQ 00000000000000002h, 00000000000000000h
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
L_one_half DQ 03FE0000000000000h ; .5
L_neg_one_half DQ 0bfe0000000000000h ; - 0.5
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
L_piby4 DQ 03FE921FB54442D18h ; pi/4
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
EXTRN __Lcosarray:QWORD
EXTRN __Lsinarray:QWORD
EXTRN __use_fma3_lib:DWORD
; local storage offsets
p_temp EQU 020h ; temporary for get/put bits operation
p_temp1 EQU 030h ; temporary for get/put bits operation
dummy_space EQU 040h
stack_size EQU 068h
include fm.inc
fname TEXTEQU <cos>
fname_special TEXTEQU <_cos_special>
;Define name and any external functions being called
EXTERN __remainder_piby2_forAsm : PROC
EXTERN __remainder_piby2_fma3 : PROC
EXTERN __remainder_piby2_fma3_bdl : PROC
EXTERN fname_special : PROC
.code
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne L_cos_fma3
Lcos_sse2:
movd rdx, xmm0
xorpd xmm2, xmm2 ; zeroed out for later use
mov r10, rdx
btr r10, 63 ; r10 <-- |x|
cmp r10, L_piby4
jb Lcos_sse2_absx_lt_piby4
Lcos_absx_nlt_piby4: ; common case
; Here rdx has x, r10 has |x|
movd xmm0, r10 ; xmm0 <-- |x|
cmp r10, QWORD PTR L_small_arg_cw
jae Lcos_reduce_precise ; Note NaN/Inf will branch
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
; Note that if |x| were too large, conversion of npi2 to integer would fail.
; We reduce the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
movapd xmm2, xmm0
mulsd xmm2, L_2_by_pi
movapd xmm4, xmm0
; xexp = ax >> EXPSHIFTBITS_DP64;
mov r9, r10
shr r9, 52 ; >>EXPSHIFTBITS_DP64
; How many pi/2 is |x| a multiple of?
; npi2 = (int)(x * twobypi + 0.5);
addsd xmm2, L_one_half ; npi2
movsd xmm3, L_real_piby2_1
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer
movsd xmm1, L_real_piby2_1tail
cvtdq2pd xmm2, xmm0 ; and back to double.
; Subtract the multiple from x to get an extra-precision remainder
; rhead = x - npi2 * piby2_1;
mulsd xmm3, xmm2
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_1tail;
mulsd xmm1, xmm2 ; rtail
movd eax, xmm0 ; eax <-- npi2
; GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
movapd xmm0, xmm4
subsd xmm0, xmm1
movsd xmm3, L_real_piby2_2
movd rcx, xmm0 ; rcx <-- rhead - rtail
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
; xmm4=rhead xmm5= temp for calc
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
; expdiff measures how close rhead - rtail is to |x|
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
shl rcx, 1 ; strip any sign bit
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
sub r9, rcx ; expdiff
;; if (expdiff > 15)
cmp r9, 15
jle Lcos_sse2_cw_reduction_done
; Here the remainder is pretty small compared with x, which
; implies that x is a near multiple of pi/2
; (x matches the multiple to at least 15 bits)
; So we do another stage of argument reduction.
; t = rhead;
movapd xmm1, xmm4
; rtail = npi2 * piby2_2;
mulsd xmm3, xmm2
; rhead = t - rtail;
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subsd xmm1, xmm4 ; t - rhead
subsd xmm1, xmm3 ; -rtail
subsd xmm5, xmm1 ; rtail
; r = rhead - rtail;
movapd xmm0, xmm4
;HARSHA
;xmm1=rtail
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
subsd xmm0, xmm5
; xmm0=r, xmm4=rhead, xmm1=rtail
Lcos_sse2_cw_reduction_done:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; if the input was close to a pi/2 multiple
; The original NAG code missed this trick.
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
; then the cos is either ~ 1.0 or ~r, to within 53 bits.
; NOTE: Unfortunately, this introduces two jcc instructions close to each
; other and to other branches. As r < 2^-13 should be rather uncommon,
; the problems for branch prediction outweigh the computational savings. - WAT
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region = npi2 & 3;
subsd xmm4, xmm0 ; rhead-r
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
Lcos_piby4:
; perform taylor series to calc sinx or cosx
; x2 = r * r;
;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the cos path
;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
movapd xmm3, xmm0
movapd xmm2, xmm0
mulsd xmm2, xmm0 ;x2
bt eax,0
jnc Lcos_sse2_calc_cos
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 1 or 3 do a sin calculation
movsd xmm3, __Lsinarray+50h ; s6
mulsd xmm3, xmm2 ; x2s6
movsd xmm5, __Lsinarray+20h ; s3
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
movapd xmm1, xmm2 ; move for x4
mulsd xmm1, xmm2 ; x4
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
mulsd xmm5, xmm2 ; x2s3
movapd xmm4, xmm0 ; move for x3
addsd xmm3, __Lsinarray+40h ; s5+x2s6
mulsd xmm1, xmm2 ; x6
mulsd xmm3, xmm2 ; x2(s5+x2s6)
mulsd xmm4, xmm2 ; x3
addsd xmm5, __Lsinarray+10h ; s2+x2s3
mulsd xmm5, xmm2 ; x2(s2+x2s3)
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
mulsd xmm2, L_one_half ; 0.5 *x2
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
addsd xmm3, xmm5 ; zs
mulsd xmm4, xmm3 ; *x3
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
addsd xmm0, xmm4 ; +xx
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
jmp Lcos_sse2_adjust_region
ALIGN 16
Lcos_sse2_calc_cos:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 or 2 - do a cos calculation
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
mulsd xmm4, xmm0 ; x*xx
movsd xmm5, L_one_half
movsd xmm1, __Lcosarray+50h ; c6
movsd xmm0, __Lcosarray+20h ; c3
mulsd xmm5, xmm2 ; r = 0.5 *x2
movapd xmm3, xmm2 ; copy of x2
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
mulsd xmm1, xmm2 ; c6*x2
mulsd xmm0, xmm2 ; c3*x2
subsd xmm5, L_one ; -t=r-1.0, trash r
mulsd xmm3, xmm2 ; x4
addsd xmm1, __Lcosarray+40h ; c5+x2c6
addsd xmm0, __Lcosarray+10h ; c2+x2C3
addsd xmm5, L_one ; 1 + (-t), trash t
mulsd xmm3, xmm2 ; x6
mulsd xmm1, xmm2 ; x2(c5+x2c6)
mulsd xmm0, xmm2 ; x2(c2+x2C3)
movapd xmm4, xmm2 ; copy of x2
mulsd xmm4, L_one_half ; r recalculate
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
mulsd xmm2, xmm2 ; x4 recalculate
subsd xmm5, xmm4 ; (1 + (-t)) - r
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
addsd xmm0, xmm1 ; zc
subsd xmm4, L_one ; t relaculate
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
mulsd xmm0, xmm2 ; x4 * zc
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
subsd xmm0, xmm4 ; result - (-t)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lcos_sse2_adjust_region:
; switch (region)
add eax, 1
and eax, 2
jz Lcos_sse2_cleanup
;; if the original region 1 or 2 then we negate the result.
movapd xmm2, xmm0
xorpd xmm0, xmm0
subsd xmm0, xmm2
ALIGN 16
Lcos_sse2_cleanup:
StackDeallocate stack_size
ret
ALIGN 16
Lcos_sse2_absx_lt_piby4:
; cos = cos_piby4(x, 0.0);
; x2 = r * r;
cmp r10, L_two_to_neg_13
jb Lcos_sse2_x_small
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; x2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 - do a cos calculation
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
movsd xmm1, __Lcosarray+10h ; c2
movapd xmm4, xmm2 ; move for x4
mulsd xmm4, xmm2 ; x4
movsd xmm3, __Lcosarray+30h ; c4
mulsd xmm1, xmm2 ; c2x2
movsd xmm5, __Lcosarray+50h ; c6
mulsd xmm3, xmm2 ; c4x2
movapd xmm0, xmm4 ; move for x8
mulsd xmm5, xmm2 ; c6x2
mulsd xmm0, xmm4 ; x8
addsd xmm1, __Lcosarray ; c1 + c2x2
mulsd xmm1, xmm4 ; c1x4 + c2x6
addsd xmm3, __Lcosarray+20h ; c3 + c4x2
mulsd xmm2, L_neg_one_half ; -0.5x2, destroy xmm2
addsd xmm5, __Lcosarray+40h ; c5 + c6x2
mulsd xmm3, xmm0 ; c3x8 + c4x10
mulsd xmm4, xmm0 ; x12
mulsd xmm4, xmm5 ; c5x12 + c6x14
movsd xmm0, L_one
addsd xmm1, xmm3 ; c1x4 + c2x6 + c3x8 + c4x10
movapd xmm3, xmm2 ; preserve -0.5x2
addsd xmm2, xmm0 ; t = 1 - 0.5x2
subsd xmm0, xmm2 ; 1-t
addsd xmm0, xmm3 ; (1-t) - r
addsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
addsd xmm0, xmm1 ; (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
addsd xmm0, xmm2 ; 1 - 0.5x2 + above
StackDeallocate stack_size
ret
ALIGN 16
Lcos_sse2_x_small:
movsd xmm2, xmm0
movsd xmm0, L_one
cmp r10, L_two_to_neg_27
jb Lcos_sse2_x_smaller
mulsd xmm2, xmm2
mulsd xmm2, L_one_half
subsd xmm0, xmm2
StackDeallocate stack_size
ret
ALIGN 16
Lcos_sse2_x_smaller:
movsd xmm0, L_one
addsd xmm0, L_int_one ; really adding smallest subnormal; set inexact
StackDeallocate stack_size
ret
ALIGN 16
Lcos_reduce_precise:
; Reduce x into range [-pi/4, pi/4]
cmp r10, L__inf_mask_64
jae Lcos_x_naninf
call __remainder_piby2_forAsm
; At this point xmm0 has r, xmm1 has rr, rax has region
movapd xmm4, xmm1 ; xmm4 <-- rr
jmp Lcos_piby4
; xmm0 = x, xmm4 = xx, eax= region
ALIGN 16
Lcos_x_naninf:
call fname_special
StackDeallocate stack_size
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; From this point we assume that FMA3 and AVX hardware are present.
ALIGN 16
L_cos_fma3:
vmovq r9,xmm0
mov rax,r9
and r9,L_sign_mask ; clear sign
Lcos_early_exit_s_1: ;; unused label
cmp r9,L_piby4
jg Lcos_early_exit_s ; Note that NaN will branch
cmp r9,L_two_to_neg_13
jge Lcompute_cos_pyby_4
cmp r9,L_two_to_neg_27
jge Lcompute_1_xx_5
vmovq xmm0,L_one ; for tiniest args, cos is 1
jmp Lreturn_no_restore
Lcompute_1_xx_5:
vmulsd xmm1,xmm0,L_one_half ; xmm1l <-- .5*x
vfnmadd213sd xmm0,xmm1,L_one ; xmm0l <-- 1.0 - (.5*x)*x
jmp Lreturn_no_restore
Lcompute_cos_pyby_4:
; make sure this is accurate enough
; note that x^2 can't be all that close to 1 here
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- xx = x*x
vmovapd xmm0,__Lcosarray+050h ; xmm0 <-- c5
vfmadd213sd xmm0,xmm3,__Lcosarray+040h ; xmm0 <-- c5*xx + c4
vfmadd213sd xmm0,xmm3,__Lcosarray+030h ; xmm0 <-- (c5*xx + c4)*xx + c3
vfmadd213sd xmm0,xmm3,__Lcosarray+020h
vfmadd213sd xmm0,xmm3,__Lcosarray+010h
vfmadd213sd xmm0,xmm3,__Lcosarray
vfmsub213sd xmm0,xmm3,L_one_half
vfmadd213sd xmm0,xmm3,L_one
jmp Lreturn_no_restore
Lcos_early_exit_s:
mov r8,L__inf_mask_64
and rax,r8
cmp rax, r8
jz Lcos_x_naninf
Lrange_reduce:
vmovq xmm0,r9 ; r9 <-- |x|
cmp r9,L_small_arg_bdl
jae Lcos_remainder_piby2
; For __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
; on input
; x is in xmm0
; on output
; r is in xmm0
; rr is in xmm1
; region is in rax
; Boldo-Daumas-Li reduction for reasonably small |x|
call __remainder_piby2_fma3_bdl
;; if region is 0 or 2 do a cos calc.
;; if region is 1 or 3 do a sin calc.
Lcos_exit_s:
bt rax,0
jc Lsin_piby4_compute
Lcos_piby4_compute: ;; unused label
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
vmovapd xmm2,L_one
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x * x
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
vsubsd xmm2,xmm2,xmm4 ; 1-t
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
vmovapd xmm5,__Lcosarray+040h
vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2
vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1
vfmadd231sd xmm5,xmm3,__Lcosarray+050h
vfmadd213sd xmm5,xmm3,__Lcosarray+030h
vfmadd213sd xmm5,xmm3,__Lcosarray+020h
vfmadd213sd xmm5,xmm3,__Lcosarray+010h
vfmadd213sd xmm5,xmm3,__Lcosarray
vfmadd213sd xmm5,xmm1,xmm2
vaddsd xmm0,xmm5,xmm4
jmp Lcos_exit_s_1
ALIGN 16
Lsin_piby4_compute:
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
vmovapd xmm5,__Lsinarray+040h
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
vfmadd231sd xmm5,xmm3,__Lsinarray+050h
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r
vmulsd xmm2,xmm3,xmm2
vsubsd xmm2,xmm2,xmm1
vfnmadd231sd xmm2, xmm4,__Lsinarray
vsubsd xmm0,xmm0,xmm2
Lcos_exit_s_1:
xor r8,r8
add eax, 1
and eax, 2
cmovnz r8, L_signbit
vmovq xmm3,r8
vxorpd xmm0,xmm0,xmm3
Lreturn_restore_regs:
StackDeallocate stack_size
ret
Lreturn_no_restore:
StackDeallocate stack_size
ret
ALIGN 16
Lcos_remainder_piby2:
; argument reduction for general x
call __remainder_piby2_fma3
jmp Lcos_exit_s
fname endp
END