reactos/sdk/lib/crt/math/libm_sse2/sin.asm
2022-12-01 15:21:59 +02:00

512 lines
19 KiB
NASM

;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the sin function.
;
; Prototype:
;
; double sin(double x);
;
; Computes sin(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
; If FMA3 hardware is available, an FMA3 implementation of sin will be used.
.const
ALIGN 16
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
DQ 0
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
DQ 0
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
DQ 0
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
DQ 0
ALIGN 16
L_one DQ 03FF0000000000000h, 03FF0000000000000h
L_signbit DQ 08000000000000000h, 00000000000000000h
L_int_one DQ 00000000000000001h, 00000000000000000h
L_int_two DQ 00000000000000002h, 00000000000000000h
L_int_three DQ 00000000000000003h, 00000000000000000h
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
L_one_half DQ 03FE0000000000000h ; .5
L_one_sixth DQ 03FC5555555555555h ; .1666...
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
L_piby4 DQ 03FE921FB54442D18h ; pi/4
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
EXTRN __Lcosarray:QWORD
EXTRN __Lsinarray:QWORD
EXTRN __use_fma3_lib:DWORD
; define local variable storage offsets
p_temp EQU 030h
p_temp1 EQU 040h
save_r10 EQU 050h
dummy_space EQU 060h
stack_size EQU 078h
include fm.inc
fname TEXTEQU <sin>
fname_special TEXTEQU <_sin_special>
;Define name and any external functions being called
EXTERN __remainder_piby2_forAsm : PROC
EXTERN __remainder_piby2_fma3 : PROC
EXTERN __remainder_piby2_fma3_bdl : PROC
EXTERN fname_special : PROC
.code
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Lsin_fma3
Lsin_sse2:
movd rdx, xmm0
xorpd xmm2, xmm2 ; zeroed out for later use
mov r10,rdx
mov r8d, 1 ; for determining region later on
btr r10,63 ; r10 <-- |x|
cmp r10,L_piby4
jb Lsin_sse2_absx_lt_piby4
Lsin_sse2_absx_nlt_piby4: ; common case
mov r11,rdx
shr r11,63
movd xmm0,r10 ; xmm0 <-- |x|
cmp r10, QWORD PTR L_small_arg_cw
jae Lsin_reduce_precise ; Note NaN/Inf will branch
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
; Note that if |x| were too large, conversion of npi2 to integer would fail.
; We reduce the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
movapd xmm2, xmm0
mulsd xmm2, L_2_by_pi
movapd xmm4, xmm0
; xexp = ax >> EXPSHIFTBITS_DP64;
mov r9, r10
shr r9, 52 ; >>EXPSHIFTBITS_DP64
; How many pi/2 is |x| a multiple of?
; npi2 = (int)(x * twobypi + 0.5);
addsd xmm2, L_one_half ; npi2
movsd xmm3, L_real_piby2_1
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer
movsd xmm1, L_real_piby2_1tail
cvtdq2pd xmm2, xmm0 ; npi2 back to double
; Subtract the multiple from x to get an extra-precision remainder
; rhead = x - npi2 * piby2_1;
mulsd xmm3, xmm2
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_1tail;
mulsd xmm1, xmm2 ; rtail
movd eax, xmm0 ; eax <-- npi2
; GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
movapd xmm0, xmm4
subsd xmm0, xmm1
movsd xmm3, L_real_piby2_2
movd rcx, xmm0 ; rcx <-- rhead - rtail
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
; xmm4=rhead, xmm5= temp for calc
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
; expdiff measures how close rhead - rtail is to |x|
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
shl rcx, 1 ; strip any sign bit
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
sub r9, rcx ; expdiff
;; if (expdiff > 15)
cmp r9, 15
jle Lsin_sse2_cw_reduction_done
; Here the remainder is pretty small compared with x, which
; implies that x is a near multiple of pi/2
; (x matches the multiple to at least 15 bits)
; So we do another stage of argument reduction.
; t = rhead;
movapd xmm1, xmm4
; rtail = npi2 * piby2_2;
mulsd xmm3, xmm2
; rhead = t - rtail;
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subsd xmm1, xmm4 ; t - rhead
subsd xmm1, xmm3 ; -rtail
subsd xmm5, xmm1 ; rtail
; r = rhead - rtail;
movapd xmm0, xmm4
;HARSHA
;xmm1=rtail
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
subsd xmm0, xmm5
; xmm0=r, xmm4=rhead, xmm1=rtail
Lsin_sse2_cw_reduction_done:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; if the input was close to a pi/2 multiple
; The original NAG code missed this trick.
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
; then the sin is either ~ 1.0 or ~r, to within 53 bits.
; Note: Unfortunately this introduces two jcc instructions close to each
; other and to other branches. As r < 2^-13 should be rather uncommon, it
; almost certainly costs more than it saves. - WAT
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region = npi2 & 3;
subsd xmm4, xmm0 ; rhead-r
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
Lsin_piby4:
; perform taylor series to calc sinx, sinx for |x| <= pi/4
; x2 = r * r;
;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path
;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
movapd xmm3, xmm0
movapd xmm2, xmm0
mulsd xmm2, xmm0 ;x2
bt eax,0
jc Lsin_sse2_calc_cos
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 or 2 do a sin calculation
movsd xmm3, __Lsinarray+50h ; s6
mulsd xmm3, xmm2 ; x2s6
movsd xmm5, __Lsinarray+20h ; s3
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
movapd xmm1, xmm2 ; move for x4
mulsd xmm1, xmm2 ; x4
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
mulsd xmm5, xmm2 ; x2s3
movapd xmm4, xmm0 ; move for x3
addsd xmm3, __Lsinarray+40h ; s5+x2s6
mulsd xmm1, xmm2 ; x6
mulsd xmm3, xmm2 ; x2(s5+x2s6)
mulsd xmm4, xmm2 ; x3
addsd xmm5, __Lsinarray+10h ; s2+x2s3
mulsd xmm5, xmm2 ; x2(s2+x2s3)
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
mulsd xmm2, L_one_half ; 0.5 *x2
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
addsd xmm3, xmm5 ; zs
mulsd xmm4, xmm3 ; *x3
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
addsd xmm0, xmm4 ; +xx
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
jmp Lsin_sse2_adjust_region
ALIGN 16
Lsin_sse2_calc_cos:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 1 or 3 - do a cos calculation
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
mulsd xmm4, xmm0 ; x*xx
movsd xmm5, L_one_half
movsd xmm1, __Lcosarray+50h ; c6
movsd xmm0, __Lcosarray+20h ; c3
mulsd xmm5, xmm2 ; r = 0.5 *x2
movapd xmm3, xmm2 ; copy of x2
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
mulsd xmm1, xmm2 ; c6*x2
mulsd xmm0, xmm2 ; c3*x2
subsd xmm5, L_one ; -t=r-1.0, trash r
mulsd xmm3, xmm2 ; x4
addsd xmm1, __Lcosarray+40h ; c5+x2c6
addsd xmm0, __Lcosarray+10h ; c2+x2C3
addsd xmm5, L_one ; 1 + (-t), trash t
mulsd xmm3, xmm2 ; x6
mulsd xmm1, xmm2 ; x2(c5+x2c6)
mulsd xmm0, xmm2 ; x2(c2+x2C3)
movapd xmm4, xmm2 ; copy of x2
mulsd xmm4, L_one_half ; r recalculate
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
mulsd xmm2, xmm2 ; x4 recalculate
subsd xmm5, xmm4 ; (1 + (-t)) - r
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
addsd xmm0, xmm1 ; zc
subsd xmm4, L_one ; t relaculate
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
mulsd xmm0, xmm2 ; x4 * zc
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
subsd xmm0, xmm4 ; result - (-t)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsin_sse2_adjust_region:
; positive or negative
; switch (region)
shr eax, 1
mov ecx, eax
and eax, r11d
not ecx
not r11d
and ecx, r11d
or eax, ecx
and eax, 1
jnz Lsin_sse2_cleanup
;; if the original region 0, 1 and arg is negative, then we negate the result.
;; if the original region 2, 3 and arg is positive, then we negate the result.
movapd xmm2, xmm0
xorpd xmm0, xmm0
subsd xmm0, xmm2
ALIGN 16
Lsin_sse2_cleanup:
StackDeallocate stack_size
ret
ALIGN 16
Lsin_sse2_absx_lt_piby4:
; sin = sin_piby4(x, 0.0);
; x2 = r * r;
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; x2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 - do a sin calculation
; zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6))));
movsd xmm3, __Lsinarray+50h ; s6
mulsd xmm3, xmm2 ; x2s6
movsd xmm5, __Lsinarray+20h ; s3
movapd xmm1, xmm2 ; move for x4
mulsd xmm1, xmm2 ; x4
mulsd xmm5, xmm2 ; x2s3
movapd xmm4, xmm0 ; move for x3
addsd xmm3, __Lsinarray+40h ; s5+x2s6
mulsd xmm1, xmm2 ; x6
mulsd xmm3, xmm2 ; x2(s5+x2s6)
mulsd xmm4, xmm2 ; x3
addsd xmm5, __Lsinarray+10h ; s2+x2s3
mulsd xmm5, xmm2 ; x2(s2+x2s3)
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
addsd xmm3, xmm5 ; zs
mulsd xmm4, xmm3 ; *x3
addsd xmm0, xmm4 ; +x
jmp Lsin_sse2_cleanup
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsin_reduce_precise:
; Reduce x into range [-pi/4, pih/4]
cmp r10,L__inf_mask_64
jae Lsin_x_naninf
mov QWORD PTR p_temp[rsp], r11
call __remainder_piby2_forAsm
mov r11, QWORD PTR p_temp[rsp]
; At this point xmm0 has r, xmm1 has rr, rax has region
movapd xmm4, xmm1 ; xmm4 <-- rr
jmp Lsin_piby4
; xmm0 = x, xmm4 = xx, eax= region
ALIGN 16
Lsin_x_naninf:
call fname_special
StackDeallocate stack_size
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; From this point we assume that FMA3 and AVX hardware are present.
ALIGN 16
Lsin_fma3:
vmovq r9,xmm0
mov r10,r9 ; save x to get sign later
btr r9,63 ; r9 <-- |x|
cmp r9,L_piby4
jae Lsin_fma3_absx_nlt_piby4 ; Note that NaN will branch
cmp r9,L_two_to_neg_13
jae Lsin_fma3_calc_sin_for_absx_lt_piby4
cmp r9,L_two_to_neg_27
jae Lsin_fma3_compute_x_xxx_0_1666
StackDeallocate stack_size
ret ; sin x ~= x for |x| < 2^-27
ALIGN 16
Lsin_fma3_compute_x_xxx_0_1666: ; |x| in [2^-27,2^-13]
vmulsd xmm1,xmm0,xmm0 ; xmm1l <-- x*x
vmulsd xmm1,xmm1,xmm0 ; xmm1l <-- x*x*x
vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6)
StackDeallocate stack_size
ret
ALIGN 16
Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4]
vmovsd xmm5,__Lsinarray+050h
vmulsd xmm3,xmm0,xmm0 ; xmm3l <-- x^2
vfmadd213sd xmm5,xmm3,__Lsinarray+040h
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
vfmadd213sd xmm5,xmm3,__Lsinarray+010h
vmulsd xmm4,xmm0,xmm3 ; xmm4l <-- x^3
vfmadd213sd xmm5,xmm3,__Lsinarray
vfmadd231sd xmm0,xmm4,xmm5 ; xmm0l <-- x + x^3 p(x^2)
StackDeallocate stack_size
ret
ALIGN 16
Lsin_fma3_absx_nlt_piby4: ; !(|x| < pi/4)
; here r9 has |x|
cmp r9,L__inf_mask_64
jae Lsin_x_naninf
;Lrange_reduce: ;; unused label
vmovq xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_small_arg_bdl
jae Lsin_fma3_do_general_arg_reduction
; Note that __remainder_piby2_fma3 conventions are
; on input
; |x| is in xmm0
; on output
; r is in xmm0
; rr is in xmm1
; region of |x| is in rax
; Boldo-Daumas-Li reduction for reasonably small |x|
call __remainder_piby2_fma3_bdl
Lsin_fma3_exit_s:
bt rax,0
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
jc Lsin_fma3_calc_cos
Lsin_fma3_calc_sin: ;; unused label
; region 0 or 2
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
vmovsd xmm5,__Lsinarray+050h
vfmadd213sd xmm5,xmm3,__Lsinarray+040h
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r
vmulsd xmm2,xmm3,xmm2
vsubsd xmm2,xmm2,xmm1
vfnmadd231sd xmm2, xmm4,__Lsinarray
vsubsd xmm0,xmm0,xmm2
jmp Lsin_fma3_exit_s_1
ALIGN 16
Lsin_fma3_calc_cos:
; region 1 or 3
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
vmovapd xmm2,L_one
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
vsubsd xmm2,xmm2,xmm4 ; 1-t
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
vmovsd xmm5,__Lcosarray+050h
vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2
vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1
vfmadd213sd xmm5,xmm3,__Lcosarray+040h
vfmadd213sd xmm5,xmm3,__Lcosarray+030h
vfmadd213sd xmm5,xmm3,__Lcosarray+020h
vfmadd213sd xmm5,xmm3,__Lcosarray+010h
vfmadd213sd xmm5,xmm3,__Lcosarray
vfmadd213sd xmm5,xmm1,xmm2
vaddsd xmm0,xmm5,xmm4
Lsin_fma3_exit_s_1:
xor r8,r8 ; prepare r8 for cmov
and r10,L_signbit ; isolate original sign of x
bt eax,1
cmovc r8,L_signbit
xor r8,r10
vmovq xmm3,r8
vxorpd xmm0,xmm0,xmm3
StackDeallocate stack_size
ret
ALIGN 16
Lsin_fma3_do_general_arg_reduction:
; argument reduction for general x
; NOTE: the BDL argument reduction routine does not touch r10,
; but the general-purpose reduction does.
mov QWORD PTR [save_r10+rsp], r10
call __remainder_piby2_fma3
mov r10, QWORD PTR [save_r10+rsp]
jmp Lsin_fma3_exit_s
fname endp
END