reactos/sdk/lib/crt/math/libm_sse2/cosf.asm
2022-12-01 15:21:59 +02:00

526 lines
18 KiB
NASM

;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the cosf function.
;
; Prototype:
;
; float cosf(float x);
;
; Computes cosf(x).
; Based on the NAG C implementation.
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Original Author: Harsha Jagasia
.const
ALIGN 16
L_real_one DQ 03ff0000000000000h ; 1.0
DQ 0 ; for alignment
L_one_half DQ 03fe0000000000000h ; 0.5
DQ 0
L_2bypi DQ 03fe45f306dc9c883h ; 2./pi
DQ 0
L_one_sixth DQ 03fc5555555555555h ; 0.166666666666
DQ 0
L_piby2 DQ 03fe921fb54442d18h
DQ 0
L_piby2_1 DQ 03ff921fb54400000h ; piby2_1
DQ 0
L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
DQ 0
L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
DQ 0
L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
DQ 0
L_large_x_sse2 DQ 0411E848000000000h ; 5e5
DQ 0
L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9
DQ 0
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
DQ 07FFFFFFFFFFFFFFFh
L__int_three DQ 00000000000000003h
DQ 00000000000000003h
L__min_norm_double DQ 00010000000000000h
DQ 00010000000000000h
L_two_to_neg_7 DQ 03f80000000000000h
DQ 0
L_two_to_neg_13 DQ 03f20000000000000h
DQ 0
L_inf_mask_32 DD 07F800000h
DQ 0
fname TEXTEQU <cosf>
fname_special TEXTEQU <_cosf_special>
;Define name and any external functions being called
EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR
EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR
EXTERN __remainder_piby2_fma3 : PROC ; NEAR
EXTERN fname_special : PROC
EXTERN _set_statfp : PROC
EXTRN __Lcosfarray:QWORD
EXTRN __Lsinfarray:QWORD
EXTRN __use_fma3_lib:DWORD
; define local variable storage offsets
p_temp equ 020h ; temporary for get/put bits operation
p_temp1 equ 030h ; temporary for get/put bits operation
dummy_space EQU 040h
stack_size EQU 068h
include fm.inc
.code
ALIGN 16
PUBLIC fname
fname PROC FRAME
StackAllocate stack_size
.ENDPROLOG
cmp DWORD PTR __use_fma3_lib, 0
jne Lcosf_fma3
Lcosf_sse2:
xorpd xmm2, xmm2 ; zeroed out for later use
;; if NaN or inf
movd edx, xmm0
mov eax, 07f800000h
mov r10d, eax
and r10d, edx
cmp r10d, eax
jz Lcosf_sse2_naninf
cvtss2sd xmm0, xmm0
movd rdx, xmm0
; ax = (ux & ~SIGNBIT_DP64);
mov r10, rdx
btr r10, 63 ; r10 <-- |x|
mov r8d, 1 ; for determining region later on
movapd xmm1, xmm0 ; xmm1 <-- copy of x
;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
mov rax, 03fe921fb54442d18h
cmp r10, rax
jg Lcosf_sse2_absx_gt_piby4
; *c = cos_piby4(x, 0.0);
movapd xmm2, xmm0
mulsd xmm2, xmm2 ;x^2
xor eax, eax
mov rdx, r10
movsd xmm5, QWORD PTR L_one_half
jmp Lcosf_sse2_calc_sincosf_piby4 ; done
ALIGN 16
Lcosf_sse2_absx_gt_piby4:
; reduce the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
; xneg = (ax != ux);
movd xmm0, r10 ; xmm0 <-- |x|
cmp r10, QWORD PTR L_large_x_sse2
jae Lcosf_sse2_reduce_precise
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; xmm0=abs(x), xmm1=x
;/* How many pi/2 is x a multiple of? */
movapd xmm2, xmm0
movsd xmm3, QWORD PTR L_2bypi
movapd xmm4, xmm0
movsd xmm5, QWORD PTR L_one_half
mulsd xmm2, xmm3
; movsd xmm5, QWORD PTR L_one_half
; movapd xmm2, xmm0
; mulsd xmm2, QWORD PTR L_2bypi
; movapd xmm4, xmm0
mov r9, r10
shr r9, 52 ; r9 <-- biased exponent of x
; npi2 = (int)(x * twobypi + 0.5);
addsd xmm2, xmm5 ; npi2
movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1
cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2
movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail
cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2
; Subtract the multiple from x to get an extra-precision remainder
; rhead = x - npi2 * piby2_1;
mulsd xmm3, xmm2 ; use piby2_1
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_1tail;
mulsd xmm1, xmm2 ; rtail
movd eax, xmm0
; GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
movapd xmm0, xmm4
subsd xmm0, xmm1
movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2
movd rcx, xmm0 ; rcx <-- rhead-rtail
movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail
; region = npi2 & 3;
; and eax, 3
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
shl rcx, 1 ; strip any sign bit
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
sub r9, rcx ; expdiff
;; if (expdiff > 15)
cmp r9, 15
jle Lcosf_sse2_expdiff_le_15
; The remainder is pretty small compared with x, which
; implies that x is a near multiple of pi/2
; (x matches the multiple to at least 15 bits)
; t = rhead;
movapd xmm1, xmm4
; rtail = npi2 * piby2_2;
mulsd xmm3, xmm2
; rhead = t - rtail;
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
subsd xmm4, xmm3 ; rhead
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
subsd xmm1, xmm4 ; t - rhead
subsd xmm1, xmm3 ; -rtail
subsd xmm5, xmm1 ; rtail
; r = rhead - rtail;
movapd xmm0, xmm4
;HARSHA
;xmm1=rtail
movapd xmm1, xmm5
subsd xmm0, xmm5
; xmm0=r, xmm4=rhead, xmm1=rtail
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lcosf_sse2_expdiff_le_15:
cmp rcx, 03f2h ; is r < 2^-13 ?
jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not
cmp rcx, 03deh ; is r < 2^-33 ?
jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; xmm2 <-- x^2
;; if region is 1 or 3 do a sinf calc.
and r8d, eax
jz Lcosf_sse2_r_small_calc_sin
Lcosf_sse2_r_small_calc_cos:
; region 1 or 3
; use simply polynomial
; *s = x - x*x*x*0.166666666666666666;
movsd xmm3, QWORD PTR L_one_sixth
mulsd xmm3, xmm0 ; * x
mulsd xmm3, xmm2 ; * x^2
subsd xmm0, xmm3 ; xs
jmp Lcosf_sse2_adjust_region
ALIGN 16
Lcosf_sse2_r_small_calc_sin:
; region 0 or 2
; cos = 1.0 - x*x*0.5;
movsd xmm0, QWORD PTR L_real_one ; 1.0
mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2
subsd xmm0, xmm2
jmp Lcosf_sse2_adjust_region
ALIGN 16
Lcosf_sse2_r_very_small:
; then sin(r) = r
; if region is 1 or 3 do a sin calc.
and r8d, eax
jnz Lcosf_sse2_adjust_region
movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1
; By this point, calculations should already have set inexact
jmp Lcosf_sse2_adjust_region
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lcosf_sse2_reduce_precise:
; Reduce abs(x) into range [-pi/4, pi/4]
; remainder_piby2d2f(ax, &r, &region);
mov QWORD PTR p_temp[rsp], rdx ; save ux for use later
mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later
call __remainder_piby2d2f_forAsm
mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later
mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later
mov r8d, 1 ; for determining region later
; Reduced argument is in xmm0. No second word; after all, we started in
; single precision. Region is in rax.
movapd xmm1, xmm0
movsd xmm5, QWORD PTR L_one_half
jmp Lcosf_sse2_calc_sincosf_piby4
; done with reducing the argument. Now perform the sin/cos calculations.
ALIGN 16
Lcosf_sse2_calc_sincosf_piby4:
movapd xmm2, xmm0
mulsd xmm2, xmm0 ; x^2
;; if region is 0 or 2, do a cosf calc
and r8d, eax
jz Lcosf_sse2_do_cosf_calc
; region is 1 or 3: do a sinf calc.
Lcosf_sse2_do_sinf_calc:
movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4
mulsd xmm1, xmm2 ; s4x2
movsd xmm4, xmm2 ; move for x4
mulsd xmm4, xmm2 ; x4
movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2
mulsd xmm5, xmm2 ; s2x2
movsd xmm3, xmm0 ; move for x3
mulsd xmm3, xmm2 ; x3
addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
mulsd xmm1, xmm4 ; s3x4+s4x6
addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6
mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)
addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)
jmp Lcosf_sse2_adjust_region
ALIGN 16
Lcosf_sse2_do_cosf_calc:
; region 0 or 2 - do a cos calculation
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4
movsd xmm4, xmm2 ; move for x4
mulsd xmm1, xmm2 ; c4x2
movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2
mulsd xmm4, xmm2 ; x4
movsd xmm0, QWORD PTR __Lcosfarray ; c0
mulsd xmm3, xmm2 ; c2x2
mulsd xmm0, xmm2 ; c0x2 (=-0.5x2)
addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2
mulsd xmm1, xmm4 ; c3x4 + c4x6
addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2
addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6
mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10
addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2
addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
Lcosf_sse2_adjust_region:
; xmm1 is cos or sin, relies on previous sections to
; switch (region)
add eax, 1
and eax, 2
jz Lcosf_sse2_cleanup
;; if region 1 or 2 then we negate the result.
xorpd xmm2, xmm2
subsd xmm2, xmm0
movapd xmm0, xmm2
ALIGN 16
Lcosf_sse2_cleanup:
cvtsd2ss xmm0, xmm0
StackDeallocate stack_size
ret
Lcosf_sse2_naninf:
call fname_special
StackDeallocate stack_size
ret
ALIGN 16
Lcosf_fma3:
vmovd eax,xmm0
mov r8d,L_inf_mask_32
and eax,r8d
cmp eax, r8d
jz Lcosf_fma3_naninf
vcvtss2sd xmm5,xmm0,xmm0
vmovq r9,xmm5
btr r9,63 ;clear sign
cmp r9,L_piby2
jg Lcosf_fma3_range_reduce
cmp r9,L_two_to_neg_7
jge Lcosf_fma3_compute_cosf_piby_4
cmp r9,L_two_to_neg_13
jge Lcosf_fma3_compute_1_xx_5
vmovq xmm0,QWORD PTR L_real_one
; Here we need to set inexact
vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact
jmp Lcosf_fma3_return
ALIGN 16
Lcosf_fma3_compute_1_xx_5:
vmulsd xmm0,xmm5,QWORD PTR L_one_half
vfnmadd213sd xmm0,xmm5,L_real_one ; xmm9 1.0 - x*x*(double2)0.5
jmp Lcosf_fma3_return
ALIGN 16
Lcosf_fma3_compute_cosf_piby_4:
movsd xmm0,xmm5
vmovapd xmm2,L_real_one
vmulsd xmm3,xmm0,xmm0
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
vsubsd xmm2,xmm2,xmm1
vmovsd xmm1,__Lcosfarray+018h
vfmadd231sd xmm1,xmm3,__Lcosfarray+020h
vfmadd213sd xmm1,xmm3,__Lcosfarray+010h
vfmadd213sd xmm1,xmm3,__Lcosfarray+008h
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
vmovdqa xmm0,xmm2
vfmadd231sd xmm0,xmm1,xmm3
jmp Lcosf_fma3_return
ALIGN 16
Lcosf_fma3_range_reduce:
vmovq xmm0,r9 ; xmm0 <-- |x|
cmp r9,L_large_x_fma3
jge Lcosf_reduce_precise
;cosff_range_e_5_s:
vandpd xmm1,xmm0,L_sign_mask
vmovapd xmm2,L_2bypi
vfmadd213sd xmm2,xmm1,L_one_half
vcvttpd2dq xmm2,xmm2
vpmovsxdq xmm1,xmm2
vandpd xmm4,xmm1,L__int_three ; region xmm4
vshufps xmm1 ,xmm1,xmm1,8
vcvtdq2pd xmm1,xmm1
vmovdqa xmm2,xmm0
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0
vsubsd xmm2,xmm2,xmm0
vsubsd xmm1,xmm2,xmm3
vmovq rax,xmm4
jmp Lcosf_exit_s
ALIGN 16
Lcosf_reduce_precise:
vmovq xmm0,r9 ; r9 <-- |x|
cmp r9,L_large_x_fma3
jge Lcos_remainder_piby2
; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
; have the following conventions:
; on input
; x is in xmm0
; on output
; r is in xmm0
; rr is in xmm1
; region is in rax
; The _bdl routine is guaranteed not to touch r10
Lcos_remainder_piby2_small: ;; unused label
; Boldo-Daumas-Li reduction for reasonably small |x|
call __remainder_piby2_fma3_bdl
jmp Lcosf_exit_s
ALIGN 16
Lcos_remainder_piby2:
; argument reduction for general x
call __remainder_piby2_fma3
Lcosf_exit_s:
bt rax,0
jnc Lcosf_piby4_compute
;sinf_piby4_compute:
; vmovapd xmm1,__Lsinfarray+010h
vmovsd xmm1,__Lsinfarray+010h
vmulsd xmm3,xmm0,xmm0
vfmadd231sd xmm1,xmm3,__Lsinfarray+018h
vfmadd213sd xmm1,xmm3,__Lsinfarray+008h
vfmadd213sd xmm1,xmm3,__Lsinfarray
vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3
vfmadd231sd xmm0,xmm1,xmm3
jmp Lcosf_fma3_adjust_sign
ALIGN 16
Lcosf_piby4_compute:
vmovapd xmm2,L_real_one
vmulsd xmm3,xmm0,xmm0
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
vsubsd xmm2,xmm2,xmm1
vmovsd xmm1,__Lcosfarray+018h
vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
vmovdqa xmm0, xmm2
vfmadd231sd xmm0 ,xmm1,xmm3
Lcosf_fma3_adjust_sign:
; assuming FMA3 ==> AVX ==> SSE4.1
; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one
; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two
; vorpd xmm3,xmm2,xmm1
; vandpd xmm3,xmm3,L_signbit
add rax,1 ; 1,2 --> 2,3
shr rax,1 ; 2,3 --> 1
shl rax,63 ; 1 --> sign bit
vmovq xmm3,rax
vxorpd xmm0,xmm0,xmm3
Lcosf_fma3_return:
vcvtsd2ss xmm0,xmm0,xmm0
StackDeallocate stack_size
ret
Lcosf_fma3_naninf:
call fname_special
StackDeallocate stack_size
ret
fname endp
END