reactos/sdk/lib/crt/math/libm_sse2/tanf.asm
2022-12-01 15:21:59 +02:00

552 lines
19 KiB
NASM

;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the tanf function with separate SSE2 and FMA3 code paths.
;
; Prototype:
;
; float tanf(float x);
;
; Computes tanf(x).
; It will provide proper C99 return values,
; but may not raise floating point status bits properly.
; Based on the NAG C implementation.
;
; ---------------------------------------------------------------------------
; Read-only constants used by both the SSE2 and the FMA3 code paths of tanf.
; DQ entries are IEEE-754 double bit patterns unless noted otherwise.  The
; first five constants are duplicated into 16-byte pairs because the code
; uses them as 128-bit memory operands of packed instructions (andpd/movapd).
; ---------------------------------------------------------------------------
.const
ALIGN 16
L_sign_mask DQ 07FFFFFFFFFFFFFFFh ; all bits except the sign bit (AND yields |x|)
DQ 07FFFFFFFFFFFFFFFh
L_twobypi DQ 03FE45F306DC9C883h ; 2/pi
DQ 03FE45F306DC9C883h
L_int_three DQ 00000000000000003h ; integer 3, mask that extracts the region (n & 3)
DQ 00000000000000003h
L_int_one DQ 00000000000000001h ; integer 1, mask that extracts region & 1
DQ 00000000000000001h
L_signbit DQ 08000000000000000h ; sign bit only
DQ 08000000000000000h
; Coefficients of the rational approximation used on the reduced argument:
;   tan(x) ~= x + x^3 * (c1*x^2 + c0) / ((d2*x^2 + d1)*x^2 + d0)
; The code addresses them as L_tanf+000h .. L_tanf+020h, so the order matters.
L_tanf DQ 03FD8A8B0DA56CB17h ; c0
DQ 0BF919DBA6EFD6AADh ; c1
DQ 03FF27E84A3E73A2Eh ; d0
DQ 0BFE07266D7B3511Bh ; d1
DQ 03F92E29003C692D9h ; d2
; |x| at or above these thresholds takes the large-argument reduction that
; multiplies by the bits of 2/pi; each code path has its own threshold.
L_large_x_sse2 DQ 04160000000000000h ; 8388608.
L_large_x_fma3 DQ 041E921FB40000000h ; 3.373259264e9
L_point_333 DQ 03FD5555555555555h ; 1/3 (0.3333333333333333)
L_mask_3e4 DQ 03e40000000000000h ; 2^-27: below this tanf returns x itself
L_mask_3f2 DQ 03f20000000000000h ; 2^-13: below this x + x^3/3 is used
L_point_five DQ 03FE0000000000000h ; 0.5, added before truncation to round to nearest
L_piby2_1 DQ 03FF921FB54400000h ; pi/2 with the low mantissa bits cleared (head part)
L_piby2_1tail DQ 03DD0B4611A626331h ; tail part that goes with L_piby2_1
L_piby2_lead DQ 03ff921fb54442d18h ; pi/2
L_n_one DQ 0BFF0000000000000h ; -1.0
L_piby4 DQ 03fe921fb54442d18h ; pi/4: no range reduction at or below this
L_min_norm DQ 00010000000000000h ; 2^-1022, smallest normal double
; Single-precision exponent field; an input with all of these bits set is
; a NaN or an infinity.
L_inf_mask_32 DD 07F800000h
DD 07F800000h
; Non-zero selects the FMA3 code path, zero selects the SSE2 code path.
EXTRN __use_fma3_lib:DWORD
; Byte table holding the bits of 2/pi, read by the large-argument reduction.
EXTRN __L_2_by_pi_bits:BYTE
fname TEXTEQU <tanf>
; Handler that is called for NaN and infinite inputs.
fname_special TEXTEQU <_tanf_special>
; define local variable storage offsets
; actually there aren't any, but we need to leave room for _tanf_special.
; NOTE(review): 20h matches the 32-byte shadow space a Win64 caller must
; reserve before a call; dummy_space itself is not referenced in this file's
; visible code - confirm against fm.inc.
dummy_space EQU 20h
; Passed to the StackAllocate/StackDeallocate macros (presumably from
; fm.inc - not visible here).
stack_size EQU 38h
include fm.inc
;Define name and any external functions being called
EXTERN fname_special : PROC
.code
PUBLIC fname
;-----------------------------------------------------------------------------
; float tanf(float x)
;
; In:    xmm0 = x (single precision)
; Out:   xmm0 = tanf(x) (single precision)
; Uses:  rax, rcx, rdx, r8-r11, xmm0-xmm5, flags
;
; __use_fma3_lib selects the implementation: zero runs the SSE2 path, which
; must only contain legacy-encoded SSE2 instructions; non-zero runs the FMA3
; path, which uses VEX-encoded instructions throughout.
;
; Both paths follow the same outline:
;   1. All float exponent bits set (NaN/Inf)  -> call fname_special.
;   2. Widen x to double in xmm5 (kept until the end for its sign bit) and
;      keep the bit pattern of |x| in r9.  Because |x| is non-negative, the
;      integer compares on r9 order the values the same way as the doubles.
;   3.  |x| <  2^-27 : return x itself
;       |x| <  2^-13 : x + x^3 * 0.3333333333333333
;       |x| <= pi/4  : x + x^3 * c(x^2)/d(x^2)
;       otherwise    : reduce |x| to r plus a region number (xmm4), evaluate
;                      the same rational approximation on r, take -1/result
;                      when the region is odd, then xor in the sign of x.
;   4. Convert the double result back to single precision.
;-----------------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    cmp DWORD PTR __use_fma3_lib, 0
    jne Ltanf_fma3

;=============================================================================
; SSE2 path
;=============================================================================
Ltanf_sse2:
    ; NaN/Inf check on the raw float bits: exponent field all ones.
    movd eax,xmm0
    mov r8d,L_inf_mask_32
    and eax,r8d
    cmp eax, r8d
    jz Ltanf_sse2_naninf
    cvtss2sd xmm5,xmm0          ; xmm5 <-- x as double (sign kept)
    movd r9,xmm5
    btr r9,63                   ; r9 <-- |x|
    cmp r9,L_piby4
    jg Ltanf_sse2_range_reduce
    cmp r9,L_mask_3f2           ; compare to 2^-13 = 0.0001220703125
    jge Ltanf_sse2_compute_tanf_piby_4
    cmp r9,L_mask_3e4           ; compare to 2^-27 = 7.4505805969238281e-009
    jge Ltanf_sse2_compute_x_xxx_0_333
    ; At this point tan(x) ~= x; if it's not exact, set the inexact flag.
    ; xmm0 still holds the original float, which is the return value.
    test r9, r9
    je Ltanf_sse2_exact_return  ; x == 0: result is exact
    movsd xmm1, L_n_one
    addsd xmm1, L_min_norm      ; set inexact (the sum itself is discarded)
Ltanf_sse2_exact_return:
    StackDeallocate stack_size
    ret

ALIGN 16
Ltanf_sse2_compute_x_xxx_0_333:
    movapd xmm2,xmm5
    mulsd xmm2,xmm2             ; xmm2 <-- x^2
    movapd xmm0,xmm2
    mulsd xmm0,xmm5             ; xmm0 <-- x^3
    mulsd xmm0,L_point_333
    addsd xmm0,xmm5             ; x + x*x*x*0.3333333333333333;
    jmp Ltanf_sse2_return_s

ALIGN 16
Ltanf_sse2_compute_tanf_piby_4:
    movapd xmm0,xmm5            ; xmm0 <-- x (as double)
    movapd xmm1,xmm0
    mulsd xmm1,xmm0             ; xmm1 <-- x*x
    movsd xmm3,L_tanf+008h      ; xmm3 <-- c1
    mulsd xmm3,xmm1             ; xmm3 <-- c1*x^2
    addsd xmm3,L_tanf           ; xmm3 <-- c = c1*x^2 + c0
    movsd xmm2,L_tanf+020h      ; xmm2 <-- d2
    mulsd xmm2,xmm1             ; xmm2 <-- d2*x^2
    addsd xmm2,L_tanf+018h      ; xmm2 <-- d2*x^2 + d1
    mulsd xmm2,xmm1             ; xmm2 <-- (d2*x^2 + d1)*x^2
    addsd xmm2,L_tanf+010h      ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    divsd xmm3,xmm2             ; xmm3 <-- c/d
    mulsd xmm1,xmm0             ; xmm1 <-- x^3
    mulsd xmm1,xmm3             ; xmm1 <-- x^3 * c/d
    addsd xmm0,xmm1             ; xmm0 <-- x + x^3 * c/d
    jmp Ltanf_sse2_return_s

Ltanf_sse2_range_reduce:
    movd xmm0,r9                ; xmm0 <-- |x|
    cmp r9,L_large_x_sse2
    jge Ltanf_sse2_tanf_reduce_large

    ; Moderate |x|: n = trunc(|x|*2/pi + 0.5), r = |x| - n*pi/2 with pi/2
    ; split into a head (L_piby2_1) and a tail (L_piby2_1tail).
Ltanf_sse2_tanf_reduce_moderate:
    movapd xmm1,xmm0
    andpd xmm1,L_sign_mask
    movapd xmm2,L_twobypi
    mulsd xmm2,xmm1
    addsd xmm2,L_point_five
    cvttpd2dq xmm4,xmm2         ; xmm4 <-- n (integer)
    cvtdq2pd xmm1,xmm4          ; xmm1 <-- n (double)
    andpd xmm4,L_int_three      ; xmm4 <-- region
    movapd xmm2,xmm0
    movapd xmm3,xmm1
    mulsd xmm1,L_piby2_1
    subsd xmm2,xmm1             ; xmm2 rhead
    mulsd xmm3,L_piby2_1tail    ; xmm3 rtail
    movapd xmm0,xmm2
    subsd xmm0,xmm3             ; xmm0 <-- r = rhead - rtail
    subsd xmm2,xmm0
    movapd xmm1,xmm2
    subsd xmm1,xmm3             ; xmm1 <-- (rhead - r) - rtail; overwritten at Ltanf_sse2_exit_s
    jmp Ltanf_sse2_exit_s

    ; Large |x|: multiply the 53-bit mantissa of x by a 192-bit window of the
    ; bits of 2/pi using three 64x64 multiplies, take the region from the
    ; integer part of the product and rebuild the fraction as a double.
Ltanf_sse2_tanf_reduce_large:
    lea r9,__L_2_by_pi_bits
    ;xexp = (x >> 52) - 1023
    movd r11,xmm0
    mov rcx,r11
    shr r11,52
    sub r11,1023                ; r11 <-- xexp = exponent of input x
    ;calculate the last byte from which to start multiplication
    ;last = 134 - (xexp >> 3)
    mov r10,r11
    shr r10,3
    sub r10,134                 ; r10 <-- -last
    neg r10                     ; r10 <-- last
    ;load 64 bits of 2_by_pi
    mov rax,[r9+r10]
    ;mantissa of x = ((x << 12) >> 12) | implied bit
    shl rcx,12
    shr rcx,12                  ; rcx <-- mantissa part of input x
    bts rcx,52                  ; add the implied bit as well
    ;load next 128 bits of 2_by_pi
    add r10,8                   ; increment to next 8 bytes of 2_by_pi
    movdqu xmm0,[r9+r10]
    ;do three 64bit multiplications with mant of x
    mul rcx
    mov r8,rax                  ; r8 = last 64 bits of mul = res1[2]
    mov r10,rdx                 ; r10 = carry
    ; Legacy SSE2 encoding on purpose (this used to be a VEX-encoded vmovq):
    ; the SSE2 path is the one taken when __use_fma3_lib is zero, so it must
    ; not contain instructions that require AVX.
    movd rax,xmm0
    mul rcx
    ;resexp = xexp & 7
    and r11,7                   ; r11 <-- resexp = last 3 bits of xexp
    psrldq xmm0,8
    add rax,r10                 ; add the previous carry
    adc rdx,0
    mov r9,rax                  ; r9 <-- next 64 bits of mul = res1[1]
    mov r10,rdx                 ; r10 <-- carry
    movd rax,xmm0
    mul rcx
    add r10,rax                 ;r10 = most sig 64 bits = res1[0]
    ;find the region
    ;last three bits ltb = most sig bits >> (54 - resexp)
    ; decimal point in last 18 bits == 8 lsb's in first 64 bits
    ; and 8 msb's in next 64 bits
    ;point_five = ltb & 01h;
    ;region = ((ltb >> 1) + point_five) & 3;
    mov rcx,54
    mov rax,r10
    sub rcx,r11
    xor rdx,rdx                 ;rdx = sign of the reduced argument (0 = positive)
    shr rax,cl                  ; CF <-- point_five (last bit shifted out)
    jnc Ltanf_sse2_no_point_five_f
    ;;if there is carry.. then negate the result of multiplication
    ; (not/mov leave CF untouched, so the adc below still sees point_five)
    not r10
    not r9
    not r8
    mov rdx,08000000000000000h
ALIGN 16
Ltanf_sse2_no_point_five_f:
    adc rax,0
    and rax,3
    movd xmm4,eax               ; xmm4 <-- region
    ;calculate the number of integer bits and zero them out
    mov rcx,r11
    add rcx,10                  ; rcx = no. of integer bits
    shl r10,cl
    shr r10,cl                  ; r10 contains only mant bits
    sub rcx,64                  ; form the exponent
    mov r11,rcx
    ;find the highest set bit
    bsr rcx,r10
    jnz Ltanf_sse2_form_mantissa_f
    ; top word is zero: shift the whole product up by one 64-bit word
    mov r10,r9
    mov r9,r8
    mov r8,0
    bsr rcx,r10                 ;rcx = hsb
    sub r11,64
ALIGN 16
Ltanf_sse2_form_mantissa_f:
    add r11,rcx                 ; for exp of x
    sub rcx,52                  ; rcx = no. of bits to shift in r10
    cmp rcx,0
    jl Ltanf_sse2_hsb_below_52_f
    je Ltanf_sse2_form_numbers_f
    ;hsb above 52
    mov r8,r10
    shr r10,cl                  ; r10 = mantissa of x with hsb at 52
    shr r9,cl                   ; make space for bits from r10
    sub rcx,64
    neg rcx                     ; rcx = no of bits to shift r10
    shl r8,cl
    or r9,r8                    ; r9 = mantissa bits of xx
    jmp Ltanf_sse2_form_numbers_f
ALIGN 16
Ltanf_sse2_hsb_below_52_f:
    neg rcx
    mov rax,r9
    shl r10,cl
    shl r9,cl
    sub rcx,64
    neg rcx
    shr rax,cl
    or r10,rax
    shr r8,cl
    or r9,r8
ALIGN 16
Ltanf_sse2_form_numbers_f:
    add r11,1023                ; bias the exponent
    btr r10,52                  ; remove the implied bit
    mov rcx,r11
    or r10,rdx                  ; put the sign
    shl rcx,52
    or r10,rcx                  ; x is in r10
    movd xmm0,r10               ; xmm0 <-- x
    mulsd xmm0,L_piby2_lead     ; scale the fraction by pi/2 to get r

    ; Common tail of both reductions: xmm0 = r, xmm4 = region.
Ltanf_sse2_exit_s:
    movd eax,xmm4
    and eax,1                   ; eax <-- region & 1
    movapd xmm1,xmm0
    mulsd xmm1,xmm0             ; xmm1 <-- x*x
    movsd xmm3,L_tanf+008h      ; xmm3 <-- c1
    mulsd xmm3,xmm1             ; xmm3 <-- c1*x^2
    addsd xmm3,L_tanf           ; xmm3 <-- c = c1*x^2 + c0
    movsd xmm2,L_tanf+020h      ; xmm2 <-- d2
    mulsd xmm2,xmm1             ; xmm2 <-- d2*x^2
    addsd xmm2,L_tanf+018h      ; xmm2 <-- d2*x^2 + d1
    mulsd xmm2,xmm1             ; xmm2 <-- (d2*x^2 + d1)*x^2
    addsd xmm2,L_tanf+010h      ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    divsd xmm3,xmm2             ; xmm3 <-- c/d
    mulsd xmm1,xmm0             ; xmm1 <-- x^3
    mulsd xmm1,xmm3             ; xmm1 <-- x^3 * c/d
    addsd xmm0,xmm1             ; xmm0 <-- x + x^3 * c/d
    cmp eax,01h
    jne Ltanf_sse2_exit_tanpiby4
    ; odd region: result <-- L_n_one / result
Ltanf_sse2_recip :
    movd xmm3,L_n_one
    divsd xmm3,xmm0
    movsd xmm0,xmm3
Ltanf_sse2_exit_tanpiby4 :
    ; the reduction worked on |x|; xor the sign bit of the input back in
    andpd xmm5,L_signbit
    xorpd xmm0,xmm5
Ltanf_sse2_return_s:
    cvtsd2ss xmm0,xmm0          ; narrow the double result to float
Ltanf_sse2_return_c:
    StackDeallocate stack_size
    ret

    ; NaN or infinity: hand over to the special-case handler and return
    ; whatever it leaves behind.
Ltanf_sse2_naninf:
    call fname_special
    StackDeallocate stack_size
    ret

;=============================================================================
; FMA3 path - same algorithm, VEX-encoded, with fused multiply-adds
;=============================================================================
ALIGN 16
Ltanf_fma3:
    ; NaN/Inf check on the raw float bits: exponent field all ones.
    vmovd eax,xmm0
    mov r8d,L_inf_mask_32
    and eax,r8d
    cmp eax, r8d
    jz Ltanf_fma3_naninf
    vcvtss2sd xmm5,xmm0,xmm0    ; xmm5 <-- x as double (sign kept)
    vmovq r9,xmm5
    btr r9,63                   ; r9 <-- |x|
    cmp r9,L_piby4
    jg Ltanf_fma3_range_reduce
    cmp r9,L_mask_3f2           ; compare to 2^-13
    jge Ltanf_fma3_compute_tanf_piby_4
    cmp r9,L_mask_3e4           ; compare to 2^-27
    jge Ltanf_fma3_compute_x_xxx_0_333
    jmp Ltanf_fma3_return_c     ; tan(x) ~= x: return the input float as is

Ltanf_fma3_compute_x_xxx_0_333:
    vmulsd xmm2,xmm5,xmm5       ; xmm2 <-- x^2
    vmulsd xmm0,xmm2,xmm5       ; xmm0 <-- x^3
    vfmadd132sd xmm0,xmm5,L_point_333 ; x + x*x*x*0.3333333333333333;
    jmp Ltanf_fma3_return_s

Ltanf_fma3_compute_tanf_piby_4:
    vmovsd xmm0,xmm5,xmm5       ; xmm0 <-- x (as double)
    vmulsd xmm1,xmm0,xmm0       ; xmm1 <-- x*x
    vmovsd xmm3,L_tanf+008h     ; xmm3 <-- c1
    vfmadd213sd xmm3,xmm1,L_tanf ; xmm3 <-- c = c1*x^2 + c0
    vmovsd xmm2,L_tanf+020h     ; xmm2 <-- d2
    vfmadd213sd xmm2,xmm1,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
    vfmadd213sd xmm2,xmm1,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    vdivsd xmm3,xmm3,xmm2       ; xmm3 <-- c/d
    vmulsd xmm1,xmm1,xmm0       ; xmm1 <-- x^3
    vfmadd231sd xmm0,xmm1,xmm3  ; xmm0 <-- x + x^3 * c/d
    jmp Ltanf_fma3_return_s

Ltanf_fma3_range_reduce:
    vmovq xmm0,r9               ; xmm0 <-- |x|
    cmp r9,L_large_x_fma3
    jge Ltanf_fma3_tanf_reduce_large

    ; Moderate |x|: n = trunc(|x|*2/pi + 0.5), r = |x| - n*pi/2 with pi/2
    ; split into a head (L_piby2_1) and a tail (L_piby2_1tail).
Ltanf_fma3_tanf_reduce_moderate:
    vandpd xmm1,xmm0,L_sign_mask
    vmovapd xmm2,L_twobypi
    vfmadd213sd xmm2,xmm1,L_point_five
    vcvttpd2dq xmm2,xmm2        ; xmm2 <-- n (32-bit integer)
    vpmovsxdq xmm1,xmm2         ; xmm1 <-- n sign-extended to 64 bits
    vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region
    vshufps xmm1 ,xmm1,xmm1,8   ; pack the low dwords of both qwords together
    vcvtdq2pd xmm1,xmm1         ; xmm1 <-- n (double)
    vmovdqa xmm2,xmm0
    vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
    vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
    vsubsd xmm0,xmm2,xmm3       ; xmm0 <-- r = rhead - rtail
    vsubsd xmm2,xmm2,xmm0
    vsubsd xmm1,xmm2,xmm3       ; xmm1 <-- (rhead - r) - rtail; overwritten at Ltanf_fma3_exit_s
    jmp Ltanf_fma3_exit_s

    ; Large |x|: multiply the 53-bit mantissa of x by a 192-bit window of the
    ; bits of 2/pi using three 64x64 multiplies, take the region from the
    ; integer part of the product and rebuild the fraction as a double.
Ltanf_fma3_tanf_reduce_large:
    lea r9,__L_2_by_pi_bits
    ;xexp = (x >> 52) - 1023
    vmovq r11,xmm0
    mov rcx,r11
    shr r11,52
    sub r11,1023                ; r11 <-- xexp = exponent of input x
    ;calculate the last byte from which to start multiplication
    ;last = 134 - (xexp >> 3)
    mov r10,r11
    shr r10,3
    sub r10,134                 ; r10 <-- -last
    neg r10                     ; r10 <-- last
    ;load 64 bits of 2_by_pi
    mov rax,[r9+r10]
    ;mantissa of x = ((x << 12) >> 12) | implied bit
    shl rcx,12
    shr rcx,12                  ; rcx <-- mantissa part of input x
    bts rcx,52                  ; add the implied bit as well
    ;load next 128 bits of 2_by_pi
    add r10,8                   ; increment to next 8 bytes of 2_by_pi
    vmovdqu xmm0,XMMWORD PTR[r9+r10]
    ;do three 64bit multiplications with mant of x
    mul rcx
    mov r8,rax                  ; r8 = last 64 bits of mul = res1[2]
    mov r10,rdx                 ; r10 = carry
    vmovq rax,xmm0
    mul rcx
    ;resexp = xexp & 7
    and r11,7                   ; r11 <-- resexp = last 3 bits of xexp
    vpsrldq xmm0,xmm0,8
    add rax,r10                 ; add the previous carry
    adc rdx,0
    mov r9,rax                  ; r9 <-- next 64 bits of mul = res1[1]
    mov r10,rdx                 ; r10 <-- carry
    vmovq rax,xmm0
    mul rcx
    add r10,rax                 ;r10 = most sig 64 bits = res1[0]
    ;find the region
    ;last three bits ltb = most sig bits >> (54 - resexp)
    ; decimal point in last 18 bits == 8 lsb's in first 64 bits
    ; and 8 msb's in next 64 bits
    ;point_five = ltb & 01h;
    ;region = ((ltb >> 1) + point_five) & 3;
    mov rcx,54
    mov rax,r10
    sub rcx,r11
    xor rdx,rdx                 ;rdx = sign of the reduced argument (0 = positive)
    shr rax,cl                  ; CF <-- point_five (last bit shifted out)
    jnc Ltanf_fma3_no_point_five_f
    ;;if there is carry.. then negate the result of multiplication
    ; (not/mov leave CF untouched, so the adc below still sees point_five)
    not r10
    not r9
    not r8
    mov rdx,08000000000000000h
ALIGN 16
Ltanf_fma3_no_point_five_f:
    adc rax,0
    and rax,3
    vmovd xmm4,eax              ; xmm4 <-- region
    ;calculate the number of integer bits and zero them out
    mov rcx,r11
    add rcx,10                  ; rcx = no. of integer bits
    shl r10,cl
    shr r10,cl                  ; r10 contains only mant bits
    sub rcx,64                  ; form the exponent
    mov r11,rcx
    ;find the highest set bit
    bsr rcx,r10
    jnz Ltanf_fma3_form_mantissa_f
    ; top word is zero: shift the whole product up by one 64-bit word
    mov r10,r9
    mov r9,r8
    mov r8,0
    bsr rcx,r10                 ;rcx = hsb
    sub r11,64
ALIGN 16
Ltanf_fma3_form_mantissa_f:
    add r11,rcx                 ; for exp of x
    sub rcx,52                  ; rcx = no. of bits to shift in r10
    cmp rcx,0
    jl Ltanf_fma3_hsb_below_52_f
    je Ltanf_fma3_form_numbers_f
    ;hsb above 52
    mov r8,r10
    shr r10,cl                  ; r10 = mantissa of x with hsb at 52
    shr r9,cl                   ; make space for bits from r10
    sub rcx,64
    neg rcx                     ; rcx = no of bits to shift r10
    shl r8,cl
    or r9,r8                    ; r9 = mantissa bits of xx
    jmp Ltanf_fma3_form_numbers_f
ALIGN 16
Ltanf_fma3_hsb_below_52_f:
    neg rcx
    mov rax,r9
    shl r10,cl
    shl r9,cl
    sub rcx,64
    neg rcx
    shr rax,cl
    or r10,rax
    shr r8,cl
    or r9,r8
ALIGN 16
Ltanf_fma3_form_numbers_f:
    add r11,1023                ; bias the exponent
    btr r10,52                  ; remove the implied bit
    mov rcx,r11
    or r10,rdx                  ; put the sign
    shl rcx,52
    or r10,rcx                  ; x is in r10
    vmovq xmm0,r10              ; xmm0 <-- x
    vmulsd xmm0,xmm0,L_piby2_lead ; scale the fraction by pi/2 to get r

    ; Common tail of both reductions: xmm0 = r, xmm4 = region.
Ltanf_fma3_exit_s:
    vandpd xmm2,xmm4,XMMWORD PTR L_int_one
    vmovd eax,xmm2              ; eax <-- region & 1
    vmulsd xmm1,xmm0,xmm0       ; xmm1 <-- x*x
    vmovsd xmm3,L_tanf+008h     ; xmm3 <-- c1
    vfmadd213sd xmm3,xmm1,L_tanf ; xmm3 <-- c = c1*x^2 + c0
    vmovsd xmm2,L_tanf+020h     ; xmm2 <-- d2
    vfmadd213sd xmm2,xmm1,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
    vfmadd213sd xmm2,xmm1,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
    vdivsd xmm3,xmm3,xmm2       ; xmm3 <-- c/d
    vmulsd xmm1,xmm1,xmm0       ; xmm1 <-- x^3
    vfmadd231sd xmm0,xmm1,xmm3  ; xmm0 <-- x + x^3 * c/d
    cmp eax,01h
    je Ltanf_fma3_recip
    jmp Ltanf_fma3_exit_tanpiby4
    ; odd region: result <-- L_n_one / result
Ltanf_fma3_recip :
    vmovq xmm3,L_n_one
    vdivsd xmm0,xmm3,xmm0
Ltanf_fma3_exit_tanpiby4 :
    ; the reduction worked on |x|; xor the sign bit of the input back in
    vandpd xmm5,xmm5,L_signbit
    vxorpd xmm0,xmm0,xmm5
Ltanf_fma3_return_s:
    vcvtsd2ss xmm0,xmm0,xmm0    ; narrow the double result to float
Ltanf_fma3_return_c:
    StackDeallocate stack_size
    ret

    ; NaN or infinity: hand over to the special-case handler and return
    ; whatever it leaves behind.
Ltanf_fma3_naninf:
    call fname_special
    StackDeallocate stack_size
    ret
fname endp
END