mirror of
https://github.com/reactos/reactos.git
synced 2024-08-27 05:38:41 +00:00
284 lines
11 KiB
NASM
284 lines
11 KiB
NASM
;
|
|
;
|
|
; MIT License
|
|
; -----------
|
|
;
|
|
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
|
;
|
|
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
; of this Software and associated documentation files (the "Software"), to deal
|
|
; in the Software without restriction, including without limitation the rights
|
|
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
; copies of the Software, and to permit persons to whom the Software is
|
|
; furnished to do so, subject to the following conditions:
|
|
;
|
|
; The above copyright notice and this permission notice shall be included in
|
|
; all copies or substantial portions of the Software.
|
|
;
|
|
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
; THE SOFTWARE.
|
|
;
|
|
; An implementation of the remainder by pi/2 function using fma3
|
|
; This is a service routine for use by trig functions coded in asm that use fma3
|
|
;
|
|
; On input,
|
|
; xmm0 = x;
|
|
; On output
|
|
; xmm0 = r
|
|
; xmm1 = rr
|
|
; rax = region
|
|
|
|
.const

; Every entry below is a pair of identical qwords (16 bytes), so with the
; single ALIGN 16 in front each label stays 16-byte aligned.  fbname loads
; L_xc2 and L_sigma with vmovapd, which depends on that alignment; keep the
; order and the pair layout intact.
ALIGN 16

; ---- constants referenced by fname ------------------------------------
L_piby2_lead    DQ 03ff921fb54442d18h, 03ff921fb54442d18h   ; pi/2 rounded to double
L_fff800        DQ 0fffffffff8000000h, 0fffffffff8000000h   ; mask clearing the low 27 bits (forms hx)
L_piby2_part1   DQ 03ff921fb50000000h, 03ff921fb50000000h   ; L_piby2_lead with its low 28 bits cleared
L_piby2_part2   DQ 03e5110b460000000h, 03e5110b460000000h   ; second piece of pi/2 (per its name)
L_piby2_part3   DQ 03c91a62633145c06h, 03c91a62633145c06h   ; third piece of pi/2 (per its name)

; ---- constants not referenced by either procedure in this file --------
L_piby2_1       DQ 03FF921FB54400000h, 03FF921FB54400000h
L_piby2_2       DQ 03DD0B4611A600000h, 03DD0B4611A600000h
L_piby2_3       DQ 03BA3198A2E000000h, 03BA3198A2E000000h
L_piby2_1tail   DQ 03DD0B4611A626331h, 03DD0B4611A626331h
L_piby2_2tail   DQ 03BA3198A2E037073h, 03BA3198A2E037073h
L_piby2_3tail   DQ 0397B839A252049C1h, 0397B839A252049C1h
L_sign_mask     DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh   ; every bit except the sign bit
L_twobypi       DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h   ; same bit pattern as L_r (2/pi)
L_point_five    DQ 03FE0000000000000h, 03FE0000000000000h   ; 0.5
L_int_three     DQ 00000000000000003h, 00000000000000003h   ; integer 3
L_inf_mask_64   DQ 07FF0000000000000h, 07FF0000000000000h   ; double exponent field, all ones
L_signbit       DQ 08000000000000000h, 08000000000000000h   ; sign bit only

;; ---- constants for the BDL reduction (fbname) ------------------------
L_r             DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h   ; 2/pi
L_xc1           DQ 03FF921FB54442D18h, 03FF921FB54442D18h   ; pi/2 (same value as L_piby2_lead)
L_xc2           DQ 03C91A62633145C00h, 03C91A62633145C00h   ; pi/2 part 2
L_xc3           DQ 0397B839A252049C0h, 0397B839A252049C0h   ; pi/2 part 3
; sigma is 3*2^(p-n-2) with n = 0 and p = 53, i.e. 3*2^51.
L_sigma         DQ 04338000000000000h, 04338000000000000h   ; 6755399441055744.
|
|
|
|
; Table of 2/pi bits, defined in another module; fname addresses it by byte.
EXTRN           __L_2_by_pi_bits:BYTE

; Stack frame used by fname.
region          EQU     020h            ; rsp-relative offset of the saved region value
stack_size      EQU     038h            ; bytes handed to StackAllocate / StackDeallocate

; Supplies the StackAllocate / StackDeallocate macros used by fname.
include fm.inc

; Exported procedure names.
fname           TEXTEQU <__remainder_piby2_fma3>
fbname          TEXTEQU <__remainder_piby2_fma3_bdl>
|
|
|
|
.code
|
|
|
|
;-----------------------------------------------------------------------
; fname  (__remainder_piby2_fma3)
;
; Service routine for the fma3 trig code: remainder of x by pi/2, done by
; multiplying the mantissa of x with 192 bits taken from the 2/pi bit
; table (__L_2_by_pi_bits) and then scaling the fraction by pi/2.
;
; In:     xmm0 = x (low double)
; Out:    xmm0 = r
;         xmm1 = rr  = (c - r) + cc, the low-order correction to r
;         rax  = region (0..3)
; Writes: rcx, rdx, r8-r11, xmm0-xmm5, flags, QWORD [region+rsp]
;
; NOTE(review): the exponent is taken with a plain `shr 52` (the sign bit
; is not masked off) and `xexp >> 3` is an unsigned shift.  Presumably
; callers only pass large, non-negative x -- confirm at the call sites.
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
        StackAllocate stack_size                ; macro from fm.inc; frame holds the region slot
        .ENDPROLOG

        ; rdx, r8 and r9 are not used as pointers here; every result
        ; comes back in a register.

        lea     r9, __L_2_by_pi_bits            ; r9 = base of the 2/pi bit table

        ; ---- exponent of x:  xexp = (x >> 52) - 1023 ----
        vmovq   r11, xmm0                       ; r11 = raw bits of x
        mov     rcx, r11                        ; rcx = second copy, turned into the mantissa below
        shr     r11, 52
        sub     r11, 1023                       ; r11 = xexp

        ; ---- table byte to start from:  last = 134 - (xexp >> 3) ----
        mov     r10, r11
        shr     r10, 3
        sub     r10, 134                        ; r10 = -last
        neg     r10                             ; r10 = last

        mov     rax, [r9 + r10]                 ; first 64 bits of 2/pi (least significant limb)

        ; ---- mantissa of x = ((x << 12) >> 12) | implied bit ----
        shl     rcx, 12
        shr     rcx, 12                         ; drop sign and exponent fields
        bts     rcx, 52                         ; set the implied leading one

        add     r10, 8                          ; step past the 8 bytes already loaded
        vmovdqu xmm0, XMMWORD PTR [r9 + r10]    ; next 128 bits of 2/pi

        ; ---- three 64x64 multiplies: mantissa * 192 bits of 2/pi ----
        mul     rcx
        mov     r8, rax                         ; r8  = res1[2], lowest 64 bits of the product
        mov     r10, rdx                        ; r10 = carry into the next limb
        vmovq   rax, xmm0                       ; low qword of the 128-bit load
        mul     rcx
        and     r11, 7                          ; r11 = resexp = xexp & 7 (slotted in between the multiplies)
        vpsrldq xmm0, xmm0, 8                   ; bring the high qword down for the third multiply
        add     rax, r10                        ; fold in the previous carry
        adc     rdx, 0
        mov     r9, rax                         ; r9  = res1[1]
        mov     r10, rdx                        ; r10 = carry into the top limb
        vmovq   rax, xmm0
        mul     rcx
        add     r10, rax                        ; r10 = res1[0], most significant 64 bits

        ; ---- region ----
        ; rax = res1[0] >> (54 - resexp).  The last bit shifted out lands
        ; in CF; that is the "point five" bit, and the adc below uses it to
        ; round the region up:  region = (integer part + point_five) & 3.
        mov     rcx, 54
        mov     rax, r10
        sub     rcx, r11                        ; cl = 54 - resexp
        xor     edx, edx                        ; rdx = sign of the result, positive so far
        shr     rax, cl                         ; CF = point-five bit
        jnc     L__no_point_five
        ; Fraction >= 0.5: complement all three limbs (the original comment
        ; calls this negating the product) and record a negative sign.
        ; not/mov leave CF alone, so the adc below still sees CF = 1.
        not     r10
        not     r9
        not     r8
        mov     rdx, 08000000000000000h

        ALIGN 16
L__no_point_five:
        adc     rax, 0                          ; add the point-five bit (CF from the shr above)
        and     rax, 3                          ; rax = region
        mov     QWORD PTR [region+rsp], rax     ; park it: rax is reused as scratch below

        ; ---- strip the integer bits from the top limb ----
        mov     rcx, r11
        add     rcx, 10                         ; rcx = number of integer bits
        shl     r10, cl
        shr     r10, cl                         ; r10 = fraction bits only
        sub     rcx, 64                         ; start forming the exponent
        mov     r11, rcx

        ; ---- locate the highest set bit ----
        bsr     rcx, r10
        jnz     L__form_mantissa                ; ZF clear: r10 had a set bit
        ; Top limb is all zero: shift the limbs up by 64 bits and retry.
        mov     r10, r9
        mov     r9, r8
        xor     r8d, r8d                        ; r8 = 0
        bsr     rcx, r10                        ; rcx = hsb
        sub     r11, 64                         ; account for the 64-bit limb shift

        ALIGN 16
L__form_mantissa:
        add     r11, rcx                        ; r11 = unbiased exponent of the result
        sub     rcx, 52                         ; rcx = distance of hsb from bit 52
        test    rcx, rcx
        jl      L__hsb_below_52
        je      L__form_numbers                 ; hsb already sits at bit 52

        ; hsb above 52: shift right, spilling the low bits of r10 into r9.
        mov     r8, r10                         ; previous r8 is no longer needed
        shr     r10, cl                         ; r10 = mantissa with hsb at bit 52
        shr     r9, cl                          ; make room for the bits leaving r10
        sub     rcx, 64
        neg     rcx                             ; cl = 64 - shift
        shl     r8, cl                          ; bits that dropped out of r10
        or      r9, r8                          ; r9 = mantissa bits of xx
        jmp     L__form_numbers

        ALIGN 16
L__hsb_below_52:
        ; rcx holds a negative shift count: shift left instead, pulling
        ; bits up from r9 into r10 and from r8 into r9.
        neg     rcx                             ; cl = left-shift amount
        mov     rax, r9                         ; keep r9 to extract the bits crossing into r10
        shl     r10, cl
        shl     r9, cl
        sub     rcx, 64
        neg     rcx                             ; cl = 64 - shift
        shr     rax, cl
        or      r10, rax                        ; r10 = mantissa with hsb at bit 52
        shr     r8, cl
        or      r9, r8                          ; r9 = mantissa bits of xx

        ALIGN 16
        ; At this point:
        ;   r11 = unbiased exponent
        ;   r10 = mantissa, implicit bit possibly set
        ;   rdx = sign bit
L__form_numbers:
        ; ---- assemble x (head of the fraction) as a double ----
        add     r11, 1023                       ; r11 = biased exponent
        btr     r10, 52                         ; drop the implicit bit
        mov     rcx, r11                        ; rcx = copy of the biased exponent
        or      r10, rdx                        ; sign
        shl     rcx, 52                         ; exponent into its field
        or      r10, rcx                        ; r10 = bits of x
        vmovq   xmm2, r10                       ; xmm2 = x

        ; ---- assemble xx (tail of the fraction) as a double ----
        ; NOTE(review): bsr leaves rcx undefined when r9 is zero.  The
        ; original source had a commented-out `xor rcx,rcx` here, marked
        ; "Why is this necessary???".
        bsr     rcx, r9                         ; high bit of the xx mantissa
        sub     rcx, 64                         ; count chosen so the implied bit is shifted out too
        neg     rcx                             ; rcx = 64 - hsb
        shl     r9, cl
        shr     r9, 12                          ; mantissa into the low 52 bits
        add     rcx, 52
        sub     r11, rcx                        ; biased exponent of xx, relative to that of x
        shl     r11, 52
        or      r9, rdx                         ; sign
        or      r9, r11                         ; exponent
        vmovq   xmm1, r9                        ; xmm1 = xx

        ; ---- (x + xx) * pi/2 in extra precision ----
        vandpd  xmm4, xmm2, L_fff800            ; xmm4 = hx, x with its low 27 bits cleared
        vsubsd  xmm0, xmm2, xmm4                ; xmm0 = tx = x - hx
        vmulsd  xmm5, xmm2, L_piby2_lead        ; xmm5 = c  = x * piby2_lead
        vmulsd  xmm3, xmm4, L_piby2_part1       ; xmm3 = hx * part1
        vsubsd  xmm3, xmm3, xmm5                ;      - c
        vfmadd231sd xmm3, xmm0, L_piby2_part1   ;      + tx * part1
        vfmadd231sd xmm3, xmm4, L_piby2_part2   ;      + hx * part2
        vfmadd231sd xmm3, xmm0, L_piby2_part2   ;      + tx * part2
        vmulsd  xmm4, xmm1, L_piby2_lead        ; xmm4 = xx * piby2_lead
        vfmadd231sd xmm4, xmm2, L_piby2_part3   ;      + x * part3
        vaddsd  xmm3, xmm3, xmm4                ; xmm3 = cc
        vaddsd  xmm0, xmm5, xmm3                ; xmm0 = r  = c + cc
        vsubsd  xmm1, xmm5, xmm0                ; xmm1 = c - r
        vaddsd  xmm1, xmm1, xmm3                ; xmm1 = rr = (c - r) + cc
        mov     rax, QWORD PTR [region+rsp]     ; rax = region saved earlier

        StackDeallocate stack_size
        ret
fname endp
|
|
|
|
;-----------------------------------------------------------------------
; fbname  (__remainder_piby2_fma3_bdl)
;
; Argument reduction by pi/2 after Boldo, Daumas, and Li, "Formally
; Verified Argument Reduction With a Fused Multiply-Add," IEEE Trans.
; Comp., vol. 58, #8, Aug. 2009.
;   - coefficients are from their table 1, mutatis mutandis
;   - formula 3.1 obtains z from sigma
;   - algorithm 5.1 (and its extended version) does the reduction
;
; In:     xmm0 = arg (low double)
; Out:    xmm0 = arghead (v1)
;         xmm1 = argtail (-xc3*z + v2)
;         rax  = region (0..3)
; Writes: xmm0-xmm5, rax, flags.  No stack is used.
;
; The region is taken with the scalar vcvttsd2si.  The packed
; vcvttpd2dq it replaces also converted the upper lane of xmm1, which
; is whatever the caller left in xmm0[127:64] and could raise a
; spurious floating-point exception.  The low 32 bits, and therefore
; rax after the `and`, are unchanged.
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fbname
fbname PROC FRAME
        .ENDPROLOG

        vmovapd xmm1, xmm0                      ; xmm1 = arg
        vmovapd xmm4, L_xc2                     ; xmm4 = xc2
        vmovapd xmm2, L_sigma                   ; xmm2 = sigma
        vfmadd132sd xmm1, xmm2, L_r             ; z = arg*r + sigma
        vsubsd  xmm1, xmm1, xmm2                ; xmm1 = z -= sigma
        vcvttsd2si eax, xmm1                    ; eax = (int32) z, low lane only; upper half of rax zeroed
        vmovapd xmm2, xmm1
        vfnmadd132sd xmm2, xmm0, L_xc1          ; xmm2 = u  = arg - z*xc1
        vmulsd  xmm3, xmm1, xmm4                ; xmm3 = p1 = z*xc2
        vmovapd xmm0, xmm1                      ; xmm0 = copy of z
        vfmsub213sd xmm0, xmm4, xmm3            ; xmm0 = p2 = z*xc2 - p1
        vsubsd  xmm5, xmm2, xmm3                ; xmm5 = t1 = u - p1
        ; xmm4 is reused as a temporary from here on to avoid a spill;
        ; xc2 is re-read from memory where it is needed again.
        vsubsd  xmm4, xmm2, xmm5                ; xmm4 = temp = u - t1
        vsubsd  xmm4, xmm4, xmm3                ; xmm4 = t2 = temp - p1
        vfnmadd231sd xmm2, xmm1, L_xc2          ; xmm2 = v1 = -xc2*z + u
        vsubsd  xmm5, xmm5, xmm2                ; xmm5 = v2 = t1 - v1
        vaddsd  xmm5, xmm5, xmm4                ; xmm5 = v2 += t2
        vsubsd  xmm5, xmm5, xmm0                ; xmm5 = v2 -= p2
        vmovapd xmm0, xmm2                      ; xmm0 = arghead = v1
        vfnmadd132sd xmm1, xmm5, L_xc3          ; xmm1 = argtail = -xc3*z + v2
        and     rax, 3                          ; rax = region
        ret
fbname endp
|
|
END
|