reactos/sdk/lib/crt/math/libm_sse2/remainder_piby2_forAsm.asm

;
;
; MIT License
; -----------
; 
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
; 
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
; 
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
; 
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the remainder by pi/2 function
; This is a service routine for use by trig functions coded in asm
;
; On input,
;   xmm0 = x;
; On output
;   xmm0 = r
;   xmm1 = rr
;   rax  = region (__remainder_piby2_cw_forAsm also leaves it in xmm2)

.const
ALIGN 16
; Every entry from here to L_int_1 is 16 bytes long, so each one that is
; loaded as a 128-bit operand stays 16-byte aligned.

; pi/2 pieces used by fname.  Low qword is multiplied as piby2_lead and the
; high qword as piby2_part3 (see the mulpd with x,x in fname).
L__piby2_part3_piby2_lead DQ 03ff921fb54442d18h, 03c91a62633145c06h
; piby2_part1 / piby2_part2: multiplied by the hx,tx pair in fname when the
; correction term cc is formed.
L__piby2_part1            DQ 03ff921fb50000000h, 03ff921fb50000000h 
L__piby2_part2            DQ 03e5110b460000000h, 03e5110b460000000h
;; constants for CW reduction
; (used by fsname only: successive pieces of pi/2 and the matching tails)
L_piby2_1      DQ 03FF921FB54400000h, 03FF921FB54400000h
L_piby2_2      DQ 03DD0B4611A600000h, 03DD0B4611A600000h
L_piby2_3      DQ 03BA3198A2E000000h, 03BA3198A2E000000h
L_piby2_1tail  DQ 03DD0B4611A626331h, 03DD0B4611A626331h
L_piby2_2tail  DQ 03BA3198A2E037073h, 03BA3198A2E037073h
L_piby2_3tail  DQ 0397B839A252049C1h, 0397B839A252049C1h
L_twobypi      DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h   ; 2/pi
L_point_five   DQ 03FE0000000000000h, 03FE0000000000000h   ; 0.5
L_int_three    DQ 00000000000000003h, 00000000000000003h   ; integer mask: npi2 & 3
L_inf_mask_64  DQ 07FF0000000000000h, 07FF0000000000000h   ; exponent field of a double
L_signbit      DQ 08000000000000000h, 08000000000000000h   ; sign bit of a double
L_int_1        DQ 00000000000000001h, 00000000000000001h   ; not referenced by the code in this file
; Integer thresholds for the exponent difference tested in fsname.
L_int_15       DQ 0000000000000000Fh
L_int_48       DQ 00000000000000030h
; Bit patterns of 3pi/4, 5pi/4, 7pi/4 and 9pi/4 as doubles; fsname compares
; them as unsigned integers against the bit pattern of |x| held in r9.
L_3pio4        DQ 04002D97C7F3321D2h
L_5pio4        DQ 0400F6A7A2955385Eh
L_7pio4        DQ 04015FDBBE9BBA775h
L_9pio4        DQ 0401c463abeccb2bbh
ALIGN 16
; Bits of 2/pi stored as a byte table (158 bytes).  fname reads a 24-byte
; window starting at byte offset last = 134 - (xexp >> 3): 8 bytes at
; [last] and 16 bytes at [last + 8], so a larger exponent selects bytes
; nearer the start of the table.
L__2_by_pi_bits DB 224, 241,  27, 193,  12,  88,  33, 116
                DB  53, 126, 196, 126, 237, 175, 169,  75
                DB  74,  41, 222, 231,  28, 244, 236, 197
                DB 151, 175,  31, 235, 158, 212, 181, 168
                DB 127, 121, 154, 253,  24,  61, 221,  38
                DB  44, 159,  60, 251, 217, 180, 125, 180
                DB  41, 104,  45,  70, 188, 188,  63,  96
                DB  22, 120, 255,  95, 226, 127, 236, 160
                DB 228, 247,  46, 126,  17, 114, 210, 231
                DB  76,  13, 230,  88,  71, 230,   4, 249
                DB 125, 209, 154, 192, 113, 166,  19,  18
                DB 237, 186, 212, 215,   8, 162, 251, 156
                DB 166, 196, 114, 172, 119, 248, 115,  72
                DB  70,  39, 168, 187,  36,  25, 128,  75
                DB  55,   9, 233, 184, 145, 220, 134,  21
                DB 239, 122, 175, 142,  69, 249,   7,  65
                DB  14, 241, 100,  86, 138, 109,   3, 119
                DB 211, 212,  71,  95, 157, 240, 167,  84
                DB  16,  57, 185,  13, 230, 139,   2,   0
                DB   0,   0,   0,   0,   0,   0


; local storage offsets
; region      - offset from rsp of the qword where fname parks the region
;               value until it is reloaded into rax just before returning
; stack_size  - bytes of stack allocated by fname (StackAllocate)
; sstack_size - bytes of stack allocated by fsname
region          EQU 000h
stack_size      EQU 018h
sstack_size     EQU 000h   ; no stack for fsname

; Presumably supplies the StackAllocate / StackDeallocate macros used below
; (they are not defined in this file) - confirm against fm.inc.
include fm.inc

; Exported symbol names of the two procedures defined in this file.
fname TEXTEQU <__remainder_piby2_forAsm>
fsname TEXTEQU <__remainder_piby2_cw_forAsm>


.code

;-----------------------------------------------------------------------
; __remainder_piby2_forAsm
;   Computes the remainder of x with respect to pi/2.  The 53-bit mantissa
;   of x is multiplied by a 192-bit window of the bits of 2/pi taken from
;   L__2_by_pi_bits; the integer bits of the product give the region and
;   the fractional bits are converted back to a double pair and multiplied
;   by pi/2.
;
; In:    xmm0 (low) = |x|   (xmm0l has |x|)
;        NOTE(review): the table offset 134 - (xexp >> 3) is only inside
;        the table for a limited range of non-negative xexp; presumably
;        callers use this routine for large |x| only - confirm.
; Out:   xmm0 (low) = r
;        xmm1 (low) = rr
;        rax        = region (0..3)
; Stack: stack_size bytes; [region+rsp] holds region until the final reload
; Uses:  rax, rcx, rdx, r8, r9, r10, r11, xmm0-xmm5, flags
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; This function is not using rdx, r8, and r9 as pointers;
    ; all returns are in registers

    ; get the unbiased exponent and the mantissa part of x
    lea       r9,L__2_by_pi_bits
 
 ;xexp = (x >> 52) - 1023
    movd      r11,xmm0
    mov       rcx,r11 
    shr       r11,52
    sub       r11,1023                 ; r11 <-- xexp = exponent of input x 

    ;calculate the last byte from which to start multiplication
    ;last = 134 - (xexp >> 3) 
    mov       r10,r11
    shr       r10,3
    sub       r10,134                  ; r10 <-- -last
    neg       r10                      ; r10 <-- last

    ; load 64 bits of 2_by_pi
    mov       rax,[r9 + r10]
 
    ; mantissa of x = ((x << 12) >> 12) | implied bit
    shl       rcx,12
    shr       rcx,12                   ; rcx <-- mantissa part of input x 
    bts       rcx,52                   ; add the implied bit as well 

    ; load next 128 bits of 2_by_pi 
    ; (unaligned load: the byte offset into the table is arbitrary)
    add       r10,8 ;increment to next 8 bytes of 2_by_pi
    movdqu    xmm0,[r9 + r10] 

    ; do three 64-bit multiplications with mant of x 
    ; Each mul leaves its low half in rax and its high half in rdx; the high
    ; half is carried into the next, more significant, product.
    mul rcx
    mov       r8,rax                   ; r8 <-- last 64 bits of mul = res1[2] 
    mov       r10,rdx                  ; r10 <-- carry
    movd      rax,xmm0
    mul       rcx
    ; resexp = xexp & 7 
    and       r11,7                    ; r11 <-- resexp = xexp & 7 = last 3 bits
    psrldq    xmm0,8 
    add       rax,r10                  ; add the previous carry
    adc       rdx,0
    mov       r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
    mov       r10,rdx                  ; r10 <-- carry
    movd      rax,xmm0
    mul       rcx
    add       r10,rax                  ; r10 <-- most sig. 64 bits = res1[0]
    ; find the region 
    ; last three bits ltb = most sig bits >> (54 - resexp));
    ;   decimal point in last 18 bits ==> 8 lsb's in first 64 bits
    ;   and 8 msb's in next 64 bits
    ; point_five = ltb & 01h;
    ; region = ((ltb >> 1) + point_five) & 3;  
    mov       rcx,54
    mov       rax,r10
    sub       rcx,r11
    xor       rdx,rdx                  ; rdx <-- sign bit of the result (0 = positive)
    shr       rax,cl                   ; CF <-- last bit shifted out = point_five
    jnc       L__no_point_five
    ; if there is carry then negate the result of multiplication
    ; (one's complement of all three words) and mark the result negative.
    ; not and mov leave the flags alone, so CF is still set for the adc below.
    not       r10
    not       r9
    not       r8
    mov       rdx,08000000000000000h

ALIGN  16 
L__no_point_five:
    adc       rax,0                    ; region rounds up when point_five was set
    and       rax,3                    ; rax now has region
    mov       QWORD PTR [region+rsp],rax ; rax is reused below; reloaded before ret

    ; calculate the number of integer bits and zero them out
    mov       rcx,r11 
    add       rcx,10                   ; rcx = no. of integer bits
    shl       r10,cl
    shr       r10,cl                   ; r10 contains only mant bits
    sub       rcx,64                   ; form the exponent
    mov       r11,rcx
 
 ;find the highest set bit
    bsr       rcx,r10                  ; ZF set when r10 == 0
    jnz       L__form_mantissa
    ; top word has no set bit: move the lower words up by 64 bits and
    ; lower the exponent by 64 to match
    mov       r10,r9
    mov       r9,r8
    mov       r8,0
    bsr       rcx,r10                  ; rcx = hsb
    sub       r11,64
 
 
ALIGN  16 
L__form_mantissa:
    add       r11,rcx                  ; r11 <-- unbiased exponent of the leading part
    sub       rcx,52                   ; rcx = no. of bits to shift in r10 
    cmp       rcx,0
    jl        L__hsb_below_52
    je        L__form_numbers
    ; hsb above 52
    mov       r8,r10                   ; previous contents of r8 not required
    shr       r10,cl                   ; r10 = mantissa of x with hsb at 52
    shr       r9,cl                    ; make space for bits from r10
    sub       rcx,64
    neg       rcx
    ; rcx <-- no of bits to shift r10 to move those bits to r9
    shl       r8,cl
    or        r9,r8                    ; r9 = mantissa bits of xx 
    jmp       L__form_numbers
 
ALIGN  16 
L__hsb_below_52:
    ; hsb below 52: shift r10 and r9 left by (52 - hsb) and refill the
    ; vacated low bits from the top of the next lower word
    neg       rcx
    mov       rax,r9
    shl       r10,cl
    shl       r9,cl
    sub       rcx,64
    neg       rcx                      ; rcx <-- 64 - left shift count
    shr       rax,cl
    or        r10,rax
    shr       r8,cl
    or        r9,r8 
 
ALIGN  16
L__form_numbers:
    ; assemble the leading double: sign | biased exponent | mantissa
    add       r11,1023
    btr       r10,52                   ; remove the implicit bit
    mov       rcx,r11
    or        r10,rdx                  ; put the sign 
    shl       rcx,52
    or        r10,rcx                  ; r10 <-- x
 
    ; split x into hx (low 27 mantissa bits cleared) and tx = x - hx
    movd      xmm0,r10                 ; xmm0 <-- x
    movdqa    xmm1,xmm0                ; xmm1 <-- x
    psrlq     xmm1,27
    psllq     xmm1,27                  ; xmm1 <-- hx
    movdqa    xmm2,xmm0                ; xmm2 <-- x 
    subsd     xmm2,xmm1                ; xmm2 <-- tx
    movlhps   xmm0,xmm0                ; xmm0 <-- x,x
    movlhps   xmm2,xmm1                ; xmm2 <-- hx,tx

    movdqa    xmm1,XMMWORD PTR L__piby2_part3_piby2_lead 
    movdqa    xmm3,XMMWORD PTR L__piby2_part1
    movdqa    xmm4,XMMWORD PTR L__piby2_part2

    ; form xx
    ; xx is built from the remaining bits in r9, normalised so that its
    ; highest set bit becomes the implicit bit.  rcx is zeroed first so it
    ; has a known value going into bsr when r9 is zero.
    xor       rcx,rcx
    bsr       rcx,r9
    sub       rcx,64                   ; to shift the implicit bit as well
    neg       rcx
    shl       r9,cl
    shr       r9,12
    add       rcx,52
    sub       r11,rcx
    shl       r11,52
    or        r9,rdx
    or        r9,r11
    movd      xmm5,r9                  ; xmm5 <-- xx 
 
    mulpd     xmm0,xmm1 ; xmm0 <-- piby2_part3 * x,piby2_lead * x = c
    mulpd     xmm5,xmm1 ; xmm5 <-- piby2_lead * xx
    mulpd     xmm3,xmm2 ; xmm3 <-- piby2_part1 * hx,piby2_part1 * tx
    mulpd     xmm4,xmm2 ; xmm4 <-- piby2_part2 * hx,piby2_part2 * tx 
 
    ; cc = (piby2_part1 * hx - c) + (piby2_part1 * tx) +
    ;   (piby2_part2 * hx) + (piby2_part2 * tx) + 
    ;   (piby2_lead * xx + piby2_part3 * x)
    movhlps   xmm1,xmm3 ; xmm1 = piby2_part1 * hx
    movhlps   xmm2,xmm4 ; xmm2 = piby2_part2 * hx 
    subsd     xmm1,xmm0 ; xmm1 = (piby2_part1 * hx - c)
    addsd     xmm1,xmm3 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx)
    movhlps   xmm3,xmm0 ; xmm3 = piby2_part3 * x
    addsd     xmm1,xmm2
    ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + (piby2_part2 * hx)
    addsd     xmm3,xmm5 ; xmm3 = (piby2_lead * xx + piby2_part3 * x)
    addsd     xmm1,xmm4
    ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + 
    ;    (piby2_part2 * hx) + (piby2_part2 * tx)
    addsd     xmm1,xmm3                ; xmm1 = cc
 
    ; xmm0 <-- c, xmm1 <-- cc
    ; r = c + cc
    ; rr = (c - r) + cc

    movdqa    xmm2,xmm0                ; xmm2 <-- copy of c
    addsd     xmm0,xmm1                ; xmm0 <-- r = c + cc
    subsd     xmm2,xmm0                ; xmm2 <-- c - r
    addsd     xmm1,xmm2                ; xmm1 <-- rr = cc + (c - r)
    mov       rax, QWORD PTR[region+rsp] ; rax <-- region

    StackDeallocate stack_size
    ret 
 
fname        endp

; NOTE: If this is not going to be used, should probably remove it. - WAT
;-----------------------------------------------------------------------
; __remainder_piby2_cw_forAsm
;   Reduces |x| by a multiple of pi/2 using the "CW reduction" constants:
;   npi2 * pi/2 is subtracted piece by piece (L_piby2_1, _2, _3 and their
;   tails), with further pieces used only when the result is small
;   compared to |x|.
;
; In:    xmm0 (low) = |x|
;        r9         = bit pattern of |x|
;        ASSUMPTION: if we call this function, |x| > pi/4
; Out:   xmm0 (low) = r
;        xmm1 (low) = rr   (reduced argument is r + rr)
;        rax        = region = npi2 & 3   (also left in the low part of xmm2)
; Stack: sstack_size bytes (none)
; Uses:  rax, r8, xmm0-xmm5, flags
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fsname
fsname PROC FRAME
    StackAllocate sstack_size
    .ENDPROLOG

    ; Quick classification for |x| <= 9pi/4.  The bit patterns of positive
    ; doubles order the same way as unsigned integers, so r9 is compared
    ; directly against the threshold bit patterns.
    xor       r8d,r8d
    cmp       r9, QWORD PTR L_5pio4
    ja        Lax_gt_5pio4
    cmp       r9, QWORD PTR L_3pio4
    seta      r8b
    inc       r8d                         ; npi2 = 1 (<= 3pi/4) or 2
    jmp       Lstage_npi2
Lax_gt_5pio4:
    cmp       r9, QWORD PTR L_9pio4
    ja        Lnpi2_full_computation
    cmp       r9, QWORD PTR L_7pio4
    seta      r8b
    add       r8d,3                       ; npi2 = 3 (<= 7pi/4) or 4
Lstage_npi2:
    movd      xmm2, r8d                   ; xmm2 <-- npi2 (upper bits zeroed)
    cvtdq2pd  xmm4, xmm2                  ; xmm4l <-- (double)npi2
    ; region = npi2 & 3.  Without this mask npi2 = 4 was returned as
    ; region 4, unlike the full-computation path below, which masks with 3.
    andpd     xmm2,L_int_three
    jmp       Lnpi2_known

Lnpi2_full_computation:
    movapd    xmm5,xmm0
    mulsd     xmm5, L_twobypi
    addsd     xmm5, L_point_five          ; xmm5 <-- |x|*2/pi + .5

    cvttpd2dq xmm5,xmm5                   ; xmm5 <-- npi2 = int part
    movapd    xmm2,xmm5
    andpd     xmm2,L_int_three            ; xmm2 <-- region = npi2 & 3
    cvtdq2pd  xmm4,xmm5                   ; xmm4l <-- (double)npi2

Lnpi2_known:
    movapd    xmm5,xmm4
    mulsd     xmm5,QWORD PTR L_piby2_1    ; xmm5 <-- npi2*piby2_1
    xorpd     xmm5,L_signbit              ; xmm5 <-- -npi2*piby2_1
    addpd     xmm5,xmm0                   ; xmm5 <-- rhead = x - npi2*piby2_1
    movapd    xmm3,xmm4
    mulsd     xmm3,QWORD PTR L_piby2_1tail ; xmm3 <-- rtail = npi2*piby2_1tail

    ; If x is nearly a multiple of pi/2, rhead will be small compared to |x|
    ; we check this by checking exponent difference.

    ; Note that both the unbiased exponents are positive, and that of rhead
    ; must be <= that of |x|
    movapd    xmm1,xmm5                   ; xmm1l <-- rhead
    subpd     xmm1,xmm3                   ; xmm1l <-- r = rhead - rtail
    andpd     xmm1,L_inf_mask_64          ; keep only the exponent field of r
    psubq     xmm0,xmm1                   ; integer subtract: removes r's exponent
    psrlq     xmm0,52                     ; xmm0l <-- expdiff = exp(|x|) - exp(r)

    ; Compare expdiff as an integer.  The previous comisd against L_int_15
    ; treated both small integers as denormal doubles, which all read as
    ; zero when MXCSR.DAZ is set, so the extra reduction steps were skipped.
    movd      rax,xmm0                    ; rax <-- expdiff (really a movq)
    cmp       rax, QWORD PTR L_int_15
    jbe       Lcw_get_r_rr

    ; here expdiff > 15, so x is nearly a multiple of pi/2 and things are hard
    ; we use another piece of pi/2 in the reduction

    movapd    xmm1,xmm5                   ; xmm1 <-- t = rhead
    movapd    xmm3,xmm4
    mulsd     xmm3,QWORD PTR L_piby2_2 ; xmm3 <--- rtail = npi2*piby2_2
    subsd     xmm5,xmm3 ; xmm5 <-- rhead = t - rtail

    ; now rtail = npi2*piby2_2tail - ((t-rhead) - rtail)
    subsd     xmm1,xmm5
    subsd     xmm1,xmm3
    movapd    xmm3,xmm4
    mulsd     xmm3,QWORD PTR L_piby2_2tail
    subsd     xmm3,xmm1 ; xmm3 <-- rtail

    ; rax still holds expdiff: only SSE registers were written since the load
    cmp       rax, QWORD PTR L_int_48
    jbe       Lcw_get_r_rr

    ; here expdiff > 48, so x is REALLY close to a multiple of pi/2
    ; and we use yet another piece of pi/2 in the reduction

    movapd    xmm0,xmm5 ; xmm0 <-- t = rhead
    movapd    xmm3,xmm4
    mulsd     xmm3,QWORD PTR L_piby2_3 ; xmm3 <-- rtail = npi2 * piby2_3
    movapd    xmm5,xmm0
    subsd     xmm5,xmm3 ; xmm5 <-- rhead = t - rtail

    ; now rtail = npi2 * piby2_3tail - ((t - rhead) - rtail)
    movapd    xmm1,xmm0
    subsd     xmm1,xmm5
    subsd     xmm1,xmm3
    movapd    xmm3,xmm4
    mulsd     xmm3,QWORD PTR L_piby2_3tail
    subsd     xmm3,xmm1 ; xmm3 <-- rtail

Lcw_get_r_rr:
    ; We have a satisfactory rhead in xmm5 and rtail in xmm3
    ; We now produce r in xmm0 and rr in xmm1, where the actual reduced argument
    ; is the sum of r and rr, and rr is insignificant
    ; with respect to r under addition (i.e., r + rr == r).
    movapd    xmm0,xmm5 ; xmm0 <-- rhead
    subsd     xmm0,xmm3 ; xmm0 <-- r = rhead - rtail
    movapd    xmm1,xmm5 ; xmm1 <-- rhead
    subsd     xmm1,xmm0 ; xmm1 <-- (rhead - r)
    subsd     xmm1,xmm3 ; xmm1 <-- rr = (rhead - r) - rtail
    movd      rax,xmm2  ; rax <-- region
    StackDeallocate sstack_size
    ret
fsname        endp

END
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 10:02:01 +00:00			`;`
			`;`
			`; MIT License`
			`; -----------`
			`;`
			`; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.`
			`;`
			`; Permission is hereby granted, free of charge, to any person obtaining a copy`
			`; of this Software and associated documentaon files (the "Software"), to deal`
			`; in the Software without restriction, including without limitation the rights`
			`; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`; copies of the Software, and to permit persons to whom the Software is`
			`; furnished to do so, subject to the following conditions:`
			`;`
			`; The above copyright notice and this permission notice shall be included in`
			`; all copies or substantial portions of the Software.`
			`;`
			`; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN`
			`; THE SOFTWARE.`
			`;`
			`; An implementation of the remainder by pi/2 function`
			`; This is a service routine for use by trig functions coded in asm`
			`;`
			`; On input,`
			`; xmm0 = x;`
			`; On ouput`
			`; xmm0 = r`
			`; xmm1 = rr`
			`; xmm2 = region`

			`.const`
			`ALIGN 16`
			`L__piby2_part3_piby2_lead DQ 03ff921fb54442d18h, 03c91a62633145c06h`
			`L__piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h`
			`L__piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h`
			`;; constants for CW reduction`
			`L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h`
			`L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h`
			`L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h`
			`L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h`
			`L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h`
			`L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h`
			`L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h`
			`L_point_five DQ 03FE0000000000000h, 03FE0000000000000h`
			`L_int_three DQ 00000000000000003h, 00000000000000003h`
			`L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h`
			`L_signbit DQ 08000000000000000h, 08000000000000000h`
			`L_int_1 DQ 00000000000000001h, 00000000000000001h`
			`L_int_15 DQ 0000000000000000Fh`
			`L_int_48 DQ 00000000000000030h`
			`L_3pio4 DQ 04002D97C7F3321D2h`
			`L_5pio4 DQ 0400F6A7A2955385Eh`
			`L_7pio4 DQ 04015FDBBE9BBA775h`
			`L_9pio4 DQ 0401c463abeccb2bbh`
			`ALIGN 16`
			`L__2_by_pi_bits DB 224, 241, 27, 193, 12, 88, 33, 116`
			`DB 53, 126, 196, 126, 237, 175, 169, 75`
			`DB 74, 41, 222, 231, 28, 244, 236, 197`
			`DB 151, 175, 31, 235, 158, 212, 181, 168`
			`DB 127, 121, 154, 253, 24, 61, 221, 38`
			`DB 44, 159, 60, 251, 217, 180, 125, 180`
			`DB 41, 104, 45, 70, 188, 188, 63, 96`
			`DB 22, 120, 255, 95, 226, 127, 236, 160`
			`DB 228, 247, 46, 126, 17, 114, 210, 231`
			`DB 76, 13, 230, 88, 71, 230, 4, 249`
			`DB 125, 209, 154, 192, 113, 166, 19, 18`
			`DB 237, 186, 212, 215, 8, 162, 251, 156`
			`DB 166, 196, 114, 172, 119, 248, 115, 72`
			`DB 70, 39, 168, 187, 36, 25, 128, 75`
			`DB 55, 9, 233, 184, 145, 220, 134, 21`
			`DB 239, 122, 175, 142, 69, 249, 7, 65`
			`DB 14, 241, 100, 86, 138, 109, 3, 119`
			`DB 211, 212, 71, 95, 157, 240, 167, 84`
			`DB 16, 57, 185, 13, 230, 139, 2, 0`
			`DB 0, 0, 0, 0, 0, 0`


			`; local storage offsets`
			`region EQU 000h`
			`stack_size EQU 018h`
			`sstack_size EQU 000h ; no stack for fsname`

			`include fm.inc`

			`fname TEXTEQU <__remainder_piby2_forAsm>`
			`fsname TEXTEQU <__remainder_piby2_cw_forAsm>`


			`.code`

			`; xmm0l has \|x\|`
			`PUBLIC fname`
			`fname PROC FRAME`
			`StackAllocate stack_size`
			`.ENDPROLOG`

			`; This function is not using rdx, r8, and r9 as pointers;`
			`; all returns are in registers`

			`; get the unbiased exponent and the mantissa part of x`
			`lea r9,L__2_by_pi_bits`

			`;xexp = (x >> 52) - 1023`
			`movd r11,xmm0`
			`mov rcx,r11`
			`shr r11,52`
			`sub r11,1023 ; r11 <-- xexp = exponent of input x`

			`;calculate the last byte from which to start multiplication`
			`;last = 134 - (xexp >> 3)`
			`mov r10,r11`
			`shr r10,3`
			`sub r10,134 ; r10 <-- -last`
			`neg r10 ; r10 <-- last`

			`; load 64 bits of 2_by_pi`
			`mov rax,[r9 + r10]`

			`; mantissa of x = ((x << 12) >> 12) \| implied bit`
			`shl rcx,12`
			`shr rcx,12 ; rcx <-- mantissa part of input x`
			`bts rcx,52 ; add the implied bit as well`

			`; load next 128 bits of 2_by_pi`
			`add r10,8 ;increment to next 8 bytes of 2_by_pi`
			`movdqu xmm0,[r9 + r10]`

			`; do three 64-bit multiplications with mant of x`
			`mul rcx`
			`mov r8,rax ; r8 <-- last 64 bits of mul = res1[2]`
			`mov r10,rdx ; r10 <-- carry`
			`movd rax,xmm0`
			`mul rcx`
			`; resexp = xexp & 7`
			`and r11,7 ; r11 <-- resexp = xexp & 7 = last 3 bits`
			`psrldq xmm0,8`
			`add rax,r10 ; add the previous carry`
			`adc rdx,0`
			`mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]`
			`mov r10,rdx ; r10 <-- carry`
			`movd rax,xmm0`
			`mul rcx`
			`add r10,rax ; r10 <-- most sig. 64 bits = res1[0]`
			`; find the region`
			`; last three bits ltb = most sig bits >> (54 - resexp));`
			`; decimal point in last 18 bits ==> 8 lsb's in first 64 bits`
			`; and 8 msb's in next 64 bits`
			`; point_five = ltb & 01h;`
			`; region = ((ltb >> 1) + point_five) & 3;`
			`mov rcx,54`
			`mov rax,r10`
			`sub rcx,r11`
			`xor rdx,rdx ; rdx <-- sign of x`
			`shr rax,cl`
			`jnc L__no_point_five`
			`; if there is carry then negate the result of multiplication`
			`not r10`
			`not r9`
			`not r8`
			`mov rdx,08000000000000000h`

			`ALIGN 16`
			`L__no_point_five:`
			`adc rax,0`
			`and rax,3 ; rax now has region`
			`mov QWORD PTR [region+rsp],rax`

			`; calculate the number of integer bits and zero them out`
			`mov rcx,r11`
			`add rcx,10 ; rcx = no. of integer bits`
			`shl r10,cl`
			`shr r10,cl ; r10 contains only mant bits`
			`sub rcx,64 ; form the exponent`
			`mov r11,rcx`

			`;find the highest set bit`
			`bsr rcx,r10`
			`jnz L__form_mantissa`
			`mov r10,r9`
			`mov r9,r8`
			`mov r8,0`
			`bsr rcx,r10 ; rcx = hsb`
			`sub r11,64`


			`ALIGN 16`
			`L__form_mantissa:`
			`add r11,rcx ; for exp of x`
			`sub rcx,52 ; rcx = no. of bits to shift in r10`
			`cmp rcx,0`
			`jl L__hsb_below_52`
			`je L__form_numbers`
			`; hsb above 52`
			`mov r8,r10 ; previous contents of r8 not required`
			`shr r10,cl ; r10 = mantissa of x with hsb at 52`
			`shr r9,cl ; make space for bits from r10`
			`sub rcx,64`
			`neg rcx`
			`; rcx <-- no of bits to shift r10 to move those bits to r9`
			`shl r8,cl`
			`or r9,r8 ; r9 = mantissa bits of xx`
			`jmp L__form_numbers`

			`ALIGN 16`
			`L__hsb_below_52:`
			`neg rcx`
			`mov rax,r9`
			`shl r10,cl`
			`shl r9,cl`
			`sub rcx,64`
			`neg rcx`
			`shr rax,cl`
			`or r10,rax`
			`shr r8,cl`
			`or r9,r8`

			`ALIGN 16`
			`L__form_numbers:`
			`add r11,1023`
			`btr r10,52 ; remove the implicit bit`
			`mov rcx,r11`
			`or r10,rdx ; put the sign`
			`shl rcx,52`
			`or r10,rcx ; r10 <-- x`

			`movd xmm0,r10 ; xmm0 <-- x`
			`movdqa xmm1,xmm0 ; xmm1 <-- x`
			`psrlq xmm1,27`
			`psllq xmm1,27 ; xmm1 <-- hx`
			`movdqa xmm2,xmm0 ; xmm2 <-- x`
			`subsd xmm2,xmm1 ; xmm2 <-- tx`
			`movlhps xmm0,xmm0 ; xmm0 <-- x,x`
			`movlhps xmm2,xmm1 ; xmm2 <-- hx,tx`

			`movdqa xmm1,XMMWORD PTR L__piby2_part3_piby2_lead`
			`movdqa xmm3,XMMWORD PTR L__piby2_part1`
			`movdqa xmm4,XMMWORD PTR L__piby2_part2`

			`; form xx`
			`xor rcx,rcx`
			`bsr rcx,r9`
			`sub rcx,64 ; to shift the implicit bit as well`
			`neg rcx`
			`shl r9,cl`
			`shr r9,12`
			`add rcx,52`
			`sub r11,rcx`
			`shl r11,52`
			`or r9,rdx`
			`or r9,r11`
			`movd xmm5,r9 ; xmm5 <-- xx`

			`mulpd xmm0,xmm1 ; xmm0 <-- piby2_part3 * x,piby2_lead * x = c`
			`mulpd xmm5,xmm1 ; xmm5 <-- piby2_lead * xx`
			`mulpd xmm3,xmm2 ; xmm3 <-- piby2_part1 * hx,piby2_part1 * tx`
			`mulpd xmm4,xmm2 ; xmm4 <-- piby2_part2 * hx,piby2_part2 * tx`

			`; cc = (piby2_part1 * hx - c) + (piby2_part1 * tx) +`
			`; (piby2_part2 * hx) + (piby2_part2 * tx) +`
			`; (piby2_lead * xx + piby2_part3 * x)`
			`movhlps xmm1,xmm3 ; xmm1 = piby2_part1 * hx`
			`movhlps xmm2,xmm4 ; xmm2 = piby2_part2 * hx`
			`subsd xmm1,xmm0 ; xmm1 = (piby2_part1 * hx - c)`
			`addsd xmm1,xmm3 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx)`
			`movhlps xmm3,xmm0 ; xmm3 = piby2_part3 * x`
			`addsd xmm1,xmm2`
			`; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + (piby2_part2 * hx)`
			`addsd xmm3,xmm5 ; xmm3 = (piby2_lead * xx + piby2_part3 * x)`
			`addsd xmm1,xmm4`
			`; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) +`
			`; (piby2_part2 * hx) + (piby2_part2 * tx)`
			`addsd xmm1,xmm3 ; xmm1 = cc`

			`; xmm0 <-- c, xmm1 <-- cc`
			`; r = c + cc`
			`; rr = (c - r) + cc`

			`movdqa xmm2,xmm0 ; xmm2 <-- copy of c`
			`addsd xmm0,xmm1 ; xmm0 <-- r = c + cc`
			`subsd xmm2,xmm0 ; xmm2 <-- c - r`
			`addsd xmm1,xmm2 ; xmm1 <-- rr = cc + (c - r)`
			`mov rax, QWORD PTR[region+rsp] ; rax <-- region`

			`StackDeallocate stack_size`
			`ret`

			`fname endp`

			`; NOTE: If this is not going to be used, should probably remove it. - WAT`
			`ALIGN 16`
			`PUBLIC fsname`
			`fsname PROC FRAME`
			`StackAllocate sstack_size`
			`.ENDPROLOG`

			`; xmm0l has \|x\|`
			`; r9 also has \|x\|`
			`; ASSUMPTION: if we call this function, \|x\| > pi/4`

			`xor r8d,r8d`
			`cmp r9, QWORD PTR L_5pio4`
			`ja Lax_gt_5pio4`
			`cmp r9, QWORD PTR L_3pio4`
			`seta r8b`
			`inc r8d`
			`jmp Lstage_npi2`
			`Lax_gt_5pio4:`
			`cmp r9, QWORD PTR L_9pio4`
			`ja Lnpi2_full_computation`
			`cmp r9, QWORD PTR L_7pio4`
			`seta r8b`
			`add r8d,3`
			`Lstage_npi2:`
			`movd xmm2, r8d`
			`cvtdq2pd xmm4, xmm2`
			`jmp Lnpi2_known`

			`Lnpi2_full_computation:`
			`; movapd xmm1, L_twobypi`
			`; movapd xmm3, L_point_five`
			`movapd xmm5,xmm0`
			`; mulsd xmm5,xmm1`
			`; addsd xmm5,xmm3 ; xmm5 <-- \|x\|*2/pi + .5`
			`mulsd xmm5, L_twobypi`
			`addsd xmm5, L_point_five`

			`cvttpd2dq xmm5,xmm5 ; xmm5 < npi2 = int part`
			`movapd xmm2,xmm5`
			`andpd xmm2,L_int_three`
			`cvtdq2pd xmm4,xmm5`

			`Lnpi2_known:`
			`movapd xmm5,xmm4`
			`mulsd xmm5,QWORD PTR L_piby2_1 ; xmm5 <-- npi2*piby2_1`
			`xorpd xmm5,L_signbit ; xmm5 <-- -npi2*piby2_1`
			`addpd xmm5,xmm0 ; xmm5 <-- rhead = x - npi2*piby2_1`
			`movapd xmm3,xmm4`
			`mulsd xmm3,QWORD PTR L_piby2_1tail ; xmm3 <-- rtail = npi2*piby2_1tail`

			`; If x is nearly a multiple of pi/2, rhead will be small compared to \|x\|`
			`; we check this by checking exponent difference.`

			`; Note that both the unbiased exponents are positive, and that of rhead`
			`; must be <= that of \|x\|`
			`movapd xmm1,xmm5 ; xmm1l <-- rhead`
			`subpd xmm1,xmm3 ; xmm1l <-- r = rhead - rtail`
			`andpd xmm1,L_inf_mask_64`
			`psubq xmm0,xmm1 ; xmm0 <-- \|x\| - r`
			`psrlq xmm0,52`
			`comisd xmm0,L_int_15`

			`; movd rax, xmm5 ; really a movq`
			`; shr rax, 52`
			`; shr rdx, 52 ; get exponent of \|x\| (no and needed)`
			`; sub rdx, rax`
			`; cmp rdx, 15`
			`jbe Lcw_get_r_rr`

			`; here expdiff > 15, so x is nearly a multiple of pi/2 and things are hard`
			`; we use another piece of pi/2 in the reduction`

			`movapd xmm1,xmm5`
			`movapd xmm3,xmm4`
			`mulsd xmm3,QWORD PTR L_piby2_2 ; xmm3 <--- rtail = npi2*piby2_2`
			`subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail`

			`; now rtail = npi2*piby2_2tail - ((t-rhead) - rtail)`
			`subsd xmm1,xmm5`
			`subsd xmm1,xmm3`
			`movapd xmm3,xmm4`
			`mulsd xmm3,QWORD PTR L_piby2_2tail`
			`subsd xmm3,xmm1 ; xmm3 <-- rtail`

			`comisd xmm0,L_int_48`
			`; cmp rdx, 48`
			`jbe Lcw_get_r_rr`

			`; here expdiff > 48, so x is REALLY close to a multiple of pi/2`
			`; and we use yet another piece of pi/2 in the reduction`

			`movapd xmm0,xmm5 ; xmm0 <-- t = rhead`
			`movapd xmm3,xmm4`
			`mulsd xmm3,QWORD PTR L_piby2_3 ; xmm3 <-- rtail = npi2 * piby2_3`
			`movapd xmm5,xmm0`
			`subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail`

			`; now rtail = npi2 * piby2_3tail - ((t - rhead) - rtail)`
			`movapd xmm1,xmm0`
			`subsd xmm1,xmm5`
			`subsd xmm1,xmm3`
			`movapd xmm3,xmm4`
			`mulsd xmm3,QWORD PTR L_piby2_3tail`
			`subsd xmm3,xmm1 ; xmm3 <-- rtail`

			`Lcw_get_r_rr:`
			`; We have a satisfactory rhead in xmm5 and rtail in xmm3`
			`; We now produce r in xmm0 and rr in xmm1, where the actual reduced argument`
			`; is the sum of r and rr, and rr is insignificant`
			`; with respect to r under addition (i.e., r + rr == r).`
			`movapd xmm0,xmm5 ; xmm0 <-- rhead`
			`subsd xmm0,xmm3 ; xmm0 <-- r = rhead - rtail`
			`movapd xmm1,xmm5 ; xmm1 <-- rhead`
			`subsd xmm1,xmm0 ; xmm1 <-- (rhead - r)`
			`subsd xmm1,xmm3 ; xmm1 <-- rr = (rhead - r) - rtail`
			`movd rax,xmm2 ; rax <-- region`
			`StackDeallocate sstack_size`
			`ret`
			`fsname endp`

			`END`