reactos/sdk/lib/crt/math/libm_sse2/cosf.asm

;
; MIT License
; -----------
; 
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
; 
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
; 
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
; 
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the cosf function.
;
; Prototype:
;
;     float cosf(float x);
;
;   Computes cosf(x).
;   Based on the NAG C implementation.
;   It will provide proper C99 return values,
;   but may not raise floating point status bits properly.
;   Original Author: Harsha Jagasia

.const
ALIGN 16
L_real_one                 DQ 03ff0000000000000h      ; 1.0
                           DQ 0                          ; for alignment
L_one_half                 DQ 03fe0000000000000h      ; 0.5
                           DQ 0
L_2bypi                    DQ 03fe45f306dc9c883h      ; 2./pi
                           DQ 0
L_one_sixth                DQ 03fc5555555555555h      ; 0.166666666666
                           DQ 0
L_piby2                    DQ 03fe921fb54442d18h
                           DQ 0
L_piby2_1                  DQ 03ff921fb54400000h     ; piby2_1
                           DQ 0
L_piby2_1tail              DQ 03dd0b4611a626331h     ; piby2_1tail
                           DQ 0
L_piby2_2                  DQ 03dd0b4611a600000h     ; piby2_2
                           DQ 0
L_piby2_2tail              DQ 03ba3198a2e037073h     ; piby2_2tail
                           DQ 0
L_large_x_sse2             DQ 0411E848000000000h     ; 5e5
                           DQ 0
L_large_x_fma3             DQ 041E921FB60000000h     ; 3.37325952e9
                           DQ 0
L_sign_mask                DQ 07FFFFFFFFFFFFFFFh
                           DQ 07FFFFFFFFFFFFFFFh
L__int_three               DQ 00000000000000003h
                           DQ 00000000000000003h
L__min_norm_double         DQ 00010000000000000h
                           DQ 00010000000000000h
L_two_to_neg_7             DQ 03f80000000000000h
                           DQ 0
L_two_to_neg_13            DQ 03f20000000000000h
                           DQ 0
L_inf_mask_32              DD 07F800000h
                           DQ 0

fname           TEXTEQU <cosf>
fname_special   TEXTEQU <_cosf_special>

;Define name and any external functions being called
EXTERN           __remainder_piby2d2f_forAsm : PROC    ; NEAR
EXTERN           __remainder_piby2_fma3_bdl  : PROC   ; NEAR
EXTERN           __remainder_piby2_fma3      : PROC   ; NEAR
EXTERN           fname_special      : PROC
EXTERN           _set_statfp        : PROC


EXTRN __Lcosfarray:QWORD
EXTRN __Lsinfarray:QWORD
EXTRN __use_fma3_lib:DWORD

; define local variable storage offsets
p_temp           equ        020h          ; temporary for get/put bits operation
p_temp1          equ        030h          ; temporary for get/put bits operation
dummy_space      EQU        040h
stack_size       EQU        068h

include fm.inc

.code

ALIGN 16
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    cmp          DWORD PTR __use_fma3_lib, 0
    jne          Lcosf_fma3

Lcosf_sse2:

    xorpd        xmm2, xmm2               ; zeroed out for later use

;;  if NaN or inf
    movd         edx, xmm0
    mov          eax, 07f800000h
    mov          r10d, eax
    and          r10d, edx
    cmp          r10d, eax
    jz           Lcosf_sse2_naninf

    cvtss2sd     xmm0, xmm0
    movd         rdx, xmm0

;  ax = (ux & ~SIGNBIT_DP64);
    mov          r10, rdx
    btr          r10, 63                  ; r10 <-- |x|
    mov          r8d, 1                   ; for determining region later on

    movapd       xmm1, xmm0               ; xmm1 <-- copy of x


;;  if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
    mov          rax, 03fe921fb54442d18h
    cmp          r10, rax
    jg           Lcosf_sse2_absx_gt_piby4

;          *c = cos_piby4(x, 0.0);
    movapd       xmm2, xmm0
    mulsd        xmm2, xmm2        ;x^2
    xor          eax, eax
    mov          rdx, r10
    movsd        xmm5, QWORD PTR L_one_half
    jmp          Lcosf_sse2_calc_sincosf_piby4        ; done


ALIGN 16
Lcosf_sse2_absx_gt_piby4:
; reduce  the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
;  xneg = (ax != ux);
    movd         xmm0, r10                ; xmm0 <-- |x|
    cmp          r10, QWORD PTR L_large_x_sse2
    jae          Lcosf_sse2_reduce_precise

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; xmm0=abs(x), xmm1=x
;/* How many pi/2 is x a multiple of? */

    movapd       xmm2, xmm0
    movsd        xmm3, QWORD PTR L_2bypi
    movapd       xmm4, xmm0
    movsd        xmm5, QWORD PTR L_one_half
    mulsd        xmm2, xmm3

;   movsd        xmm5, QWORD PTR L_one_half
;   movapd       xmm2, xmm0
;   mulsd        xmm2, QWORD PTR L_2bypi
;   movapd       xmm4, xmm0

    mov     r9, r10
    shr     r9, 52                        ; r9 <-- biased exponent of x

;        npi2  = (int)(x * twobypi + 0.5);
    addsd   xmm2, xmm5                          ; npi2

    movsd        xmm3, QWORD PTR L_piby2_1      ; piby2_1
    cvttpd2dq    xmm0, xmm2                     ; xmm0 <-- npi2
    movsd        xmm1, QWORD PTR L_piby2_1tail  ; piby2_1tail
    cvtdq2pd     xmm2, xmm0                     ; xmm2 <-- (double)npi2

;    Subtract the multiple from x to get an extra-precision remainder
;      rhead  = x - npi2 * piby2_1;

    mulsd        xmm3, xmm2                     ; use piby2_1
    subsd        xmm4, xmm3                     ; rhead

;      rtail  = npi2 * piby2_1tail;
    mulsd        xmm1, xmm2                     ; rtail
    movd         eax, xmm0

; GET_BITS_DP64(rhead-rtail, uy);
; originally only rhead
    movapd       xmm0, xmm4
    subsd        xmm0, xmm1

    movsd        xmm3, QWORD PTR L_piby2_2      ; piby2_2
    movd         rcx, xmm0                      ; rcx <-- rhead-rtail
    movsd        xmm5, QWORD PTR L_piby2_2tail  ; piby2_2tail

;      region = npi2 & 3;
;    and        eax, 3
;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    shl          rcx, 1                         ; strip any sign bit
    shr          rcx, 53                        ; >> EXPSHIFTBITS_DP64 +1
    sub          r9, rcx                        ; expdiff

;;      if (expdiff > 15)
    cmp          r9, 15
    jle          Lcosf_sse2_expdiff_le_15

; The remainder is pretty small compared with x, which
; implies that x is a near multiple of pi/2
; (x matches the multiple to at least 15 bits)
;          t  = rhead;
    movapd       xmm1, xmm4

;          rtail  = npi2 * piby2_2;
    mulsd        xmm3, xmm2

;          rhead  = t - rtail;
    mulsd        xmm5, xmm2                     ; npi2 * piby2_2tail
    subsd        xmm4, xmm3                     ; rhead

;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
    subsd        xmm1, xmm4                     ; t - rhead
    subsd        xmm1, xmm3                     ; -rtail
    subsd        xmm5, xmm1                     ; rtail

;      r = rhead - rtail;
    movapd       xmm0, xmm4

;HARSHA
;xmm1=rtail
    movapd       xmm1, xmm5
    subsd        xmm0, xmm5

;    xmm0=r, xmm4=rhead, xmm1=rtail

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lcosf_sse2_expdiff_le_15:
    cmp          rcx, 03f2h                     ; is r < 2^-13 ?
    jge          Lcosf_sse2_calc_sincosf_piby4  ; use taylor series if not
    cmp          rcx, 03deh                     ; is r < 2^-33 ?
    jle          Lcosf_sse2_r_very_small        ; then cosf(r) ~ 1 or r

    movapd       xmm2, xmm0
    mulsd        xmm2, xmm0                     ; xmm2 <-- x^2

;;      if region is 1 or 3 do a sinf calc.
    and          r8d, eax
    jz           Lcosf_sse2_r_small_calc_sin

Lcosf_sse2_r_small_calc_cos:
; region 1 or 3
; use simply polynomial
;              *s = x - x*x*x*0.166666666666666666;
    movsd        xmm3, QWORD PTR L_one_sixth
    mulsd        xmm3, xmm0                     ; * x
    mulsd        xmm3, xmm2                     ; * x^2
    subsd        xmm0, xmm3                     ; xs
    jmp          Lcosf_sse2_adjust_region

ALIGN 16
Lcosf_sse2_r_small_calc_sin:
; region 0 or 2
;              cos = 1.0 - x*x*0.5;
    movsd        xmm0, QWORD PTR L_real_one     ; 1.0
    mulsd        xmm2, QWORD PTR L_one_half     ; 0.5 *x^2
    subsd        xmm0, xmm2
    jmp          Lcosf_sse2_adjust_region

ALIGN 16
Lcosf_sse2_r_very_small:
; then sin(r) = r
; if region is 1 or 3    do a sin calc.
    and          r8d, eax
    jnz          Lcosf_sse2_adjust_region

    movsd        xmm0, QWORD PTR L_real_one  ; cosf(r) is a 1
    ; By this point, calculations should already have set inexact
    jmp          Lcosf_sse2_adjust_region

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lcosf_sse2_reduce_precise:
;      Reduce abs(x) into range [-pi/4, pi/4]
;      remainder_piby2d2f(ax, &r, &region);
    mov          QWORD PTR p_temp[rsp], rdx     ; save ux for use later
    mov          QWORD PTR p_temp1[rsp], r10    ; save ax for use later

    call         __remainder_piby2d2f_forAsm
    mov          rdx, QWORD PTR p_temp[rsp]     ; restore ux for use later
    mov          r10, QWORD PTR p_temp1[rsp]    ; restore ax for use later
    mov          r8d, 1                         ; for determining region later

    ; Reduced argument is in xmm0.  No second word; after all, we started in
    ; single precision.  Region is in rax.
    movapd       xmm1, xmm0
    movsd        xmm5, QWORD PTR L_one_half

    jmp          Lcosf_sse2_calc_sincosf_piby4


; done with reducing the argument.  Now perform the sin/cos calculations.
ALIGN 16
Lcosf_sse2_calc_sincosf_piby4:
    movapd       xmm2, xmm0
    mulsd        xmm2, xmm0                     ; x^2

;;       if region is 0 or 2, do a cosf calc
    and          r8d, eax
    jz           Lcosf_sse2_do_cosf_calc
;   region is 1 or 3: do a sinf calc.
Lcosf_sse2_do_sinf_calc:
    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
    mulsd   xmm1, xmm2                                ; s4x2
    movsd   xmm4, xmm2                                ; move for x4    
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
    mulsd   xmm5, xmm2                                ; s2x2
    movsd   xmm3, xmm0                                ; move for x3
    mulsd   xmm3, xmm2                                ; x3        
    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
    mulsd   xmm1, xmm4                                ; s3x4+s4x6     
    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp     Lcosf_sse2_adjust_region

ALIGN 16
Lcosf_sse2_do_cosf_calc:
; region 0 or 2     - do a cos calculation
;  zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
;     zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
    movsd   xmm1, QWORD PTR __Lcosfarray+20h          ; c4
    movsd   xmm4, xmm2                                ; move for x4
    mulsd   xmm1, xmm2                                ; c4x2
    movsd   xmm3, QWORD PTR __Lcosfarray+10h          ; c2
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm0, QWORD PTR __Lcosfarray              ; c0
    mulsd   xmm3, xmm2                                ; c2x2
    mulsd   xmm0, xmm2                                ; c0x2 (=-0.5x2)
    addsd   xmm1, QWORD PTR __Lcosfarray+18h          ; c3+c4x2
    mulsd   xmm1, xmm4                                ; c3x4 + c4x6
    addsd   xmm3, QWORD PTR __Lcosfarray+8h           ; c1+c2x2
    addsd   xmm1, xmm3                                ; c1 + c2x2 + c3x4 + c4x6
    mulsd   xmm1, xmm4                                ; c1x4 + c2x6 + c3x8 + c4x10
    addsd   xmm0, QWORD PTR L_real_one                ; 1 - 0.5x2
    addsd   xmm0, xmm1                                ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10

Lcosf_sse2_adjust_region:
; xmm1 is cos or sin, relies on previous sections to
;      switch (region)
    add          eax, 1
    and          eax, 2
    jz           Lcosf_sse2_cleanup
;; if region 1 or 2 then we negate the result.
    xorpd        xmm2, xmm2
    subsd        xmm2, xmm0
    movapd       xmm0, xmm2

ALIGN 16
Lcosf_sse2_cleanup:
    cvtsd2ss     xmm0, xmm0
    StackDeallocate stack_size
    ret


Lcosf_sse2_naninf:
    call         fname_special
    StackDeallocate stack_size
    ret


ALIGN 16
Lcosf_fma3:
    vmovd        eax,xmm0
    mov          r8d,L_inf_mask_32
    and          eax,r8d
    cmp          eax, r8d
    jz           Lcosf_fma3_naninf

    vcvtss2sd    xmm5,xmm0,xmm0
    vmovq        r9,xmm5
    btr          r9,63                    ;clear sign

    cmp          r9,L_piby2
    jg           Lcosf_fma3_range_reduce
    cmp          r9,L_two_to_neg_7
    jge          Lcosf_fma3_compute_cosf_piby_4
    cmp          r9,L_two_to_neg_13
    jge          Lcosf_fma3_compute_1_xx_5

    vmovq        xmm0,QWORD PTR L_real_one
    ; Here we need to set inexact
    vaddsd       xmm0,xmm0,L__min_norm_double  ; this will set inexact
    jmp          Lcosf_fma3_return

ALIGN 16
Lcosf_fma3_compute_1_xx_5:
    vmulsd       xmm0,xmm5,QWORD PTR L_one_half
    vfnmadd213sd xmm0,xmm5,L_real_one           ; xmm9 1.0 - x*x*(double2)0.5
    jmp          Lcosf_fma3_return

ALIGN 16
Lcosf_fma3_compute_cosf_piby_4:
    movsd        xmm0,xmm5
    vmovapd      xmm2,L_real_one
    vmulsd       xmm3,xmm0,xmm0
    vmulsd       xmm1,xmm3,L_one_half           ; xmm1 <-- r
    vsubsd       xmm2,xmm2,xmm1
    vmovsd       xmm1,__Lcosfarray+018h
    vfmadd231sd  xmm1,xmm3,__Lcosfarray+020h
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+010h
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+008h
    vmulsd       xmm3,xmm3,xmm3                 ; xmm3 <-- x^4
    vmovdqa      xmm0,xmm2
    vfmadd231sd  xmm0,xmm1,xmm3
    jmp          Lcosf_fma3_return

ALIGN 16
Lcosf_fma3_range_reduce:
    vmovq        xmm0,r9                        ; xmm0 <-- |x|
    cmp          r9,L_large_x_fma3
    jge          Lcosf_reduce_precise

;cosff_range_e_5_s:
    vandpd       xmm1,xmm0,L_sign_mask
    vmovapd      xmm2,L_2bypi
    vfmadd213sd  xmm2,xmm1,L_one_half
    vcvttpd2dq   xmm2,xmm2
    vpmovsxdq    xmm1,xmm2
    vandpd       xmm4,xmm1,L__int_three         ; region xmm4
    vshufps      xmm1 ,xmm1,xmm1,8
    vcvtdq2pd    xmm1,xmm1
    vmovdqa      xmm2,xmm0
    vfnmadd231sd xmm2,xmm1,L_piby2_1            ; xmm2 rhead
    vmulsd       xmm3,xmm1,L_piby2_1tail        ; xmm3 rtail
    vsubsd       xmm0,xmm2,xmm3                 ; r_1  xmm0
    vsubsd       xmm2,xmm2,xmm0
    vsubsd       xmm1,xmm2,xmm3
    vmovq        rax,xmm4
    jmp          Lcosf_exit_s

ALIGN 16
Lcosf_reduce_precise:

    vmovq        xmm0,r9               ; r9 <-- |x|
    cmp          r9,L_large_x_fma3
    jge          Lcos_remainder_piby2

    ; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
    ; have the following conventions:
    ; on input
    ;   x is in xmm0
    ; on output
    ;   r is in xmm0
    ;   rr is in xmm1
    ;   region is in rax
    ; The _bdl routine is guaranteed not to touch r10

Lcos_remainder_piby2_small: ;; unused label
    ; Boldo-Daumas-Li reduction for reasonably small |x|
    call         __remainder_piby2_fma3_bdl
    jmp          Lcosf_exit_s

ALIGN 16
Lcos_remainder_piby2:
    ; argument reduction for general x
    call         __remainder_piby2_fma3
Lcosf_exit_s:
    bt           rax,0
    jnc          Lcosf_piby4_compute

;sinf_piby4_compute:
;   vmovapd      xmm1,__Lsinfarray+010h
    vmovsd       xmm1,__Lsinfarray+010h
    vmulsd       xmm3,xmm0,xmm0
    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray+008h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray
    vmulsd       xmm3,xmm0,xmm3                 ; xmm3 <-- x^3
    vfmadd231sd  xmm0,xmm1,xmm3
    jmp          Lcosf_fma3_adjust_sign

ALIGN 16
Lcosf_piby4_compute:
    vmovapd      xmm2,L_real_one
    vmulsd       xmm3,xmm0,xmm0
    vmulsd       xmm1,xmm3,L_one_half           ; xmm1 <-- r
    vsubsd       xmm2,xmm2,xmm1
    vmovsd       xmm1,__Lcosfarray+018h
    vfmadd231sd  xmm1 ,xmm3,__Lcosfarray+020h
    vfmadd213sd  xmm1 ,xmm3,__Lcosfarray+010h
    vfmadd213sd  xmm1 ,xmm3,__Lcosfarray+008h
    vmulsd       xmm3,xmm3,xmm3                 ; xmm3 <-- x^4
    vmovdqa      xmm0, xmm2
    vfmadd231sd  xmm0 ,xmm1,xmm3

Lcosf_fma3_adjust_sign:
    ; assuming FMA3 ==> AVX ==> SSE4.1
;    vpcmpeqq     xmm1,xmm4,XMMWORD PTR L_int_one
;    vpcmpeqq     xmm2,xmm4,XMMWORD PTR L_int_two
;    vorpd        xmm3,xmm2,xmm1

;    vandpd       xmm3,xmm3,L_signbit

    add          rax,1                    ; 1,2 --> 2,3
    shr          rax,1                    ; 2,3 --> 1
    shl          rax,63                   ; 1 --> sign bit
    vmovq        xmm3,rax

    vxorpd       xmm0,xmm0,xmm3

Lcosf_fma3_return:
    vcvtsd2ss    xmm0,xmm0,xmm0
    StackDeallocate stack_size
    ret

Lcosf_fma3_naninf:
    call         fname_special
    StackDeallocate stack_size
    ret

fname  endp
END
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 10:02:01 +00:00			`;`
			`; MIT License`
			`; -----------`
			`;`
			`; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.`
			`;`
			`; Permission is hereby granted, free of charge, to any person obtaining a copy`
			`; of this Software and associated documentaon files (the "Software"), to deal`
			`; in the Software without restriction, including without limitation the rights`
			`; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`; copies of the Software, and to permit persons to whom the Software is`
			`; furnished to do so, subject to the following conditions:`
			`;`
			`; The above copyright notice and this permission notice shall be included in`
			`; all copies or substantial portions of the Software.`
			`;`
			`; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN`
			`; THE SOFTWARE.`
			`;`
			`; An implementation of the cosf function.`
			`;`
			`; Prototype:`
			`;`
			`; float cosf(float x);`
			`;`
			`; Computes cosf(x).`
			`; Based on the NAG C implementation.`
			`; It will provide proper C99 return values,`
			`; but may not raise floating point status bits properly.`
			`; Original Author: Harsha Jagasia`

			`.const`
			`ALIGN 16`
			`L_real_one DQ 03ff0000000000000h ; 1.0`
			`DQ 0 ; for alignment`
			`L_one_half DQ 03fe0000000000000h ; 0.5`
			`DQ 0`
			`L_2bypi DQ 03fe45f306dc9c883h ; 2./pi`
			`DQ 0`
			`L_one_sixth DQ 03fc5555555555555h ; 0.166666666666`
			`DQ 0`
			`L_piby2 DQ 03fe921fb54442d18h`
			`DQ 0`
			`L_piby2_1 DQ 03ff921fb54400000h ; piby2_1`
			`DQ 0`
			`L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail`
			`DQ 0`
			`L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2`
			`DQ 0`
			`L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail`
			`DQ 0`
			`L_large_x_sse2 DQ 0411E848000000000h ; 5e5`
			`DQ 0`
			`L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9`
			`DQ 0`
			`L_sign_mask DQ 07FFFFFFFFFFFFFFFh`
			`DQ 07FFFFFFFFFFFFFFFh`
			`L__int_three DQ 00000000000000003h`
			`DQ 00000000000000003h`
			`L__min_norm_double DQ 00010000000000000h`
			`DQ 00010000000000000h`
			`L_two_to_neg_7 DQ 03f80000000000000h`
			`DQ 0`
			`L_two_to_neg_13 DQ 03f20000000000000h`
			`DQ 0`
			`L_inf_mask_32 DD 07F800000h`
			`DQ 0`

			`fname TEXTEQU <cosf>`
			`fname_special TEXTEQU <_cosf_special>`

			`;Define name and any external functions being called`
			`EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR`
			`EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR`
			`EXTERN __remainder_piby2_fma3 : PROC ; NEAR`
			`EXTERN fname_special : PROC`
			`EXTERN _set_statfp : PROC`


			`EXTRN __Lcosfarray:QWORD`
			`EXTRN __Lsinfarray:QWORD`
			`EXTRN __use_fma3_lib:DWORD`

			`; define local variable storage offsets`
			`p_temp equ 020h ; temporary for get/put bits operation`
			`p_temp1 equ 030h ; temporary for get/put bits operation`
			`dummy_space EQU 040h`
			`stack_size EQU 068h`

			`include fm.inc`

			`.code`

			`ALIGN 16`
			`PUBLIC fname`
			`fname PROC FRAME`
			`StackAllocate stack_size`
			`.ENDPROLOG`
			`cmp DWORD PTR __use_fma3_lib, 0`
			`jne Lcosf_fma3`

			`Lcosf_sse2:`

			`xorpd xmm2, xmm2 ; zeroed out for later use`

			`;; if NaN or inf`
			`movd edx, xmm0`
			`mov eax, 07f800000h`
			`mov r10d, eax`
			`and r10d, edx`
			`cmp r10d, eax`
			`jz Lcosf_sse2_naninf`

			`cvtss2sd xmm0, xmm0`
			`movd rdx, xmm0`

			`; ax = (ux & ~SIGNBIT_DP64);`
			`mov r10, rdx`
			`btr r10, 63 ; r10 <-- \|x\|`
			`mov r8d, 1 ; for determining region later on`

			`movapd xmm1, xmm0 ; xmm1 <-- copy of x`


			`;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */`
			`mov rax, 03fe921fb54442d18h`
			`cmp r10, rax`
			`jg Lcosf_sse2_absx_gt_piby4`

			`; *c = cos_piby4(x, 0.0);`
			`movapd xmm2, xmm0`
			`mulsd xmm2, xmm2 ;x^2`
			`xor eax, eax`
			`mov rdx, r10`
			`movsd xmm5, QWORD PTR L_one_half`
			`jmp Lcosf_sse2_calc_sincosf_piby4 ; done`


			`ALIGN 16`
			`Lcosf_sse2_absx_gt_piby4:`
			`; reduce the argument to be in a range from -pi/4 to +pi/4`
			`; by subtracting multiples of pi/2`
			`; xneg = (ax != ux);`
			`movd xmm0, r10 ; xmm0 <-- \|x\|`
			`cmp r10, QWORD PTR L_large_x_sse2`
			`jae Lcosf_sse2_reduce_precise`

			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`; xmm0=abs(x), xmm1=x`
			`;/* How many pi/2 is x a multiple of? */`

			`movapd xmm2, xmm0`
			`movsd xmm3, QWORD PTR L_2bypi`
			`movapd xmm4, xmm0`
			`movsd xmm5, QWORD PTR L_one_half`
			`mulsd xmm2, xmm3`

			`; movsd xmm5, QWORD PTR L_one_half`
			`; movapd xmm2, xmm0`
			`; mulsd xmm2, QWORD PTR L_2bypi`
			`; movapd xmm4, xmm0`

			`mov r9, r10`
			`shr r9, 52 ; r9 <-- biased exponent of x`

			`; npi2 = (int)(x * twobypi + 0.5);`
			`addsd xmm2, xmm5 ; npi2`

			`movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1`
			`cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2`
			`movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail`
			`cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2`

			`; Subtract the multiple from x to get an extra-precision remainder`
			`; rhead = x - npi2 * piby2_1;`

			`mulsd xmm3, xmm2 ; use piby2_1`
			`subsd xmm4, xmm3 ; rhead`

			`; rtail = npi2 * piby2_1tail;`
			`mulsd xmm1, xmm2 ; rtail`
			`movd eax, xmm0`

			`; GET_BITS_DP64(rhead-rtail, uy);`
			`; originally only rhead`
			`movapd xmm0, xmm4`
			`subsd xmm0, xmm1`

			`movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2`
			`movd rcx, xmm0 ; rcx <-- rhead-rtail`
			`movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail`

			`; region = npi2 & 3;`
			`; and eax, 3`
			`; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);`
			`shl rcx, 1 ; strip any sign bit`
			`shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1`
			`sub r9, rcx ; expdiff`

			`;; if (expdiff > 15)`
			`cmp r9, 15`
			`jle Lcosf_sse2_expdiff_le_15`

			`; The remainder is pretty small compared with x, which`
			`; implies that x is a near multiple of pi/2`
			`; (x matches the multiple to at least 15 bits)`
			`; t = rhead;`
			`movapd xmm1, xmm4`

			`; rtail = npi2 * piby2_2;`
			`mulsd xmm3, xmm2`

			`; rhead = t - rtail;`
			`mulsd xmm5, xmm2 ; npi2 * piby2_2tail`
			`subsd xmm4, xmm3 ; rhead`

			`; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);`
			`subsd xmm1, xmm4 ; t - rhead`
			`subsd xmm1, xmm3 ; -rtail`
			`subsd xmm5, xmm1 ; rtail`

			`; r = rhead - rtail;`
			`movapd xmm0, xmm4`

			`;HARSHA`
			`;xmm1=rtail`
			`movapd xmm1, xmm5`
			`subsd xmm0, xmm5`

			`; xmm0=r, xmm4=rhead, xmm1=rtail`

			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`Lcosf_sse2_expdiff_le_15:`
			`cmp rcx, 03f2h ; is r < 2^-13 ?`
			`jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not`
			`cmp rcx, 03deh ; is r < 2^-33 ?`
			`jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r`

			`movapd xmm2, xmm0`
			`mulsd xmm2, xmm0 ; xmm2 <-- x^2`

			`;; if region is 1 or 3 do a sinf calc.`
			`and r8d, eax`
			`jz Lcosf_sse2_r_small_calc_sin`

			`Lcosf_sse2_r_small_calc_cos:`
			`; region 1 or 3`
			`; use simply polynomial`
			`; s = x - xxx0.166666666666666666;`
			`movsd xmm3, QWORD PTR L_one_sixth`
			`mulsd xmm3, xmm0 ; * x`
			`mulsd xmm3, xmm2 ; * x^2`
			`subsd xmm0, xmm3 ; xs`
			`jmp Lcosf_sse2_adjust_region`

			`ALIGN 16`
			`Lcosf_sse2_r_small_calc_sin:`
			`; region 0 or 2`
			`; cos = 1.0 - xx0.5;`
			`movsd xmm0, QWORD PTR L_real_one ; 1.0`
			`mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2`
			`subsd xmm0, xmm2`
			`jmp Lcosf_sse2_adjust_region`

			`ALIGN 16`
			`Lcosf_sse2_r_very_small:`
			`; then sin(r) = r`
			`; if region is 1 or 3 do a sin calc.`
			`and r8d, eax`
			`jnz Lcosf_sse2_adjust_region`

			`movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1`
			`; By this point, calculations should already have set inexact`
			`jmp Lcosf_sse2_adjust_region`

			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
			`ALIGN 16`
			`Lcosf_sse2_reduce_precise:`
			`; Reduce abs(x) into range [-pi/4, pi/4]`
			`; remainder_piby2d2f(ax, &r, &region);`
			`mov QWORD PTR p_temp[rsp], rdx ; save ux for use later`
			`mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later`

			`call __remainder_piby2d2f_forAsm`
			`mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later`
			`mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later`
			`mov r8d, 1 ; for determining region later`

			`; Reduced argument is in xmm0. No second word; after all, we started in`
			`; single precision. Region is in rax.`
			`movapd xmm1, xmm0`
			`movsd xmm5, QWORD PTR L_one_half`

			`jmp Lcosf_sse2_calc_sincosf_piby4`


			`; done with reducing the argument. Now perform the sin/cos calculations.`
			`ALIGN 16`
			`Lcosf_sse2_calc_sincosf_piby4:`
			`movapd xmm2, xmm0`
			`mulsd xmm2, xmm0 ; x^2`

			`;; if region is 0 or 2, do a cosf calc`
			`and r8d, eax`
			`jz Lcosf_sse2_do_cosf_calc`
			`; region is 1 or 3: do a sinf calc.`
			`Lcosf_sse2_do_sinf_calc:`
			`movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4`
			`mulsd xmm1, xmm2 ; s4x2`
			`movsd xmm4, xmm2 ; move for x4`
			`mulsd xmm4, xmm2 ; x4`
			`movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2`
			`mulsd xmm5, xmm2 ; s2x2`
			`movsd xmm3, xmm0 ; move for x3`
			`mulsd xmm3, xmm2 ; x3`
			`addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2`
			`mulsd xmm1, xmm4 ; s3x4+s4x6`
			`addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2`
			`addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6`
			`mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)`
			`addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)`
			`jmp Lcosf_sse2_adjust_region`

			`ALIGN 16`
			`Lcosf_sse2_do_cosf_calc:`
			`; region 0 or 2 - do a cos calculation`
			`; zc = 1-0.5x2+ c1x4 +c2x6 +c3x8;`
			`; zc = 1-0.5x2+ c1x4 +c2x6 +c3x8 + c4*x10 for a higher precision`
			`movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4`
			`movsd xmm4, xmm2 ; move for x4`
			`mulsd xmm1, xmm2 ; c4x2`
			`movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2`
			`mulsd xmm4, xmm2 ; x4`
			`movsd xmm0, QWORD PTR __Lcosfarray ; c0`
			`mulsd xmm3, xmm2 ; c2x2`
			`mulsd xmm0, xmm2 ; c0x2 (=-0.5x2)`
			`addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2`
			`mulsd xmm1, xmm4 ; c3x4 + c4x6`
			`addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2`
			`addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6`
			`mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10`
			`addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2`
			`addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10`

			`Lcosf_sse2_adjust_region:`
			`; xmm1 is cos or sin, relies on previous sections to`
			`; switch (region)`
			`add eax, 1`
			`and eax, 2`
			`jz Lcosf_sse2_cleanup`
			`;; if region 1 or 2 then we negate the result.`
			`xorpd xmm2, xmm2`
			`subsd xmm2, xmm0`
			`movapd xmm0, xmm2`

			`ALIGN 16`
			`Lcosf_sse2_cleanup:`
			`cvtsd2ss xmm0, xmm0`
			`StackDeallocate stack_size`
			`ret`


			`Lcosf_sse2_naninf:`
			`call fname_special`
			`StackDeallocate stack_size`
			`ret`


			`ALIGN 16`
			`Lcosf_fma3:`
			`vmovd eax,xmm0`
			`mov r8d,L_inf_mask_32`
			`and eax,r8d`
			`cmp eax, r8d`
			`jz Lcosf_fma3_naninf`

			`vcvtss2sd xmm5,xmm0,xmm0`
			`vmovq r9,xmm5`
			`btr r9,63 ;clear sign`

			`cmp r9,L_piby2`
			`jg Lcosf_fma3_range_reduce`
			`cmp r9,L_two_to_neg_7`
			`jge Lcosf_fma3_compute_cosf_piby_4`
			`cmp r9,L_two_to_neg_13`
			`jge Lcosf_fma3_compute_1_xx_5`

			`vmovq xmm0,QWORD PTR L_real_one`
			`; Here we need to set inexact`
			`vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact`
			`jmp Lcosf_fma3_return`

			`ALIGN 16`
			`Lcosf_fma3_compute_1_xx_5:`
			`vmulsd xmm0,xmm5,QWORD PTR L_one_half`
			`vfnmadd213sd xmm0,xmm5,L_real_one ; xmm9 1.0 - xx(double2)0.5`
			`jmp Lcosf_fma3_return`

			`ALIGN 16`
			`Lcosf_fma3_compute_cosf_piby_4:`
			`movsd xmm0,xmm5`
			`vmovapd xmm2,L_real_one`
			`vmulsd xmm3,xmm0,xmm0`
			`vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r`
			`vsubsd xmm2,xmm2,xmm1`
			`vmovsd xmm1,__Lcosfarray+018h`
			`vfmadd231sd xmm1,xmm3,__Lcosfarray+020h`
			`vfmadd213sd xmm1,xmm3,__Lcosfarray+010h`
			`vfmadd213sd xmm1,xmm3,__Lcosfarray+008h`
			`vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4`
			`vmovdqa xmm0,xmm2`
			`vfmadd231sd xmm0,xmm1,xmm3`
			`jmp Lcosf_fma3_return`

			`ALIGN 16`
			`Lcosf_fma3_range_reduce:`
			`vmovq xmm0,r9 ; xmm0 <-- \|x\|`
			`cmp r9,L_large_x_fma3`
			`jge Lcosf_reduce_precise`

			`;cosff_range_e_5_s:`
			`vandpd xmm1,xmm0,L_sign_mask`
			`vmovapd xmm2,L_2bypi`
			`vfmadd213sd xmm2,xmm1,L_one_half`
			`vcvttpd2dq xmm2,xmm2`
			`vpmovsxdq xmm1,xmm2`
			`vandpd xmm4,xmm1,L__int_three ; region xmm4`
			`vshufps xmm1 ,xmm1,xmm1,8`
			`vcvtdq2pd xmm1,xmm1`
			`vmovdqa xmm2,xmm0`
			`vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead`
			`vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail`
			`vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0`
			`vsubsd xmm2,xmm2,xmm0`
			`vsubsd xmm1,xmm2,xmm3`
			`vmovq rax,xmm4`
			`jmp Lcosf_exit_s`

			`ALIGN 16`
			`Lcosf_reduce_precise:`

			`vmovq xmm0,r9 ; r9 <-- \|x\|`
			`cmp r9,L_large_x_fma3`
			`jge Lcos_remainder_piby2`

			`; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl`
			`; have the following conventions:`
			`; on input`
			`; x is in xmm0`
			`; on output`
			`; r is in xmm0`
			`; rr is in xmm1`
			`; region is in rax`
			`; The _bdl routine is guaranteed not to touch r10`

			`Lcos_remainder_piby2_small: ;; unused label`
			`; Boldo-Daumas-Li reduction for reasonably small \|x\|`
			`call __remainder_piby2_fma3_bdl`
			`jmp Lcosf_exit_s`

			`ALIGN 16`
			`Lcos_remainder_piby2:`
			`; argument reduction for general x`
			`call __remainder_piby2_fma3`
			`Lcosf_exit_s:`
			`bt rax,0`
			`jnc Lcosf_piby4_compute`

			`;sinf_piby4_compute:`
			`; vmovapd xmm1,__Lsinfarray+010h`
			`vmovsd xmm1,__Lsinfarray+010h`
			`vmulsd xmm3,xmm0,xmm0`
			`vfmadd231sd xmm1,xmm3,__Lsinfarray+018h`
			`vfmadd213sd xmm1,xmm3,__Lsinfarray+008h`
			`vfmadd213sd xmm1,xmm3,__Lsinfarray`
			`vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3`
			`vfmadd231sd xmm0,xmm1,xmm3`
			`jmp Lcosf_fma3_adjust_sign`

			`ALIGN 16`
			`Lcosf_piby4_compute:`
			`vmovapd xmm2,L_real_one`
			`vmulsd xmm3,xmm0,xmm0`
			`vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r`
			`vsubsd xmm2,xmm2,xmm1`
			`vmovsd xmm1,__Lcosfarray+018h`
			`vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h`
			`vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h`
			`vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h`
			`vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4`
			`vmovdqa xmm0, xmm2`
			`vfmadd231sd xmm0 ,xmm1,xmm3`

			`Lcosf_fma3_adjust_sign:`
			`; assuming FMA3 ==> AVX ==> SSE4.1`
			`; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one`
			`; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two`
			`; vorpd xmm3,xmm2,xmm1`

			`; vandpd xmm3,xmm3,L_signbit`

			`add rax,1 ; 1,2 --> 2,3`
			`shr rax,1 ; 2,3 --> 1`
			`shl rax,63 ; 1 --> sign bit`
			`vmovq xmm3,rax`

			`vxorpd xmm0,xmm0,xmm3`

			`Lcosf_fma3_return:`
			`vcvtsd2ss xmm0,xmm0,xmm0`
			`StackDeallocate stack_size`
			`ret`

			`Lcosf_fma3_naninf:`
			`call fname_special`
			`StackDeallocate stack_size`
			`ret`

			`fname endp`
			`END`