[LIBM] Import win-libm from AMD

Source: https://github.com/amd/win-libm
2025-07-31 18:42:14 +00:00 · 2022-06-12 12:02:01 +02:00 · 2022-06-12 12:02:01 +02:00 · 4afb647c78
commit 4afb647c78
parent 081c637c06
82 changed files with 22392 additions and 0 deletions
--- a/sdk/lib/crt/math/libm_sse2/cosf.asm
+++ b/sdk/lib/crt/math/libm_sse2/cosf.asm
@ -0,0 +1,525 @@
+;
+; MIT License
+; -----------
+; 
+; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
+; 
+; Permission is hereby granted, free of charge, to any person obtaining a copy
+; of this Software and associated documentaon files (the "Software"), to deal
+; in the Software without restriction, including without limitation the rights
+; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+; copies of the Software, and to permit persons to whom the Software is
+; furnished to do so, subject to the following conditions:
+; 
+; The above copyright notice and this permission notice shall be included in
+; all copies or substantial portions of the Software.
+; 
+; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+; THE SOFTWARE.
+;
+; An implementation of the cosf function.
+;
+; Prototype:
+;
+;     float cosf(float x);
+;
+;   Computes cosf(x).
+;   Based on the NAG C implementation.
+;   It will provide proper C99 return values,
+;   but may not raise floating point status bits properly.
+;   Original Author: Harsha Jagasia
+
+.const
+ALIGN 16
+L_real_one                 DQ 03ff0000000000000h      ; 1.0
+                           DQ 0                          ; for alignment
+L_one_half                 DQ 03fe0000000000000h      ; 0.5
+                           DQ 0
+L_2bypi                    DQ 03fe45f306dc9c883h      ; 2./pi
+                           DQ 0
+L_one_sixth                DQ 03fc5555555555555h      ; 0.166666666666
+                           DQ 0
+L_piby2                    DQ 03fe921fb54442d18h
+                           DQ 0
+L_piby2_1                  DQ 03ff921fb54400000h     ; piby2_1
+                           DQ 0
+L_piby2_1tail              DQ 03dd0b4611a626331h     ; piby2_1tail
+                           DQ 0
+L_piby2_2                  DQ 03dd0b4611a600000h     ; piby2_2
+                           DQ 0
+L_piby2_2tail              DQ 03ba3198a2e037073h     ; piby2_2tail
+                           DQ 0
+L_large_x_sse2             DQ 0411E848000000000h     ; 5e5
+                           DQ 0
+L_large_x_fma3             DQ 041E921FB60000000h     ; 3.37325952e9
+                           DQ 0
+L_sign_mask                DQ 07FFFFFFFFFFFFFFFh
+                           DQ 07FFFFFFFFFFFFFFFh
+L__int_three               DQ 00000000000000003h
+                           DQ 00000000000000003h
+L__min_norm_double         DQ 00010000000000000h
+                           DQ 00010000000000000h
+L_two_to_neg_7             DQ 03f80000000000000h
+                           DQ 0
+L_two_to_neg_13            DQ 03f20000000000000h
+                           DQ 0
+L_inf_mask_32              DD 07F800000h
+                           DQ 0
+
+fname           TEXTEQU <cosf>
+fname_special   TEXTEQU <_cosf_special>
+
+;Define name and any external functions being called
+EXTERN           __remainder_piby2d2f_forAsm : PROC    ; NEAR
+EXTERN           __remainder_piby2_fma3_bdl  : PROC   ; NEAR
+EXTERN           __remainder_piby2_fma3      : PROC   ; NEAR
+EXTERN           fname_special      : PROC
+EXTERN           _set_statfp        : PROC
+
+
+EXTRN __Lcosfarray:QWORD
+EXTRN __Lsinfarray:QWORD
+EXTRN __use_fma3_lib:DWORD
+
+; define local variable storage offsets
+p_temp           equ        020h          ; temporary for get/put bits operation
+p_temp1          equ        030h          ; temporary for get/put bits operation
+dummy_space      EQU        040h
+stack_size       EQU        068h
+
+include fm.inc
+
+.code
+
+ALIGN 16
+PUBLIC fname
+fname PROC FRAME
+    StackAllocate stack_size
+    .ENDPROLOG
+    cmp          DWORD PTR __use_fma3_lib, 0
+    jne          Lcosf_fma3
+
+Lcosf_sse2:
+
+    xorpd        xmm2, xmm2               ; zeroed out for later use
+
+;;  if NaN or inf
+    movd         edx, xmm0
+    mov          eax, 07f800000h
+    mov          r10d, eax
+    and          r10d, edx
+    cmp          r10d, eax
+    jz           Lcosf_sse2_naninf
+
+    cvtss2sd     xmm0, xmm0
+    movd         rdx, xmm0
+
+;  ax = (ux & ~SIGNBIT_DP64);
+    mov          r10, rdx
+    btr          r10, 63                  ; r10 <-- |x|
+    mov          r8d, 1                   ; for determining region later on
+
+    movapd       xmm1, xmm0               ; xmm1 <-- copy of x
+
+
+;;  if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
+    mov          rax, 03fe921fb54442d18h
+    cmp          r10, rax
+    jg           Lcosf_sse2_absx_gt_piby4
+
+;          *c = cos_piby4(x, 0.0);
+    movapd       xmm2, xmm0
+    mulsd        xmm2, xmm2        ;x^2
+    xor          eax, eax
+    mov          rdx, r10
+    movsd        xmm5, QWORD PTR L_one_half
+    jmp          Lcosf_sse2_calc_sincosf_piby4        ; done
+
+
+ALIGN 16
+Lcosf_sse2_absx_gt_piby4:
+; reduce  the argument to be in a range from -pi/4 to +pi/4
+; by subtracting multiples of pi/2
+;  xneg = (ax != ux);
+    movd         xmm0, r10                ; xmm0 <-- |x|
+    cmp          r10, QWORD PTR L_large_x_sse2
+    jae          Lcosf_sse2_reduce_precise
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; xmm0=abs(x), xmm1=x
+;/* How many pi/2 is x a multiple of? */
+
+    movapd       xmm2, xmm0
+    movsd        xmm3, QWORD PTR L_2bypi
+    movapd       xmm4, xmm0
+    movsd        xmm5, QWORD PTR L_one_half
+    mulsd        xmm2, xmm3
+
+;   movsd        xmm5, QWORD PTR L_one_half
+;   movapd       xmm2, xmm0
+;   mulsd        xmm2, QWORD PTR L_2bypi
+;   movapd       xmm4, xmm0
+
+    mov     r9, r10
+    shr     r9, 52                        ; r9 <-- biased exponent of x
+
+;        npi2  = (int)(x * twobypi + 0.5);
+    addsd   xmm2, xmm5                          ; npi2
+
+    movsd        xmm3, QWORD PTR L_piby2_1      ; piby2_1
+    cvttpd2dq    xmm0, xmm2                     ; xmm0 <-- npi2
+    movsd        xmm1, QWORD PTR L_piby2_1tail  ; piby2_1tail
+    cvtdq2pd     xmm2, xmm0                     ; xmm2 <-- (double)npi2
+
+;    Subtract the multiple from x to get an extra-precision remainder
+;      rhead  = x - npi2 * piby2_1;
+
+    mulsd        xmm3, xmm2                     ; use piby2_1
+    subsd        xmm4, xmm3                     ; rhead
+
+;      rtail  = npi2 * piby2_1tail;
+    mulsd        xmm1, xmm2                     ; rtail
+    movd         eax, xmm0
+
+; GET_BITS_DP64(rhead-rtail, uy);
+; originally only rhead
+    movapd       xmm0, xmm4
+    subsd        xmm0, xmm1
+
+    movsd        xmm3, QWORD PTR L_piby2_2      ; piby2_2
+    movd         rcx, xmm0                      ; rcx <-- rhead-rtail
+    movsd        xmm5, QWORD PTR L_piby2_2tail  ; piby2_2tail
+
+;      region = npi2 & 3;
+;    and        eax, 3
+;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+    shl          rcx, 1                         ; strip any sign bit
+    shr          rcx, 53                        ; >> EXPSHIFTBITS_DP64 +1
+    sub          r9, rcx                        ; expdiff
+
+;;      if (expdiff > 15)
+    cmp          r9, 15
+    jle          Lcosf_sse2_expdiff_le_15
+
+; The remainder is pretty small compared with x, which
+; implies that x is a near multiple of pi/2
+; (x matches the multiple to at least 15 bits)
+;          t  = rhead;
+    movapd       xmm1, xmm4
+
+;          rtail  = npi2 * piby2_2;
+    mulsd        xmm3, xmm2
+
+;          rhead  = t - rtail;
+    mulsd        xmm5, xmm2                     ; npi2 * piby2_2tail
+    subsd        xmm4, xmm3                     ; rhead
+
+;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
+    subsd        xmm1, xmm4                     ; t - rhead
+    subsd        xmm1, xmm3                     ; -rtail
+    subsd        xmm5, xmm1                     ; rtail
+
+;      r = rhead - rtail;
+    movapd       xmm0, xmm4
+
+;HARSHA
+;xmm1=rtail
+    movapd       xmm1, xmm5
+    subsd        xmm0, xmm5
+
+;    xmm0=r, xmm4=rhead, xmm1=rtail
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+Lcosf_sse2_expdiff_le_15:
+    cmp          rcx, 03f2h                     ; is r < 2^-13 ?
+    jge          Lcosf_sse2_calc_sincosf_piby4  ; use taylor series if not
+    cmp          rcx, 03deh                     ; is r < 2^-33 ?
+    jle          Lcosf_sse2_r_very_small        ; then cosf(r) ~ 1 or r
+
+    movapd       xmm2, xmm0
+    mulsd        xmm2, xmm0                     ; xmm2 <-- x^2
+
+;;      if region is 1 or 3 do a sinf calc.
+    and          r8d, eax
+    jz           Lcosf_sse2_r_small_calc_sin
+
+Lcosf_sse2_r_small_calc_cos:
+; region 1 or 3
+; use simply polynomial
+;              *s = x - x*x*x*0.166666666666666666;
+    movsd        xmm3, QWORD PTR L_one_sixth
+    mulsd        xmm3, xmm0                     ; * x
+    mulsd        xmm3, xmm2                     ; * x^2
+    subsd        xmm0, xmm3                     ; xs
+    jmp          Lcosf_sse2_adjust_region
+
+ALIGN 16
+Lcosf_sse2_r_small_calc_sin:
+; region 0 or 2
+;              cos = 1.0 - x*x*0.5;
+    movsd        xmm0, QWORD PTR L_real_one     ; 1.0
+    mulsd        xmm2, QWORD PTR L_one_half     ; 0.5 *x^2
+    subsd        xmm0, xmm2
+    jmp          Lcosf_sse2_adjust_region
+
+ALIGN 16
+Lcosf_sse2_r_very_small:
+; then sin(r) = r
+; if region is 1 or 3    do a sin calc.
+    and          r8d, eax
+    jnz          Lcosf_sse2_adjust_region
+
+    movsd        xmm0, QWORD PTR L_real_one  ; cosf(r) is a 1
+    ; By this point, calculations should already have set inexact
+    jmp          Lcosf_sse2_adjust_region
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ALIGN 16
+Lcosf_sse2_reduce_precise:
+;      Reduce abs(x) into range [-pi/4, pi/4]
+;      remainder_piby2d2f(ax, &r, &region);
+    mov          QWORD PTR p_temp[rsp], rdx     ; save ux for use later
+    mov          QWORD PTR p_temp1[rsp], r10    ; save ax for use later
+
+    call         __remainder_piby2d2f_forAsm
+    mov          rdx, QWORD PTR p_temp[rsp]     ; restore ux for use later
+    mov          r10, QWORD PTR p_temp1[rsp]    ; restore ax for use later
+    mov          r8d, 1                         ; for determining region later
+
+    ; Reduced argument is in xmm0.  No second word; after all, we started in
+    ; single precision.  Region is in rax.
+    movapd       xmm1, xmm0
+    movsd        xmm5, QWORD PTR L_one_half
+
+    jmp          Lcosf_sse2_calc_sincosf_piby4
+
+
+; done with reducing the argument.  Now perform the sin/cos calculations.
+ALIGN 16
+Lcosf_sse2_calc_sincosf_piby4:
+    movapd       xmm2, xmm0
+    mulsd        xmm2, xmm0                     ; x^2
+
+;;       if region is 0 or 2, do a cosf calc
+    and          r8d, eax
+    jz           Lcosf_sse2_do_cosf_calc
+;   region is 1 or 3: do a sinf calc.
+Lcosf_sse2_do_sinf_calc:
+    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
+    mulsd   xmm1, xmm2                                ; s4x2
+    movsd   xmm4, xmm2                                ; move for x4    
+    mulsd   xmm4, xmm2                                ; x4
+    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
+    mulsd   xmm5, xmm2                                ; s2x2
+    movsd   xmm3, xmm0                                ; move for x3
+    mulsd   xmm3, xmm2                                ; x3        
+    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
+    mulsd   xmm1, xmm4                                ; s3x4+s4x6     
+    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
+    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
+    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
+    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
+    jmp     Lcosf_sse2_adjust_region
+
+ALIGN 16
+Lcosf_sse2_do_cosf_calc:
+; region 0 or 2     - do a cos calculation
+;  zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
+;     zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
+    movsd   xmm1, QWORD PTR __Lcosfarray+20h          ; c4
+    movsd   xmm4, xmm2                                ; move for x4
+    mulsd   xmm1, xmm2                                ; c4x2
+    movsd   xmm3, QWORD PTR __Lcosfarray+10h          ; c2
+    mulsd   xmm4, xmm2                                ; x4
+    movsd   xmm0, QWORD PTR __Lcosfarray              ; c0
+    mulsd   xmm3, xmm2                                ; c2x2
+    mulsd   xmm0, xmm2                                ; c0x2 (=-0.5x2)
+    addsd   xmm1, QWORD PTR __Lcosfarray+18h          ; c3+c4x2
+    mulsd   xmm1, xmm4                                ; c3x4 + c4x6
+    addsd   xmm3, QWORD PTR __Lcosfarray+8h           ; c1+c2x2
+    addsd   xmm1, xmm3                                ; c1 + c2x2 + c3x4 + c4x6
+    mulsd   xmm1, xmm4                                ; c1x4 + c2x6 + c3x8 + c4x10
+    addsd   xmm0, QWORD PTR L_real_one                ; 1 - 0.5x2
+    addsd   xmm0, xmm1                                ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
+
+Lcosf_sse2_adjust_region:
+; xmm1 is cos or sin, relies on previous sections to
+;      switch (region)
+    add          eax, 1
+    and          eax, 2
+    jz           Lcosf_sse2_cleanup
+;; if region 1 or 2 then we negate the result.
+    xorpd        xmm2, xmm2
+    subsd        xmm2, xmm0
+    movapd       xmm0, xmm2
+
+ALIGN 16
+Lcosf_sse2_cleanup:
+    cvtsd2ss     xmm0, xmm0
+    StackDeallocate stack_size
+    ret
+
+
+Lcosf_sse2_naninf:
+    call         fname_special
+    StackDeallocate stack_size
+    ret
+
+
+ALIGN 16
+Lcosf_fma3:
+    vmovd        eax,xmm0
+    mov          r8d,L_inf_mask_32
+    and          eax,r8d
+    cmp          eax, r8d
+    jz           Lcosf_fma3_naninf
+
+    vcvtss2sd    xmm5,xmm0,xmm0
+    vmovq        r9,xmm5
+    btr          r9,63                    ;clear sign
+
+    cmp          r9,L_piby2
+    jg           Lcosf_fma3_range_reduce
+    cmp          r9,L_two_to_neg_7
+    jge          Lcosf_fma3_compute_cosf_piby_4
+    cmp          r9,L_two_to_neg_13
+    jge          Lcosf_fma3_compute_1_xx_5
+
+    vmovq        xmm0,QWORD PTR L_real_one
+    ; Here we need to set inexact
+    vaddsd       xmm0,xmm0,L__min_norm_double  ; this will set inexact
+    jmp          Lcosf_fma3_return
+
+ALIGN 16
+Lcosf_fma3_compute_1_xx_5:
+    vmulsd       xmm0,xmm5,QWORD PTR L_one_half
+    vfnmadd213sd xmm0,xmm5,L_real_one           ; xmm9 1.0 - x*x*(double2)0.5
+    jmp          Lcosf_fma3_return
+
+ALIGN 16
+Lcosf_fma3_compute_cosf_piby_4:
+    movsd        xmm0,xmm5
+    vmovapd      xmm2,L_real_one
+    vmulsd       xmm3,xmm0,xmm0
+    vmulsd       xmm1,xmm3,L_one_half           ; xmm1 <-- r
+    vsubsd       xmm2,xmm2,xmm1
+    vmovsd       xmm1,__Lcosfarray+018h
+    vfmadd231sd  xmm1,xmm3,__Lcosfarray+020h
+    vfmadd213sd  xmm1,xmm3,__Lcosfarray+010h
+    vfmadd213sd  xmm1,xmm3,__Lcosfarray+008h
+    vmulsd       xmm3,xmm3,xmm3                 ; xmm3 <-- x^4
+    vmovdqa      xmm0,xmm2
+    vfmadd231sd  xmm0,xmm1,xmm3
+    jmp          Lcosf_fma3_return
+
+ALIGN 16
+Lcosf_fma3_range_reduce:
+    vmovq        xmm0,r9                        ; xmm0 <-- |x|
+    cmp          r9,L_large_x_fma3
+    jge          Lcosf_reduce_precise
+
+;cosff_range_e_5_s:
+    vandpd       xmm1,xmm0,L_sign_mask
+    vmovapd      xmm2,L_2bypi
+    vfmadd213sd  xmm2,xmm1,L_one_half
+    vcvttpd2dq   xmm2,xmm2
+    vpmovsxdq    xmm1,xmm2
+    vandpd       xmm4,xmm1,L__int_three         ; region xmm4
+    vshufps      xmm1 ,xmm1,xmm1,8
+    vcvtdq2pd    xmm1,xmm1
+    vmovdqa      xmm2,xmm0
+    vfnmadd231sd xmm2,xmm1,L_piby2_1            ; xmm2 rhead
+    vmulsd       xmm3,xmm1,L_piby2_1tail        ; xmm3 rtail
+    vsubsd       xmm0,xmm2,xmm3                 ; r_1  xmm0
+    vsubsd       xmm2,xmm2,xmm0
+    vsubsd       xmm1,xmm2,xmm3
+    vmovq        rax,xmm4
+    jmp          Lcosf_exit_s
+
+ALIGN 16
+Lcosf_reduce_precise:
+
+    vmovq        xmm0,r9               ; r9 <-- |x|
+    cmp          r9,L_large_x_fma3
+    jge          Lcos_remainder_piby2
+
+    ; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
+    ; have the following conventions:
+    ; on input
+    ;   x is in xmm0
+    ; on output
+    ;   r is in xmm0
+    ;   rr is in xmm1
+    ;   region is in rax
+    ; The _bdl routine is guaranteed not to touch r10
+
+Lcos_remainder_piby2_small: ;; unused label
+    ; Boldo-Daumas-Li reduction for reasonably small |x|
+    call         __remainder_piby2_fma3_bdl
+    jmp          Lcosf_exit_s
+
+ALIGN 16
+Lcos_remainder_piby2:
+    ; argument reduction for general x
+    call         __remainder_piby2_fma3
+Lcosf_exit_s:
+    bt           rax,0
+    jnc          Lcosf_piby4_compute
+
+;sinf_piby4_compute:
+;   vmovapd      xmm1,__Lsinfarray+010h
+    vmovsd       xmm1,__Lsinfarray+010h
+    vmulsd       xmm3,xmm0,xmm0
+    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
+    vfmadd213sd  xmm1,xmm3,__Lsinfarray+008h
+    vfmadd213sd  xmm1,xmm3,__Lsinfarray
+    vmulsd       xmm3,xmm0,xmm3                 ; xmm3 <-- x^3
+    vfmadd231sd  xmm0,xmm1,xmm3
+    jmp          Lcosf_fma3_adjust_sign
+
+ALIGN 16
+Lcosf_piby4_compute:
+    vmovapd      xmm2,L_real_one
+    vmulsd       xmm3,xmm0,xmm0
+    vmulsd       xmm1,xmm3,L_one_half           ; xmm1 <-- r
+    vsubsd       xmm2,xmm2,xmm1
+    vmovsd       xmm1,__Lcosfarray+018h
+    vfmadd231sd  xmm1 ,xmm3,__Lcosfarray+020h
+    vfmadd213sd  xmm1 ,xmm3,__Lcosfarray+010h
+    vfmadd213sd  xmm1 ,xmm3,__Lcosfarray+008h
+    vmulsd       xmm3,xmm3,xmm3                 ; xmm3 <-- x^4
+    vmovdqa      xmm0, xmm2
+    vfmadd231sd  xmm0 ,xmm1,xmm3
+
+Lcosf_fma3_adjust_sign:
+    ; assuming FMA3 ==> AVX ==> SSE4.1
+;    vpcmpeqq     xmm1,xmm4,XMMWORD PTR L_int_one
+;    vpcmpeqq     xmm2,xmm4,XMMWORD PTR L_int_two
+;    vorpd        xmm3,xmm2,xmm1
+
+;    vandpd       xmm3,xmm3,L_signbit
+
+    add          rax,1                    ; 1,2 --> 2,3
+    shr          rax,1                    ; 2,3 --> 1
+    shl          rax,63                   ; 1 --> sign bit
+    vmovq        xmm3,rax
+
+    vxorpd       xmm0,xmm0,xmm3
+
+Lcosf_fma3_return:
+    vcvtsd2ss    xmm0,xmm0,xmm0
+    StackDeallocate stack_size
+    ret
+
+Lcosf_fma3_naninf:
+    call         fname_special
+    StackDeallocate stack_size
+    ret
+
+fname  endp
+END