[CRT]

- Remove x64 asm stub for acos from cmake file, since we already have a generic C implementation - Implement sqrt for amd64 in SSE, both in C and asm. While the C version would be sufficient, it's currently less portable due to the lack of mm intrinsics for GCC - Silence a warning svn path=/trunk/; revision=58251
2024-07-10 22:55:05 +00:00 · 2013-01-28 23:44:29 +00:00 · 2013-01-28 23:44:29 +00:00 · 1edcc31339
parent 57f9b8f9ab
commit 1edcc31339
5 changed files with 199 additions and 5 deletions
--- a/reactos/lib/sdk/crt/crt.cmake
+++ b/reactos/lib/sdk/crt/crt.cmake
@ -433,7 +433,7 @@ elseif(ARCH STREQUAL "amd64")
        float/amd64/getsetfpcw.S
        float/amd64/fpreset.S
        float/amd64/logb.S
-        math/amd64/acos.S
+        # math/amd64/acos.S
        math/amd64/acosf.S
        math/amd64/atan.S
        math/amd64/atan2.S
--- a/reactos/lib/sdk/crt/locale/locale.c
+++ b/reactos/lib/sdk/crt/locale/locale.c
@ -1402,6 +1402,7 @@ char* CDECL setlocale(int category, const char* locale)
    if(category == LC_ALL)
        return construct_lc_all(locinfo);

+    _Analysis_assume_(category <= 5);
    return locinfo->lc_category[category].locale;
 }

@ -1481,13 +1482,13 @@ MSVCRT__locale_t global_locale = NULL;
 void __init_global_locale()
 {
    unsigned i;
-    
+
    LOCK_LOCALE;
    /* Someone created it before us */
    if(global_locale)
        return;
    global_locale = MSVCRT__create_locale(0, "C");
-    
+
    __lc_codepage = MSVCRT_locale->locinfo->lc_codepage;
    MSVCRT___lc_collate_cp = MSVCRT_locale->locinfo->lc_collate_cp;
    __mb_cur_max = MSVCRT_locale->locinfo->mb_cur_max;
--- a/reactos/lib/sdk/crt/math/amd64/asin.c
+++ b/reactos/lib/sdk/crt/math/amd64/asin.c
@ -0,0 +1,73 @@
+/*
+ * COPYRIGHT:        See COPYING in the top level directory
+ * PROJECT:          ReactOS CRT
+ * FILE:             lib/crt/math/acos.c
+ * PURPOSE:          Generic C implementation of arc sine
+ * PROGRAMMER:       Timo Kreuzer (timo.kreuzer@reactos.org)
+ */
+
+#define PRECISION 9
+
+/*
+ * The arc sine can be approximated with the following row:
+ *
+ *   asin(x) = a0*x + a1*x^3 + a2*x^5 + a3*x^7 + a4*x^9 + ...
+ *
+ * To reduce the number of multiplications the formula is transformed to
+ *
+ *   asin(x) = x * (1 + x^2*(a1 + x^2*(a2 + x^2*(a3 + ...) ) ) )
+ *
+ * The coefficients are:
+ *   a0 = 1
+ *   a1 = (1/2*3)
+ *   a2 = (3*1/4*2*5)
+ *   a3 = (5*3*1/6*4*2*7)
+ *   a4 = (7*5*3*1/8*6*4*2*9)
+ *   a5 = (9*7*5*3*1/10*8*6*4*2*11)
+ *   ...
+ */
+
+double
+asin(double x)
+{
+    double x2, result;
+
+    /* Check range */
+    if ((x > 1.) || (x < -1.)) return NaN;
+
+    /* Calculate the square of x */
+    x2 = (x * x);
+
+    /* Start with 0, compiler will optimize this away */
+    result = 0;
+
+    result += (15*13*11*9*7*5*3*1./(16*14*12*10*8*6*4*2*17));
+    result *= x2;
+
+    result += (13*11*9*7*5*3*1./(14*12*10*8*6*4*2*15));
+    result *= x2;
+
+    result += (11*9*7*5*3*1./(12*10*8*6*4*2*13));
+    result *= x2;
+
+    result += (9*7*5*3*1./(10*8*6*4*2*11));
+    result *= x2;
+
+    result += (7*5*3*1./(8*6*4*2*9));
+    result *= x2;
+
+    result += (5*3*1./(6*4*2*7));
+    result *= x2;
+
+    result += (3*1./(4*2*5));
+    result *= x2;
+
+    result += (1./(2*3));
+    result *= x2;
+
+    result += 1.;
+    result *= x;
+
+    return result;
+}
+
--- a/reactos/lib/sdk/crt/math/amd64/sqrt.S
+++ b/reactos/lib/sdk/crt/math/amd64/sqrt.S
@ -9,14 +9,57 @@
 /* INCLUDES ******************************************************************/

 #include <asm.inc>
-#include <ksamd64.inc>

 /* CODE **********************************************************************/
 .code64

 PUBLIC sqrt
 sqrt:
-    UNIMPLEMENTED sqrt
+
+    /* Load the sign bit into rdx */
+    mov rdx, HEX(8000000000000000)
+
+    /* Move the lower 64 bits of xmm0 into rax */
+    movd rax, xmm0
+
+    /* Test the sign bit */
+    test rax, rdx
+
+    /* If it is set, go to the failure path */
+    jnz x_is_negative
+
+    /* x is positive, now check if it is NaN by checking if the unsigned
+       integer value is larger than the highest valid positive value. */
+    mov rcx, 7FF0000000000000h
+    cmp rax, rcx
+    ja short x_is_nan
+
+    /* All is well, calculate the sqrt */
+    sqrtpd xmm0, xmm0
    ret

+x_is_negative:
+    /* Load failure return value (-1.#IND00) into rcx */
+    mov rcx, HEX(0FFF8000000000000)
+
+    /* Check if the parameter was -0.0 */
+    cmp rax, rdx
+
+    /* If it was not, load the failure value, otherwise keep -0.0  */
+    cmovne  rax, rcx
+
+    /* Move the value back into the return register */
+    movd xmm0, rax
+    ret
+
+x_is_nan:
+    /* Create a 1.#QNAN0 by setting this bit */
+    mov rcx, HEX(8000000000000)
+    or rax, rcx
+
+    /* Move the value back into the return register */
+    movd xmm0, rax
+    ret
+
+
 END
--- a/reactos/lib/sdk/crt/math/amd64/sqrt.c
+++ b/reactos/lib/sdk/crt/math/amd64/sqrt.c
@ -0,0 +1,77 @@
+
+#include <intrin.h>
+
+double
+sqrt (
+    double x)
+{
+    register union
+    {
+        __m128d x128d;
+        __m128i x128i;
+    } u ;
+    register union
+    {
+        unsigned long long ullx;
+        double dbl;
+    } u2;
+
+    /* Set the lower double-precision value of u to x.
+       All that we want, is that the compiler understands that we have the
+       function parameter in a register that we can address as an __m128.
+       Sadly there is no obvious way to do that. If we use the union, VS will
+       generate code to store xmm0 in memory and the read it into a GPR.
+       We avoid memory access by using a direct move. But even here we won't
+       get a simple MOVSD. We can either do:
+       a) _mm_set_sd: move x into the lower part of an xmm register and zero
+          out the upper part (XORPD+MOVSD)
+       b) _mm_set1_pd: move x into the lower and higher part of an xmm register
+         (MOVSD+UNPCKLPD)
+       c) _mm_set_pd, which either generates a memory access, when we try to
+          tell it to keep the upper 64 bits, or generate 2 MOVAPS + UNPCKLPD
+       We choose a, which is probably the fastest.
+    */
+    u.x128d = _mm_set_sd(x);
+
+    /* Move the contents of the lower 64 bit into a 64 bit GPR using MOVD */
+    u2.ullx = _mm_cvtsi128_si64(u.x128i);
+
+    /* Check for negative values */
+    if (u2.ullx & 0x8000000000000000ULL)
+    {
+        /* Check if this is *really* negative and not just -0.0 */
+        if (u2.ullx != 0x8000000000000000ULL)
+        {
+            /* Return -1.#IND00 */
+            u2.ullx = 0xfff8000000000000ULL;
+        }
+
+        /* Return what we have */
+        return u2.dbl;
+    }
+
+    /* Check if this is a NaN (bits 52-62 are 1, bit 0-61 are not all 0) or
+       negative (bit 63 is 1) */
+    if (u2.ullx > 0x7FF0000000000000ULL)
+    {
+        /* Set this bit. That's what MS function does. */
+        u2.ullx |= 0x8000000000000ULL;
+        return u2.dbl;
+    }
+
+    /* Calculate the square root. */
+#ifdef _MSC_VER
+    /* Another YAY for the MS compiler. There are 2 instructions we could use:
+       SQRTPD (computes sqrt for 2 double values) or SQRTSD (computes sqrt for
+       only the lower 64 bit double value). Obviously we only need 1. And on
+       Some architectures SQRTPD is twice as slow as SQRTSD. On the other hand
+       the MS compiler is stupid and always generates an additional MOVAPS
+       instruction when SQRTSD is used. We choose to use SQRTPD here since on
+       modern hardware it's as fast as SQRTSD. */
+    u.x128d = _mm_sqrt_pd(u.x128d); // SQRTPD
+#else
+    u.x128d = _mm_sqrt_sd(u.x128d, u.x128d); // SQRTSD
+#endif
+
+    return u.x128d.m128d_f64[0];
+}