From 1edcc31339c95ebe8d75f9de193c58842f8fad35 Mon Sep 17 00:00:00 2001
From: Timo Kreuzer <timo.kreuzer@reactos.org>
Date: Mon, 28 Jan 2013 23:44:29 +0000
Subject: [PATCH] [CRT] - Remove x64 asm stub for acos from cmake file, since
 we already have a generic C implementation - Implement sqrt for amd64 in SSE,
 both in C and asm. While the C version would be sufficient, it's currently
 less portable due to the lack of mm intrinsics for GCC - Silence a warning

svn path=/trunk/; revision=58251
---
 reactos/lib/sdk/crt/crt.cmake         |  2 +-
 reactos/lib/sdk/crt/locale/locale.c   |  5 +-
 reactos/lib/sdk/crt/math/amd64/asin.c | 73 +++++++++++++++++++++++++
 reactos/lib/sdk/crt/math/amd64/sqrt.S | 47 +++++++++++++++-
 reactos/lib/sdk/crt/math/amd64/sqrt.c | 77 +++++++++++++++++++++++++++
 5 files changed, 199 insertions(+), 5 deletions(-)
 create mode 100644 reactos/lib/sdk/crt/math/amd64/asin.c
 create mode 100644 reactos/lib/sdk/crt/math/amd64/sqrt.c

diff --git a/reactos/lib/sdk/crt/crt.cmake b/reactos/lib/sdk/crt/crt.cmake
index 90eac846264..eaf22f2c1c8 100644
--- a/reactos/lib/sdk/crt/crt.cmake
+++ b/reactos/lib/sdk/crt/crt.cmake
@@ -433,7 +433,7 @@ elseif(ARCH STREQUAL "amd64")
         float/amd64/getsetfpcw.S
         float/amd64/fpreset.S
         float/amd64/logb.S
-        math/amd64/acos.S
+        # math/amd64/acos.S
         math/amd64/acosf.S
         math/amd64/atan.S
         math/amd64/atan2.S
diff --git a/reactos/lib/sdk/crt/locale/locale.c b/reactos/lib/sdk/crt/locale/locale.c
index 557b99c24e0..d40608263f2 100644
--- a/reactos/lib/sdk/crt/locale/locale.c
+++ b/reactos/lib/sdk/crt/locale/locale.c
@@ -1402,6 +1402,7 @@ char* CDECL setlocale(int category, const char* locale)
     if(category == LC_ALL)
         return construct_lc_all(locinfo);
 
+    _Analysis_assume_(category <= 5);
     return locinfo->lc_category[category].locale;
 }
 
@@ -1481,13 +1482,13 @@ MSVCRT__locale_t global_locale = NULL;
 void __init_global_locale()
 {
     unsigned i;
-    
+
     LOCK_LOCALE;
     /* Someone created it before us */
     if(global_locale)
         return;
     global_locale = MSVCRT__create_locale(0, "C");
-    
+
     __lc_codepage = MSVCRT_locale->locinfo->lc_codepage;
     MSVCRT___lc_collate_cp = MSVCRT_locale->locinfo->lc_collate_cp;
     __mb_cur_max = MSVCRT_locale->locinfo->mb_cur_max;
diff --git a/reactos/lib/sdk/crt/math/amd64/asin.c b/reactos/lib/sdk/crt/math/amd64/asin.c
new file mode 100644
index 00000000000..c38bbe4e0c2
--- /dev/null
+++ b/reactos/lib/sdk/crt/math/amd64/asin.c
@@ -0,0 +1,73 @@
+/*
+ * COPYRIGHT:        See COPYING in the top level directory
+ * PROJECT:          ReactOS CRT
+ * FILE:             lib/crt/math/acos.c
+ * PURPOSE:          Generic C implementation of arc sine
+ * PROGRAMMER:       Timo Kreuzer (timo.kreuzer@reactos.org)
+ */
+
+#define PRECISION 9
+
+/*
+ * The arc sine can be approximated with the following row:
+ *
+ *   asin(x) = a0*x + a1*x^3 + a2*x^5 + a3*x^7 + a4*x^9 + ...
+ *
+ * To reduce the number of multiplications the formula is transformed to
+ *
+ *   asin(x) = x * (1 + x^2*(a1 + x^2*(a2 + x^2*(a3 + ...) ) ) )
+ *
+ * The coefficients are:
+ *   a0 = 1
+ *   a1 = (1/2*3)
+ *   a2 = (3*1/4*2*5)
+ *   a3 = (5*3*1/6*4*2*7)
+ *   a4 = (7*5*3*1/8*6*4*2*9)
+ *   a5 = (9*7*5*3*1/10*8*6*4*2*11)
+ *   ...
+ */
+
+double
+asin(double x)
+{
+    double x2, result;
+
+    /* Check range */
+    if ((x > 1.) || (x < -1.)) return NaN;
+
+    /* Calculate the square of x */
+    x2 = (x * x);
+
+    /* Start with 0, compiler will optimize this away */
+    result = 0;
+
+    result += (15*13*11*9*7*5*3*1./(16*14*12*10*8*6*4*2*17));
+    result *= x2;
+
+    result += (13*11*9*7*5*3*1./(14*12*10*8*6*4*2*15));
+    result *= x2;
+
+    result += (11*9*7*5*3*1./(12*10*8*6*4*2*13));
+    result *= x2;
+
+    result += (9*7*5*3*1./(10*8*6*4*2*11));
+    result *= x2;
+
+    result += (7*5*3*1./(8*6*4*2*9));
+    result *= x2;
+
+    result += (5*3*1./(6*4*2*7));
+    result *= x2;
+
+    result += (3*1./(4*2*5));
+    result *= x2;
+
+    result += (1./(2*3));
+    result *= x2;
+
+    result += 1.;
+    result *= x;
+
+    return result;
+}
+
diff --git a/reactos/lib/sdk/crt/math/amd64/sqrt.S b/reactos/lib/sdk/crt/math/amd64/sqrt.S
index 4c234eb0739..5238feebc98 100644
--- a/reactos/lib/sdk/crt/math/amd64/sqrt.S
+++ b/reactos/lib/sdk/crt/math/amd64/sqrt.S
@@ -9,14 +9,57 @@
 /* INCLUDES ******************************************************************/
 
 #include <asm.inc>
-#include <ksamd64.inc>
 
 /* CODE **********************************************************************/
 .code64
 
 PUBLIC sqrt
 sqrt:
-    UNIMPLEMENTED sqrt
+
+    /* Load the sign bit into rdx */
+    mov rdx, HEX(8000000000000000)
+
+    /* Move the lower 64 bits of xmm0 into rax */
+    movd rax, xmm0
+
+    /* Test the sign bit */
+    test rax, rdx
+
+    /* If it is set, go to the failure path */
+    jnz x_is_negative
+
+    /* x is positive, now check if it is NaN by checking if the unsigned
+       integer value is larger than the highest valid positive value. */
+    mov rcx, 7FF0000000000000h
+    cmp rax, rcx
+    ja short x_is_nan
+
+    /* All is well, calculate the sqrt */
+    sqrtpd xmm0, xmm0
     ret
 
+x_is_negative:
+    /* Load failure return value (-1.#IND00) into rcx */
+    mov rcx, HEX(0FFF8000000000000)
+
+    /* Check if the parameter was -0.0 */
+    cmp rax, rdx
+
+    /* If it was not, load the failure value, otherwise keep -0.0  */
+    cmovne  rax, rcx
+
+    /* Move the value back into the return register */
+    movd xmm0, rax
+    ret
+
+x_is_nan:
+    /* Create a 1.#QNAN0 by setting this bit */
+    mov rcx, HEX(8000000000000)
+    or rax, rcx
+
+    /* Move the value back into the return register */
+    movd xmm0, rax
+    ret
+
+
 END
diff --git a/reactos/lib/sdk/crt/math/amd64/sqrt.c b/reactos/lib/sdk/crt/math/amd64/sqrt.c
new file mode 100644
index 00000000000..636b9c7832c
--- /dev/null
+++ b/reactos/lib/sdk/crt/math/amd64/sqrt.c
@@ -0,0 +1,77 @@
+
+#include <intrin.h>
+
+double
+sqrt (
+    double x)
+{
+    register union
+    {
+        __m128d x128d;
+        __m128i x128i;
+    } u ;
+    register union
+    {
+        unsigned long long ullx;
+        double dbl;
+    } u2;
+
+    /* Set the lower double-precision value of u to x.
+       All that we want, is that the compiler understands that we have the
+       function parameter in a register that we can address as an __m128.
+       Sadly there is no obvious way to do that. If we use the union, VS will
+       generate code to store xmm0 in memory and the read it into a GPR.
+       We avoid memory access by using a direct move. But even here we won't
+       get a simple MOVSD. We can either do:
+       a) _mm_set_sd: move x into the lower part of an xmm register and zero
+          out the upper part (XORPD+MOVSD)
+       b) _mm_set1_pd: move x into the lower and higher part of an xmm register
+         (MOVSD+UNPCKLPD)
+       c) _mm_set_pd, which either generates a memory access, when we try to
+          tell it to keep the upper 64 bits, or generate 2 MOVAPS + UNPCKLPD
+       We choose a, which is probably the fastest.
+    */
+    u.x128d = _mm_set_sd(x);
+
+    /* Move the contents of the lower 64 bit into a 64 bit GPR using MOVD */
+    u2.ullx = _mm_cvtsi128_si64(u.x128i);
+
+    /* Check for negative values */
+    if (u2.ullx & 0x8000000000000000ULL)
+    {
+        /* Check if this is *really* negative and not just -0.0 */
+        if (u2.ullx != 0x8000000000000000ULL)
+        {
+            /* Return -1.#IND00 */
+            u2.ullx = 0xfff8000000000000ULL;
+        }
+
+        /* Return what we have */
+        return u2.dbl;
+    }
+
+    /* Check if this is a NaN (bits 52-62 are 1, bit 0-61 are not all 0) or
+       negative (bit 63 is 1) */
+    if (u2.ullx > 0x7FF0000000000000ULL)
+    {
+        /* Set this bit. That's what MS function does. */
+        u2.ullx |= 0x8000000000000ULL;
+        return u2.dbl;
+    }
+
+    /* Calculate the square root. */
+#ifdef _MSC_VER
+    /* Another YAY for the MS compiler. There are 2 instructions we could use:
+       SQRTPD (computes sqrt for 2 double values) or SQRTSD (computes sqrt for
+       only the lower 64 bit double value). Obviously we only need 1. And on
+       Some architectures SQRTPD is twice as slow as SQRTSD. On the other hand
+       the MS compiler is stupid and always generates an additional MOVAPS
+       instruction when SQRTSD is used. We choose to use SQRTPD here since on
+       modern hardware it's as fast as SQRTSD. */
+    u.x128d = _mm_sqrt_pd(u.x128d); // SQRTPD
+#else
+    u.x128d = _mm_sqrt_sd(u.x128d, u.x128d); // SQRTSD
+#endif
+
+    return u.x128d.m128d_f64[0];
+}