[CRT]

- Remove x64 asm stub for acos from cmake file, since we already have a generic C implementation - Implement sqrt for amd64 in SSE, both in C and asm. While the C version would be sufficient, it's currently less portable due to the lack of mm intrinsics for GCC - Silence a warning svn path=/trunk/; revision=58251
2024-07-30 08:08:56 +00:00 · 2013-01-28 23:44:29 +00:00 · 2013-01-28 23:44:29 +00:00 · 1edcc31339
parent 57f9b8f9ab
commit 1edcc31339
5 changed files with 199 additions and 5 deletions
--- a/reactos/lib/sdk/crt/crt.cmake
+++ b/reactos/lib/sdk/crt/crt.cmake
@ -433,7 +433,7 @@ elseif(ARCH STREQUAL "amd64")
        float/amd64/getsetfpcw.S
        float/amd64/fpreset.S
        float/amd64/logb.S
-        math/amd64/acos.S
+        # math/amd64/acos.S
        math/amd64/acosf.S
        math/amd64/atan.S
        math/amd64/atan2.S
--- a/reactos/lib/sdk/crt/locale/locale.c
+++ b/reactos/lib/sdk/crt/locale/locale.c
@ -1402,6 +1402,7 @@ char* CDECL setlocale(int category, const char* locale)
    if(category == LC_ALL)
        return construct_lc_all(locinfo);
    _Analysis_assume_(category <= 5);
    return locinfo->lc_category[category].locale;
 }
@ -1481,13 +1482,13 @@ MSVCRT__locale_t global_locale = NULL;
 void __init_global_locale()
 {
    unsigned i;
-    
+
    LOCK_LOCALE;
    /* Someone created it before us */
    if(global_locale)
        return;
    global_locale = MSVCRT__create_locale(0, "C");
-    
+
    __lc_codepage = MSVCRT_locale->locinfo->lc_codepage;
    MSVCRT___lc_collate_cp = MSVCRT_locale->locinfo->lc_collate_cp;
    __mb_cur_max = MSVCRT_locale->locinfo->mb_cur_max;
--- a/reactos/lib/sdk/crt/math/amd64/asin.c
+++ b/reactos/lib/sdk/crt/math/amd64/asin.c
@ -0,0 +1,73 @@
 /*
 * COPYRIGHT:        See COPYING in the top level directory
 * PROJECT:          ReactOS CRT
 * FILE:             lib/crt/math/acos.c
 * PURPOSE:          Generic C implementation of arc sine
 * PROGRAMMER:       Timo Kreuzer (timo.kreuzer@reactos.org)
 */
 #define PRECISION 9
 /*
 * The arc sine can be approximated with the following row:
 *
 *   asin(x) = a0*x + a1*x^3 + a2*x^5 + a3*x^7 + a4*x^9 + ...
 *
 * To reduce the number of multiplications the formula is transformed to
 *
 *   asin(x) = x * (1 + x^2*(a1 + x^2*(a2 + x^2*(a3 + ...) ) ) )
 *
 * The coefficients are:
 *   a0 = 1
 *   a1 = (1/2*3)
 *   a2 = (3*1/4*2*5)
 *   a3 = (5*3*1/6*4*2*7)
 *   a4 = (7*5*3*1/8*6*4*2*9)
 *   a5 = (9*7*5*3*1/10*8*6*4*2*11)
 *   ...
 */
 double
 asin(double x)
 {
    double x2, result;
    /* Check range */
    if ((x > 1.) || (x < -1.)) return NaN;
    /* Calculate the square of x */
    x2 = (x * x);
    /* Start with 0, compiler will optimize this away */
    result = 0;
    result += (15*13*11*9*7*5*3*1./(16*14*12*10*8*6*4*2*17));
    result *= x2;
    result += (13*11*9*7*5*3*1./(14*12*10*8*6*4*2*15));
    result *= x2;
    result += (11*9*7*5*3*1./(12*10*8*6*4*2*13));
    result *= x2;
    result += (9*7*5*3*1./(10*8*6*4*2*11));
    result *= x2;
    result += (7*5*3*1./(8*6*4*2*9));
    result *= x2;
    result += (5*3*1./(6*4*2*7));
    result *= x2;
    result += (3*1./(4*2*5));
    result *= x2;
    result += (1./(2*3));
    result *= x2;
    result += 1.;
    result *= x;
    return result;
 }
--- a/reactos/lib/sdk/crt/math/amd64/sqrt.S
+++ b/reactos/lib/sdk/crt/math/amd64/sqrt.S
@ -9,14 +9,57 @@
 /* INCLUDES ******************************************************************/
 #include <asm.inc>
 #include <ksamd64.inc>
 /* CODE **********************************************************************/
 .code64
 PUBLIC sqrt
 sqrt:
-    UNIMPLEMENTED sqrt
+
    /* Load the sign bit into rdx */
    mov rdx, HEX(8000000000000000)
    /* Move the lower 64 bits of xmm0 into rax */
    movd rax, xmm0
    /* Test the sign bit */
    test rax, rdx
    /* If it is set, go to the failure path */
    jnz x_is_negative
    /* x is positive, now check if it is NaN by checking if the unsigned
       integer value is larger than the highest valid positive value. */
    mov rcx, 7FF0000000000000h
    cmp rax, rcx
    ja short x_is_nan
    /* All is well, calculate the sqrt */
    sqrtpd xmm0, xmm0
    ret
 x_is_negative:
    /* Load failure return value (-1.#IND00) into rcx */
    mov rcx, HEX(0FFF8000000000000)
    /* Check if the parameter was -0.0 */
    cmp rax, rdx
    /* If it was not, load the failure value, otherwise keep -0.0  */
    cmovne  rax, rcx
    /* Move the value back into the return register */
    movd xmm0, rax
    ret
 x_is_nan:
    /* Create a 1.#QNAN0 by setting this bit */
    mov rcx, HEX(8000000000000)
    or rax, rcx
    /* Move the value back into the return register */
    movd xmm0, rax
    ret
 END
--- a/reactos/lib/sdk/crt/math/amd64/sqrt.c
+++ b/reactos/lib/sdk/crt/math/amd64/sqrt.c
@ -0,0 +1,77 @@
 #include <intrin.h>
 double
 sqrt (
    double x)
 {
    register union
    {
        __m128d x128d;
        __m128i x128i;
    } u ;
    register union
    {
        unsigned long long ullx;
        double dbl;
    } u2;
    /* Set the lower double-precision value of u to x.
       All that we want, is that the compiler understands that we have the
       function parameter in a register that we can address as an __m128.
       Sadly there is no obvious way to do that. If we use the union, VS will
       generate code to store xmm0 in memory and the read it into a GPR.
       We avoid memory access by using a direct move. But even here we won't
       get a simple MOVSD. We can either do:
       a) _mm_set_sd: move x into the lower part of an xmm register and zero
          out the upper part (XORPD+MOVSD)
       b) _mm_set1_pd: move x into the lower and higher part of an xmm register
         (MOVSD+UNPCKLPD)
       c) _mm_set_pd, which either generates a memory access, when we try to
          tell it to keep the upper 64 bits, or generate 2 MOVAPS + UNPCKLPD
       We choose a, which is probably the fastest.
    */
    u.x128d = _mm_set_sd(x);
    /* Move the contents of the lower 64 bit into a 64 bit GPR using MOVD */
    u2.ullx = _mm_cvtsi128_si64(u.x128i);
    /* Check for negative values */
    if (u2.ullx & 0x8000000000000000ULL)
    {
        /* Check if this is *really* negative and not just -0.0 */
        if (u2.ullx != 0x8000000000000000ULL)
        {
            /* Return -1.#IND00 */
            u2.ullx = 0xfff8000000000000ULL;
        }
        /* Return what we have */
        return u2.dbl;
    }
    /* Check if this is a NaN (bits 52-62 are 1, bit 0-61 are not all 0) or
       negative (bit 63 is 1) */
    if (u2.ullx > 0x7FF0000000000000ULL)
    {
        /* Set this bit. That's what MS function does. */
        u2.ullx |= 0x8000000000000ULL;
        return u2.dbl;
    }
    /* Calculate the square root. */
 #ifdef _MSC_VER
    /* Another YAY for the MS compiler. There are 2 instructions we could use:
       SQRTPD (computes sqrt for 2 double values) or SQRTSD (computes sqrt for
       only the lower 64 bit double value). Obviously we only need 1. And on
       Some architectures SQRTPD is twice as slow as SQRTSD. On the other hand
       the MS compiler is stupid and always generates an additional MOVAPS
       instruction when SQRTSD is used. We choose to use SQRTPD here since on
       modern hardware it's as fast as SQRTSD. */
    u.x128d = _mm_sqrt_pd(u.x128d); // SQRTPD
 #else
    u.x128d = _mm_sqrt_sd(u.x128d, u.x128d); // SQRTSD
 #endif
    return u.x128d.m128d_f64[0];
 }