From 1edcc31339c95ebe8d75f9de193c58842f8fad35 Mon Sep 17 00:00:00 2001 From: Timo Kreuzer Date: Mon, 28 Jan 2013 23:44:29 +0000 Subject: [PATCH] [CRT] - Remove x64 asm stub for acos from cmake file, since we already have a generic C implementation - Implement sqrt for amd64 in SSE, both in C and asm. While the C version would be sufficient, it's currently less portable due to the lack of mm intrinsics for GCC - Silence a warning svn path=/trunk/; revision=58251 --- reactos/lib/sdk/crt/crt.cmake | 2 +- reactos/lib/sdk/crt/locale/locale.c | 5 +- reactos/lib/sdk/crt/math/amd64/asin.c | 73 +++++++++++++++++++++++++ reactos/lib/sdk/crt/math/amd64/sqrt.S | 47 +++++++++++++++- reactos/lib/sdk/crt/math/amd64/sqrt.c | 77 +++++++++++++++++++++++++++ 5 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 reactos/lib/sdk/crt/math/amd64/asin.c create mode 100644 reactos/lib/sdk/crt/math/amd64/sqrt.c diff --git a/reactos/lib/sdk/crt/crt.cmake b/reactos/lib/sdk/crt/crt.cmake index 90eac846264..eaf22f2c1c8 100644 --- a/reactos/lib/sdk/crt/crt.cmake +++ b/reactos/lib/sdk/crt/crt.cmake @@ -433,7 +433,7 @@ elseif(ARCH STREQUAL "amd64") float/amd64/getsetfpcw.S float/amd64/fpreset.S float/amd64/logb.S - math/amd64/acos.S + # math/amd64/acos.S math/amd64/acosf.S math/amd64/atan.S math/amd64/atan2.S diff --git a/reactos/lib/sdk/crt/locale/locale.c b/reactos/lib/sdk/crt/locale/locale.c index 557b99c24e0..d40608263f2 100644 --- a/reactos/lib/sdk/crt/locale/locale.c +++ b/reactos/lib/sdk/crt/locale/locale.c @@ -1402,6 +1402,7 @@ char* CDECL setlocale(int category, const char* locale) if(category == LC_ALL) return construct_lc_all(locinfo); + _Analysis_assume_(category <= 5); return locinfo->lc_category[category].locale; } @@ -1481,13 +1482,13 @@ MSVCRT__locale_t global_locale = NULL; void __init_global_locale() { unsigned i; - + LOCK_LOCALE; /* Someone created it before us */ if(global_locale) return; global_locale = MSVCRT__create_locale(0, "C"); - + __lc_codepage = MSVCRT_locale->locinfo->lc_codepage; MSVCRT___lc_collate_cp = MSVCRT_locale->locinfo->lc_collate_cp; __mb_cur_max = MSVCRT_locale->locinfo->mb_cur_max; diff --git a/reactos/lib/sdk/crt/math/amd64/asin.c b/reactos/lib/sdk/crt/math/amd64/asin.c new file mode 100644 index 00000000000..c38bbe4e0c2 --- /dev/null +++ b/reactos/lib/sdk/crt/math/amd64/asin.c @@ -0,0 +1,73 @@ +/* + * COPYRIGHT: See COPYING in the top level directory + * PROJECT: ReactOS CRT + * FILE: lib/crt/math/acos.c + * PURPOSE: Generic C implementation of arc sine + * PROGRAMMER: Timo Kreuzer (timo.kreuzer@reactos.org) + */ + +#define PRECISION 9 + +/* + * The arc sine can be approximated with the following row: + * + * asin(x) = a0*x + a1*x^3 + a2*x^5 + a3*x^7 + a4*x^9 + ... + * + * To reduce the number of multiplications the formula is transformed to + * + * asin(x) = x * (1 + x^2*(a1 + x^2*(a2 + x^2*(a3 + ...) ) ) ) + * + * The coefficients are: + * a0 = 1 + * a1 = (1/2*3) + * a2 = (3*1/4*2*5) + * a3 = (5*3*1/6*4*2*7) + * a4 = (7*5*3*1/8*6*4*2*9) + * a5 = (9*7*5*3*1/10*8*6*4*2*11) + * ... + */ + +double +asin(double x) +{ + double x2, result; + + /* Check range */ + if ((x > 1.) || (x < -1.)) return NaN; + + /* Calculate the square of x */ + x2 = (x * x); + + /* Start with 0, compiler will optimize this away */ + result = 0; + + result += (15*13*11*9*7*5*3*1./(16*14*12*10*8*6*4*2*17)); + result *= x2; + + result += (13*11*9*7*5*3*1./(14*12*10*8*6*4*2*15)); + result *= x2; + + result += (11*9*7*5*3*1./(12*10*8*6*4*2*13)); + result *= x2; + + result += (9*7*5*3*1./(10*8*6*4*2*11)); + result *= x2; + + result += (7*5*3*1./(8*6*4*2*9)); + result *= x2; + + result += (5*3*1./(6*4*2*7)); + result *= x2; + + result += (3*1./(4*2*5)); + result *= x2; + + result += (1./(2*3)); + result *= x2; + + result += 1.; + result *= x; + + return result; +} + diff --git a/reactos/lib/sdk/crt/math/amd64/sqrt.S b/reactos/lib/sdk/crt/math/amd64/sqrt.S index 4c234eb0739..5238feebc98 100644 --- a/reactos/lib/sdk/crt/math/amd64/sqrt.S +++ b/reactos/lib/sdk/crt/math/amd64/sqrt.S @@ -9,14 +9,57 @@ /* INCLUDES ******************************************************************/ #include -#include /* CODE **********************************************************************/ .code64 PUBLIC sqrt sqrt: - UNIMPLEMENTED sqrt + + /* Load the sign bit into rdx */ + mov rdx, HEX(8000000000000000) + + /* Move the lower 64 bits of xmm0 into rax */ + movd rax, xmm0 + + /* Test the sign bit */ + test rax, rdx + + /* If it is set, go to the failure path */ + jnz x_is_negative + + /* x is positive, now check if it is NaN by checking if the unsigned + integer value is larger than the highest valid positive value. */ + mov rcx, 7FF0000000000000h + cmp rax, rcx + ja short x_is_nan + + /* All is well, calculate the sqrt */ + sqrtpd xmm0, xmm0 ret +x_is_negative: + /* Load failure return value (-1.#IND00) into rcx */ + mov rcx, HEX(0FFF8000000000000) + + /* Check if the parameter was -0.0 */ + cmp rax, rdx + + /* If it was not, load the failure value, otherwise keep -0.0 */ + cmovne rax, rcx + + /* Move the value back into the return register */ + movd xmm0, rax + ret + +x_is_nan: + /* Create a 1.#QNAN0 by setting this bit */ + mov rcx, HEX(8000000000000) + or rax, rcx + + /* Move the value back into the return register */ + movd xmm0, rax + ret + + END diff --git a/reactos/lib/sdk/crt/math/amd64/sqrt.c b/reactos/lib/sdk/crt/math/amd64/sqrt.c new file mode 100644 index 00000000000..636b9c7832c --- /dev/null +++ b/reactos/lib/sdk/crt/math/amd64/sqrt.c @@ -0,0 +1,77 @@ + +#include + +double +sqrt ( + double x) +{ + register union + { + __m128d x128d; + __m128i x128i; + } u ; + register union + { + unsigned long long ullx; + double dbl; + } u2; + + /* Set the lower double-precision value of u to x. + All that we want, is that the compiler understands that we have the + function parameter in a register that we can address as an __m128. + Sadly there is no obvious way to do that. If we use the union, VS will + generate code to store xmm0 in memory and the read it into a GPR. + We avoid memory access by using a direct move. But even here we won't + get a simple MOVSD. We can either do: + a) _mm_set_sd: move x into the lower part of an xmm register and zero + out the upper part (XORPD+MOVSD) + b) _mm_set1_pd: move x into the lower and higher part of an xmm register + (MOVSD+UNPCKLPD) + c) _mm_set_pd, which either generates a memory access, when we try to + tell it to keep the upper 64 bits, or generate 2 MOVAPS + UNPCKLPD + We choose a, which is probably the fastest. + */ + u.x128d = _mm_set_sd(x); + + /* Move the contents of the lower 64 bit into a 64 bit GPR using MOVD */ + u2.ullx = _mm_cvtsi128_si64(u.x128i); + + /* Check for negative values */ + if (u2.ullx & 0x8000000000000000ULL) + { + /* Check if this is *really* negative and not just -0.0 */ + if (u2.ullx != 0x8000000000000000ULL) + { + /* Return -1.#IND00 */ + u2.ullx = 0xfff8000000000000ULL; + } + + /* Return what we have */ + return u2.dbl; + } + + /* Check if this is a NaN (bits 52-62 are 1, bit 0-61 are not all 0) or + negative (bit 63 is 1) */ + if (u2.ullx > 0x7FF0000000000000ULL) + { + /* Set this bit. That's what MS function does. */ + u2.ullx |= 0x8000000000000ULL; + return u2.dbl; + } + + /* Calculate the square root. */ +#ifdef _MSC_VER + /* Another YAY for the MS compiler. There are 2 instructions we could use: + SQRTPD (computes sqrt for 2 double values) or SQRTSD (computes sqrt for + only the lower 64 bit double value). Obviously we only need 1. And on + Some architectures SQRTPD is twice as slow as SQRTSD. On the other hand + the MS compiler is stupid and always generates an additional MOVAPS + instruction when SQRTSD is used. We choose to use SQRTPD here since on + modern hardware it's as fast as SQRTSD. */ + u.x128d = _mm_sqrt_pd(u.x128d); // SQRTPD +#else + u.x128d = _mm_sqrt_sd(u.x128d, u.x128d); // SQRTSD +#endif + + return u.x128d.m128d_f64[0]; +}