mirror of
https://github.com/reactos/reactos.git
synced 2024-07-10 22:55:05 +00:00
[CRT]
- Remove x64 asm stub for acos from cmake file, since we already have a generic C implementation - Implement sqrt for amd64 in SSE, both in C and asm. While the C version would be sufficient, it's currently less portable due to the lack of mm intrinsics for GCC - Silence a warning svn path=/trunk/; revision=58251
This commit is contained in:
parent
57f9b8f9ab
commit
1edcc31339
|
@ -433,7 +433,7 @@ elseif(ARCH STREQUAL "amd64")
|
|||
float/amd64/getsetfpcw.S
|
||||
float/amd64/fpreset.S
|
||||
float/amd64/logb.S
|
||||
math/amd64/acos.S
|
||||
# math/amd64/acos.S
|
||||
math/amd64/acosf.S
|
||||
math/amd64/atan.S
|
||||
math/amd64/atan2.S
|
||||
|
|
|
@ -1402,6 +1402,7 @@ char* CDECL setlocale(int category, const char* locale)
|
|||
if(category == LC_ALL)
|
||||
return construct_lc_all(locinfo);
|
||||
|
||||
_Analysis_assume_(category <= 5);
|
||||
return locinfo->lc_category[category].locale;
|
||||
}
|
||||
|
||||
|
@ -1481,13 +1482,13 @@ MSVCRT__locale_t global_locale = NULL;
|
|||
void __init_global_locale()
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
|
||||
LOCK_LOCALE;
|
||||
/* Someone created it before us */
|
||||
if(global_locale)
|
||||
return;
|
||||
global_locale = MSVCRT__create_locale(0, "C");
|
||||
|
||||
|
||||
__lc_codepage = MSVCRT_locale->locinfo->lc_codepage;
|
||||
MSVCRT___lc_collate_cp = MSVCRT_locale->locinfo->lc_collate_cp;
|
||||
__mb_cur_max = MSVCRT_locale->locinfo->mb_cur_max;
|
||||
|
|
73
reactos/lib/sdk/crt/math/amd64/asin.c
Normal file
73
reactos/lib/sdk/crt/math/amd64/asin.c
Normal file
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* COPYRIGHT: See COPYING in the top level directory
|
||||
* PROJECT: ReactOS CRT
|
||||
* FILE: lib/crt/math/acos.c
|
||||
* PURPOSE: Generic C implementation of arc sine
|
||||
* PROGRAMMER: Timo Kreuzer (timo.kreuzer@reactos.org)
|
||||
*/
|
||||
|
||||
#define PRECISION 9
|
||||
|
||||
/*
|
||||
* The arc sine can be approximated with the following row:
|
||||
*
|
||||
* asin(x) = a0*x + a1*x^3 + a2*x^5 + a3*x^7 + a4*x^9 + ...
|
||||
*
|
||||
* To reduce the number of multiplications the formula is transformed to
|
||||
*
|
||||
* asin(x) = x * (1 + x^2*(a1 + x^2*(a2 + x^2*(a3 + ...) ) ) )
|
||||
*
|
||||
* The coefficients are:
|
||||
* a0 = 1
|
||||
* a1 = (1/2*3)
|
||||
* a2 = (3*1/4*2*5)
|
||||
* a3 = (5*3*1/6*4*2*7)
|
||||
* a4 = (7*5*3*1/8*6*4*2*9)
|
||||
* a5 = (9*7*5*3*1/10*8*6*4*2*11)
|
||||
* ...
|
||||
*/
|
||||
|
||||
double
|
||||
asin(double x)
|
||||
{
|
||||
double x2, result;
|
||||
|
||||
/* Check range */
|
||||
if ((x > 1.) || (x < -1.)) return NaN;
|
||||
|
||||
/* Calculate the square of x */
|
||||
x2 = (x * x);
|
||||
|
||||
/* Start with 0, compiler will optimize this away */
|
||||
result = 0;
|
||||
|
||||
result += (15*13*11*9*7*5*3*1./(16*14*12*10*8*6*4*2*17));
|
||||
result *= x2;
|
||||
|
||||
result += (13*11*9*7*5*3*1./(14*12*10*8*6*4*2*15));
|
||||
result *= x2;
|
||||
|
||||
result += (11*9*7*5*3*1./(12*10*8*6*4*2*13));
|
||||
result *= x2;
|
||||
|
||||
result += (9*7*5*3*1./(10*8*6*4*2*11));
|
||||
result *= x2;
|
||||
|
||||
result += (7*5*3*1./(8*6*4*2*9));
|
||||
result *= x2;
|
||||
|
||||
result += (5*3*1./(6*4*2*7));
|
||||
result *= x2;
|
||||
|
||||
result += (3*1./(4*2*5));
|
||||
result *= x2;
|
||||
|
||||
result += (1./(2*3));
|
||||
result *= x2;
|
||||
|
||||
result += 1.;
|
||||
result *= x;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
@ -9,14 +9,57 @@
|
|||
/* INCLUDES ******************************************************************/
|
||||
|
||||
#include <asm.inc>
|
||||
#include <ksamd64.inc>
|
||||
|
||||
/* CODE **********************************************************************/
|
||||
.code64
|
||||
|
||||
PUBLIC sqrt
|
||||
sqrt:
|
||||
UNIMPLEMENTED sqrt
|
||||
|
||||
/* Load the sign bit into rdx */
|
||||
mov rdx, HEX(8000000000000000)
|
||||
|
||||
/* Move the lower 64 bits of xmm0 into rax */
|
||||
movd rax, xmm0
|
||||
|
||||
/* Test the sign bit */
|
||||
test rax, rdx
|
||||
|
||||
/* If it is set, go to the failure path */
|
||||
jnz x_is_negative
|
||||
|
||||
/* x is positive, now check if it is NaN by checking if the unsigned
|
||||
integer value is larger than the highest valid positive value. */
|
||||
mov rcx, 7FF0000000000000h
|
||||
cmp rax, rcx
|
||||
ja short x_is_nan
|
||||
|
||||
/* All is well, calculate the sqrt */
|
||||
sqrtpd xmm0, xmm0
|
||||
ret
|
||||
|
||||
x_is_negative:
|
||||
/* Load failure return value (-1.#IND00) into rcx */
|
||||
mov rcx, HEX(0FFF8000000000000)
|
||||
|
||||
/* Check if the parameter was -0.0 */
|
||||
cmp rax, rdx
|
||||
|
||||
/* If it was not, load the failure value, otherwise keep -0.0 */
|
||||
cmovne rax, rcx
|
||||
|
||||
/* Move the value back into the return register */
|
||||
movd xmm0, rax
|
||||
ret
|
||||
|
||||
x_is_nan:
|
||||
/* Create a 1.#QNAN0 by setting this bit */
|
||||
mov rcx, HEX(8000000000000)
|
||||
or rax, rcx
|
||||
|
||||
/* Move the value back into the return register */
|
||||
movd xmm0, rax
|
||||
ret
|
||||
|
||||
|
||||
END
|
||||
|
|
77
reactos/lib/sdk/crt/math/amd64/sqrt.c
Normal file
77
reactos/lib/sdk/crt/math/amd64/sqrt.c
Normal file
|
@ -0,0 +1,77 @@
|
|||
|
||||
#include <intrin.h>
|
||||
|
||||
double
|
||||
sqrt (
|
||||
double x)
|
||||
{
|
||||
register union
|
||||
{
|
||||
__m128d x128d;
|
||||
__m128i x128i;
|
||||
} u ;
|
||||
register union
|
||||
{
|
||||
unsigned long long ullx;
|
||||
double dbl;
|
||||
} u2;
|
||||
|
||||
/* Set the lower double-precision value of u to x.
|
||||
All that we want, is that the compiler understands that we have the
|
||||
function parameter in a register that we can address as an __m128.
|
||||
Sadly there is no obvious way to do that. If we use the union, VS will
|
||||
generate code to store xmm0 in memory and the read it into a GPR.
|
||||
We avoid memory access by using a direct move. But even here we won't
|
||||
get a simple MOVSD. We can either do:
|
||||
a) _mm_set_sd: move x into the lower part of an xmm register and zero
|
||||
out the upper part (XORPD+MOVSD)
|
||||
b) _mm_set1_pd: move x into the lower and higher part of an xmm register
|
||||
(MOVSD+UNPCKLPD)
|
||||
c) _mm_set_pd, which either generates a memory access, when we try to
|
||||
tell it to keep the upper 64 bits, or generate 2 MOVAPS + UNPCKLPD
|
||||
We choose a, which is probably the fastest.
|
||||
*/
|
||||
u.x128d = _mm_set_sd(x);
|
||||
|
||||
/* Move the contents of the lower 64 bit into a 64 bit GPR using MOVD */
|
||||
u2.ullx = _mm_cvtsi128_si64(u.x128i);
|
||||
|
||||
/* Check for negative values */
|
||||
if (u2.ullx & 0x8000000000000000ULL)
|
||||
{
|
||||
/* Check if this is *really* negative and not just -0.0 */
|
||||
if (u2.ullx != 0x8000000000000000ULL)
|
||||
{
|
||||
/* Return -1.#IND00 */
|
||||
u2.ullx = 0xfff8000000000000ULL;
|
||||
}
|
||||
|
||||
/* Return what we have */
|
||||
return u2.dbl;
|
||||
}
|
||||
|
||||
/* Check if this is a NaN (bits 52-62 are 1, bit 0-61 are not all 0) or
|
||||
negative (bit 63 is 1) */
|
||||
if (u2.ullx > 0x7FF0000000000000ULL)
|
||||
{
|
||||
/* Set this bit. That's what MS function does. */
|
||||
u2.ullx |= 0x8000000000000ULL;
|
||||
return u2.dbl;
|
||||
}
|
||||
|
||||
/* Calculate the square root. */
|
||||
#ifdef _MSC_VER
|
||||
/* Another YAY for the MS compiler. There are 2 instructions we could use:
|
||||
SQRTPD (computes sqrt for 2 double values) or SQRTSD (computes sqrt for
|
||||
only the lower 64 bit double value). Obviously we only need 1. And on
|
||||
Some architectures SQRTPD is twice as slow as SQRTSD. On the other hand
|
||||
the MS compiler is stupid and always generates an additional MOVAPS
|
||||
instruction when SQRTSD is used. We choose to use SQRTPD here since on
|
||||
modern hardware it's as fast as SQRTSD. */
|
||||
u.x128d = _mm_sqrt_pd(u.x128d); // SQRTPD
|
||||
#else
|
||||
u.x128d = _mm_sqrt_sd(u.x128d, u.x128d); // SQRTSD
|
||||
#endif
|
||||
|
||||
return u.x128d.m128d_f64[0];
|
||||
}
|
Loading…
Reference in a new issue