mirror of
https://github.com/reactos/reactos.git
synced 2024-05-16 02:01:59 +00:00
[LIBM] Import win-libm from AMD
Source: https://github.com/amd/win-libm
This commit is contained in:
parent
081c637c06
commit
4afb647c78
54
sdk/lib/crt/math/libm_sse2/L2_by_pi_bits.asm
Normal file
54
sdk/lib/crt/math/libm_sse2/L2_by_pi_bits.asm
Normal file
|
@ -0,0 +1,54 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;;
|
||||
;; Defines __L_2_by_pi_bits array
|
||||
;; Used in trigonometric argument reduction
|
||||
;;
|
||||
|
||||
.const

;; __L_2_by_pi_bits: 158 bytes of read-only data, 16-byte aligned.
;; Read from the highest offset down to the lowest, the bytes form one long
;; binary fraction holding the leading bits of 2/pi (hex 0.A2F9836E4E44...),
;; i.e. the table is stored least-significant byte first.
;; The trailing comment on each row is the byte offset of its first entry.

align 16
public __L_2_by_pi_bits
__L_2_by_pi_bits db 224, 241,  27, 193,  12,  88,  33, 116   ; +0
                 db  53, 126, 196, 126, 237, 175, 169,  75   ; +8
                 db  74,  41, 222, 231,  28, 244, 236, 197   ; +16
                 db 151, 175,  31, 235, 158, 212, 181, 168   ; +24
                 db 127, 121, 154, 253,  24,  61, 221,  38   ; +32
                 db  44, 159,  60, 251, 217, 180, 125, 180   ; +40
                 db  41, 104,  45,  70, 188, 188,  63,  96   ; +48
                 db  22, 120, 255,  95, 226, 127, 236, 160   ; +56
                 db 228, 247,  46, 126,  17, 114, 210, 231   ; +64
                 db  76,  13, 230,  88,  71, 230,   4, 249   ; +72
                 db 125, 209, 154, 192, 113, 166,  19,  18   ; +80
                 db 237, 186, 212, 215,   8, 162, 251, 156   ; +88
                 db 166, 196, 114, 172, 119, 248, 115,  72   ; +96
                 db  70,  39, 168, 187,  36,  25, 128,  75   ; +104
                 db  55,   9, 233, 184, 145, 220, 134,  21   ; +112
                 db 239, 122, 175, 142,  69, 249,   7,  65   ; +120
                 db  14, 241, 100,  86, 138, 109,   3, 119   ; +128
                 db 211, 212,  71,  95, 157, 240, 167,  84   ; +136
                 db  16,  57, 185,  13, 230, 139,   2,   0   ; +144
                 db   0,   0,   0,   0,   0,   0             ; +152, zero fill
end
|
62
sdk/lib/crt/math/libm_sse2/Lsincos_array.asm
Normal file
62
sdk/lib/crt/math/libm_sse2/Lsincos_array.asm
Normal file
|
@ -0,0 +1,62 @@
|
|||
;;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines __Lcosarray and __Lsinarray arrays.
|
||||
;; Used in sin.asm and cos.asm
|
||||
;;
|
||||
|
||||
.const

;; Double-precision polynomial coefficients (IEEE-754 bit patterns).
;; Every coefficient is followed by a zero quadword, so each entry occupies
;; a 16-byte slot and coefficient k lives at byte offset 16*(k-1).
;; NOTE(review): the zero padding presumably lets consumers load an entry
;; with an aligned 128-bit access - confirm against sin.asm / cos.asm.

align 16
public __Lcosarray
__Lcosarray     dq      03fa5555555555555h      ; c1 =  0.0416667
                dq      0                       ;      slot padding
                dq      0bf56c16c16c16967h      ; c2 = -0.00138889
                dq      0
                dq      03EFA01A019F4EC91h      ; c3 =  2.48016e-005
                dq      0
                dq      0bE927E4FA17F667Bh      ; c4 = -2.75573e-007
                dq      0
                dq      03E21EEB690382EECh      ; c5 =  2.08761e-009
                dq      0
                dq      0bDA907DB47258AA7h      ; c6 = -1.13826e-011
                dq      0

align 16
public __Lsinarray
__Lsinarray     dq      0bfc5555555555555h      ; s1 = -0.166667
                dq      0                       ;      slot padding
                dq      03f81111111110bb3h      ; s2 =  0.00833333
                dq      0
                dq      0bf2a01a019e83e5ch      ; s3 = -0.000198413
                dq      0
                dq      03ec71de3796cde01h      ; s4 =  2.75573e-006
                dq      0
                dq      0be5ae600b42fdfa7h      ; s5 = -2.50511e-008
                dq      0
                dq      03de5e0b2f9a43bb8h      ; s6 =  1.59181e-010
                dq      0

end
|
48
sdk/lib/crt/math/libm_sse2/Lsincosf_array.asm
Normal file
48
sdk/lib/crt/math/libm_sse2/Lsincosf_array.asm
Normal file
|
@ -0,0 +1,48 @@
|
|||
;;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines __Lcosfarray and __Lsinfarray arrays.
|
||||
;; Used in sin.asm and cos.asm
|
||||
;; These coefficients are actually from Taylor series.
|
||||
;;
|
||||
|
||||
.const

;; Polynomial coefficients stored as double-precision IEEE-754 bit patterns.
;; Unlike a padded layout, the entries here are packed back to back:
;; entry k of either table is at byte offset 8*k from its label.

align 16
public __Lcosfarray
__Lcosfarray    dq      0bfe0000000000000h      ; c0 = -0.5
                dq      03fa5555555555555h      ; c1 =  0.0416667
                dq      0bf56c16c16c16c16h      ; c2 = -0.00138889
                dq      03EFA01A01A01A019h      ; c3 =  2.48016e-005
                dq      0be927e4fb7789f5ch      ; c4 = -2.75573e-007

align 16
public __Lsinfarray
__Lsinfarray    dq      0bfc5555555555555h      ; s1 = -0.166667
                dq      03f81111111111111h      ; s2 =  0.00833333
                dq      0bf2a01a01a01a01ah      ; s3 = -0.000198413
                dq      03ec71de3a556c734h      ; s4 =  2.75573e-006

end
|
41
sdk/lib/crt/math/libm_sse2/_chgsign.c
Normal file
41
sdk/lib/crt/math/libm_sse2/_chgsign.c
Normal file
|
@ -0,0 +1,41 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
double FN_PROTOTYPE(_chgsign)(double x)
|
||||
{
|
||||
/* Returns x with its sign reversed.
|
||||
NaNs are not considered special; their sign bits are handled
|
||||
the same as for any other number */
|
||||
unsigned long u;
|
||||
GET_BITS_DP64(x, u);
|
||||
u ^= SIGNBIT_DP64;
|
||||
PUT_BITS_DP64(u, x);
|
||||
return x;
|
||||
}
|
||||
|
40
sdk/lib/crt/math/libm_sse2/_chgsignf.c
Normal file
40
sdk/lib/crt/math/libm_sse2/_chgsignf.c
Normal file
|
@ -0,0 +1,40 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
float FN_PROTOTYPE(_chgsignf)(float x)
|
||||
{
|
||||
/* Returns x with its sign reversed.
|
||||
NaNs are not considered special; their sign bits are handled
|
||||
the same as for any other number */
|
||||
unsigned int u;
|
||||
GET_BITS_SP32(x, u);
|
||||
u ^= SIGNBIT_SP32;
|
||||
PUT_BITS_SP32(u, x);
|
||||
return x;
|
||||
}
|
44
sdk/lib/crt/math/libm_sse2/_copysign.c
Normal file
44
sdk/lib/crt/math/libm_sse2/_copysign.c
Normal file
|
@ -0,0 +1,44 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
/* Returns the absolute value of x with the sign of y.
|
||||
NaNs are not considered special; their sign bits are handled
|
||||
the same as for any other number. */
|
||||
|
||||
double FN_PROTOTYPE(_copysign)(double x, double y)
|
||||
{
|
||||
|
||||
unsigned long ux, uy;
|
||||
GET_BITS_DP64(x, ux);
|
||||
GET_BITS_DP64(y, uy);
|
||||
if ((ux ^ uy) & SIGNBIT_DP64)
|
||||
PUT_BITS_DP64(ux ^ SIGNBIT_DP64, x);
|
||||
return x;
|
||||
|
||||
}
|
42
sdk/lib/crt/math/libm_sse2/_copysignf.c
Normal file
42
sdk/lib/crt/math/libm_sse2/_copysignf.c
Normal file
|
@ -0,0 +1,42 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
/* Returns the absolute value of x with the sign of y.
|
||||
NaNs are not considered special; their sign bits are handled
|
||||
the same as for any other number. */
|
||||
|
||||
float FN_PROTOTYPE(_copysignf)(float x, float y)
|
||||
{
|
||||
unsigned int ux, uy;
|
||||
GET_BITS_SP32(x, ux);
|
||||
GET_BITS_SP32(y, uy);
|
||||
if ((ux ^ uy) & SIGNBIT_SP32)
|
||||
PUT_BITS_SP32(ux ^ SIGNBIT_SP32, x);
|
||||
return x;
|
||||
}
|
39
sdk/lib/crt/math/libm_sse2/_finite.c
Normal file
39
sdk/lib/crt/math/libm_sse2/_finite.c
Normal file
|
@ -0,0 +1,39 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
/* Returns 0 if x is infinite or NaN, otherwise returns 1 */
|
||||
|
||||
int FN_PROTOTYPE(_finite)(double x)
|
||||
{
|
||||
|
||||
|
||||
unsigned long ux;
|
||||
GET_BITS_DP64(x, ux);
|
||||
return (int)(((ux & ~SIGNBIT_DP64) - PINFBITPATT_DP64) >> 63);
|
||||
}
|
40
sdk/lib/crt/math/libm_sse2/_finitef.c
Normal file
40
sdk/lib/crt/math/libm_sse2/_finitef.c
Normal file
|
@ -0,0 +1,40 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
/* Returns 0 if x is infinite or NaN, otherwise returns 1 */
|
||||
|
||||
int FN_PROTOTYPE(_finitef)(float x)
|
||||
{
|
||||
|
||||
|
||||
unsigned int ux;
|
||||
GET_BITS_SP32(x, ux);
|
||||
return (int)(((ux & ~SIGNBIT_SP32) - PINFBITPATT_SP32) >> 31);
|
||||
|
||||
}
|
145
sdk/lib/crt/math/libm_sse2/acos.c
Normal file
145
sdk/lib/crt/math/libm_sse2/acos.c
Normal file
|
@ -0,0 +1,145 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
|
||||
#pragma function(acos)
|
||||
|
||||
double FN_PROTOTYPE(acos)(double x)
|
||||
{
|
||||
/* Computes arccos(x).
|
||||
The argument is first reduced by noting that arccos(x)
|
||||
is invalid for abs(x) > 1. For denormal and small
|
||||
arguments arccos(x) = pi/2 to machine accuracy.
|
||||
Remaining argument ranges are handled as follows.
|
||||
For abs(x) <= 0.5 use
|
||||
arccos(x) = pi/2 - arcsin(x)
|
||||
= pi/2 - (x + x^3*R(x^2))
|
||||
where R(x^2) is a rational minimax approximation to
|
||||
(arcsin(x) - x)/x^3.
|
||||
For abs(x) > 0.5 exploit the identity:
|
||||
arccos(x) = pi - 2*arcsin(sqrt(1-x)/2)
|
||||
together with the above rational approximation, and
|
||||
reconstruct the terms carefully.
|
||||
*/
|
||||
|
||||
/* Some constants and split constants. */
|
||||
|
||||
static const double
|
||||
pi = 3.1415926535897933e+00, /* 0x400921fb54442d18 */
|
||||
piby2 = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */
|
||||
piby2_head = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */
|
||||
piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
|
||||
|
||||
double u, y, s=0.0, r;
|
||||
int xexp, xnan, transform=0;
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
xneg = (ux & SIGNBIT_DP64);
|
||||
xnan = (aux > PINFBITPATT_DP64);
|
||||
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
if (xnan)
|
||||
{
|
||||
return _handle_error("acos", OP_ACOS, ux|0x0008000000000000, _DOMAIN,
|
||||
0, EDOM, x, 0.0, 1);
|
||||
}
|
||||
else if (xexp < -56)
|
||||
{ /* y small enough that arccos(x) = pi/2 */
|
||||
return val_with_flags(piby2, AMD_F_INEXACT);
|
||||
}
|
||||
else if (xexp >= 0)
|
||||
{ /* abs(x) >= 1.0 */
|
||||
if (x == 1.0)
|
||||
return 0.0;
|
||||
else if (x == -1.0)
|
||||
return val_with_flags(pi, AMD_F_INEXACT);
|
||||
else
|
||||
return _handle_error("acos", OP_ACOS, INDEFBITPATT_DP64, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, 0.0, 1);
|
||||
}
|
||||
|
||||
if (xneg) y = -x;
|
||||
else y = x;
|
||||
|
||||
transform = (xexp >= -1); /* abs(x) >= 0.5 */
|
||||
|
||||
if (transform)
|
||||
{ /* Transform y into the range [0,0.5) */
|
||||
r = 0.5*(1.0 - y);
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));
|
||||
y = s;
|
||||
}
|
||||
else
|
||||
r = y*y;
|
||||
|
||||
/* Use a rational approximation for [0.0, 0.5] */
|
||||
|
||||
u = r*(0.227485835556935010735943483075 +
|
||||
(-0.445017216867635649900123110649 +
|
||||
(0.275558175256937652532686256258 +
|
||||
(-0.0549989809235685841612020091328 +
|
||||
(0.00109242697235074662306043804220 +
|
||||
0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/
|
||||
(1.36491501334161032038194214209 +
|
||||
(-3.28431505720958658909889444194 +
|
||||
(2.76568859157270989520376345954 +
|
||||
(-0.943639137032492685763471240072 +
|
||||
0.105869422087204370341222318533*r)*r)*r)*r);
|
||||
|
||||
if (transform)
|
||||
{ /* Reconstruct acos carefully in transformed region */
|
||||
if (xneg) return pi - 2.0*(s+(y*u - piby2_tail));
|
||||
else
|
||||
{
|
||||
double c, s1;
|
||||
unsigned long us;
|
||||
GET_BITS_DP64(s, us);
|
||||
PUT_BITS_DP64(0xffffffff00000000 & us, s1);
|
||||
c = (r-s1*s1)/(s+s1);
|
||||
return 2.0*s1 + (2.0*c+2.0*y*u);
|
||||
}
|
||||
}
|
||||
else
|
||||
return piby2_head - (x - (piby2_tail - x*u));
|
||||
}
|
146
sdk/lib/crt/math/libm_sse2/acosf.c
Normal file
146
sdk/lib/crt/math/libm_sse2/acosf.c
Normal file
|
@ -0,0 +1,146 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#define USE_NANF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_NANF_WITH_FLAGS
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(acosf)
|
||||
|
||||
|
||||
float FN_PROTOTYPE(acosf)(float x)
|
||||
{
|
||||
/* Computes arccos(x).
|
||||
The argument is first reduced by noting that arccos(x)
|
||||
is invalid for abs(x) > 1. For denormal and small
|
||||
arguments arccos(x) = pi/2 to machine accuracy.
|
||||
Remaining argument ranges are handled as follows.
|
||||
For abs(x) <= 0.5 use
|
||||
arccos(x) = pi/2 - arcsin(x)
|
||||
= pi/2 - (x + x^3*R(x^2))
|
||||
where R(x^2) is a rational minimax approximation to
|
||||
(arcsin(x) - x)/x^3.
|
||||
For abs(x) > 0.5 exploit the identity:
|
||||
arccos(x) = pi - 2*arcsin(sqrt(1-x)/2)
|
||||
together with the above rational approximation, and
|
||||
reconstruct the terms carefully.
|
||||
*/
|
||||
|
||||
/* Some constants and split constants. */
|
||||
|
||||
static const float
|
||||
piby2 = 1.5707963705e+00F; /* 0x3fc90fdb */
|
||||
static const double
|
||||
pi = 3.1415926535897933e+00, /* 0x400921fb54442d18 */
|
||||
piby2_head = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */
|
||||
piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
|
||||
|
||||
float u, y, s = 0.0F, r;
|
||||
int xexp, xnan, transform = 0;
|
||||
|
||||
unsigned int ux, aux, xneg;
|
||||
|
||||
GET_BITS_SP32(x, ux);
|
||||
aux = ux & ~SIGNBIT_SP32;
|
||||
xneg = (ux & SIGNBIT_SP32);
|
||||
xnan = (aux > PINFBITPATT_SP32);
|
||||
xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
if (xnan)
|
||||
{
|
||||
return _handle_errorf("acosf", OP_ACOS, ux|0x00400000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0F, 1);
|
||||
}
|
||||
else if (xexp < -26)
|
||||
/* y small enough that arccos(x) = pi/2 */
|
||||
return valf_with_flags(piby2, AMD_F_INEXACT);
|
||||
else if (xexp >= 0)
|
||||
{ /* abs(x) >= 1.0 */
|
||||
if (x == 1.0F)
|
||||
return 0.0F;
|
||||
else if (x == -1.0F)
|
||||
return valf_with_flags((float)pi, AMD_F_INEXACT);
|
||||
else
|
||||
return _handle_errorf("acosf", OP_ACOS, INDEFBITPATT_SP32, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, 0.0F, 1);
|
||||
}
|
||||
|
||||
if (xneg) y = -x;
|
||||
else y = x;
|
||||
|
||||
transform = (xexp >= -1); /* abs(x) >= 0.5 */
|
||||
|
||||
if (transform)
|
||||
{ /* Transform y into the range [0,0.5) */
|
||||
r = 0.5F*(1.0F - y);
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r)));
|
||||
y = s;
|
||||
}
|
||||
else
|
||||
r = y*y;
|
||||
|
||||
/* Use a rational approximation for [0.0, 0.5] */
|
||||
|
||||
u=r*(0.184161606965100694821398249421F +
|
||||
(-0.0565298683201845211985026327361F +
|
||||
(-0.0133819288943925804214011424456F -
|
||||
0.00396137437848476485201154797087F*r)*r)*r)/
|
||||
(1.10496961524520294485512696706F -
|
||||
0.836411276854206731913362287293F*r);
|
||||
|
||||
if (transform)
|
||||
{
|
||||
/* Reconstruct acos carefully in transformed region */
|
||||
if (xneg)
|
||||
return (float)(pi - 2.0*(s+(y*u - piby2_tail)));
|
||||
else
|
||||
{
|
||||
float c, s1;
|
||||
unsigned int us;
|
||||
GET_BITS_SP32(s, us);
|
||||
PUT_BITS_SP32(0xffff0000 & us, s1);
|
||||
c = (r-s1*s1)/(s+s1);
|
||||
return 2.0F*s1 + (2.0F*c+2.0F*y*u);
|
||||
}
|
||||
}
|
||||
else
|
||||
return (float)(piby2_head - (x - (piby2_tail - x*u)));
|
||||
}
|
153
sdk/lib/crt/math/libm_sse2/asin.c
Normal file
153
sdk/lib/crt/math/libm_sse2/asin.c
Normal file
|
@ -0,0 +1,153 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
#pragma function(asin)
|
||||
|
||||
double FN_PROTOTYPE(asin)(double x)
|
||||
{
|
||||
/* Computes arcsin(x).
|
||||
The argument is first reduced by noting that arcsin(x)
|
||||
is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
|
||||
For denormal and small arguments arcsin(x) = x to machine
|
||||
accuracy. Remaining argument ranges are handled as follows.
|
||||
For abs(x) <= 0.5 use
|
||||
arcsin(x) = x + x^3*R(x^2)
|
||||
where R(x^2) is a rational minimax approximation to
|
||||
(arcsin(x) - x)/x^3.
|
||||
For abs(x) > 0.5 exploit the identity:
|
||||
arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2)
|
||||
together with the above rational approximation, and
|
||||
reconstruct the terms carefully.
|
||||
*/
|
||||
|
||||
/* Some constants and split constants. */
|
||||
|
||||
static const double
|
||||
piby2_tail = 6.1232339957367660e-17, /* 0x3c91a62633145c07 */
|
||||
hpiby2_head = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
|
||||
piby2 = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */
|
||||
double u, v, y, s=0.0, r;
|
||||
int xexp, xnan, transform=0;
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
xneg = (ux & SIGNBIT_DP64);
|
||||
xnan = (aux > PINFBITPATT_DP64);
|
||||
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
if (xnan)
|
||||
{
|
||||
return _handle_error("asin", OP_ASIN, ux|0x0008000000000000, _DOMAIN,
|
||||
0, EDOM, x, 0.0, 1);
|
||||
}
|
||||
else if (xexp < -28)
|
||||
{ /* y small enough that arcsin(x) = x */
|
||||
return val_with_flags(x, AMD_F_INEXACT);
|
||||
}
|
||||
else if (xexp >= 0)
|
||||
{ /* abs(x) >= 1.0 */
|
||||
if (x == 1.0)
|
||||
return val_with_flags(piby2, AMD_F_INEXACT);
|
||||
else if (x == -1.0)
|
||||
return val_with_flags(-piby2, AMD_F_INEXACT);
|
||||
else
|
||||
return _handle_error("asin", OP_ASIN, INDEFBITPATT_DP64, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, 0.0, 1);
|
||||
}
|
||||
|
||||
if (xneg) y = -x;
|
||||
else y = x;
|
||||
|
||||
transform = (xexp >= -1); /* abs(x) >= 0.5 */
|
||||
|
||||
if (transform)
|
||||
{ /* Transform y into the range [0,0.5) */
|
||||
r = 0.5*(1.0 - y);
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));
|
||||
y = s;
|
||||
}
|
||||
else
|
||||
r = y*y;
|
||||
|
||||
/* Use a rational approximation for [0.0, 0.5] */
|
||||
|
||||
u = r*(0.227485835556935010735943483075 +
|
||||
(-0.445017216867635649900123110649 +
|
||||
(0.275558175256937652532686256258 +
|
||||
(-0.0549989809235685841612020091328 +
|
||||
(0.00109242697235074662306043804220 +
|
||||
0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/
|
||||
(1.36491501334161032038194214209 +
|
||||
(-3.28431505720958658909889444194 +
|
||||
(2.76568859157270989520376345954 +
|
||||
(-0.943639137032492685763471240072 +
|
||||
0.105869422087204370341222318533*r)*r)*r)*r);
|
||||
|
||||
if (transform)
|
||||
{ /* Reconstruct asin carefully in transformed region */
|
||||
{
|
||||
double c, s1, p, q;
|
||||
unsigned long us;
|
||||
GET_BITS_DP64(s, us);
|
||||
PUT_BITS_DP64(0xffffffff00000000 & us, s1);
|
||||
c = (r-s1*s1)/(s+s1);
|
||||
p = 2.0*s*u - (piby2_tail-2.0*c);
|
||||
q = hpiby2_head - 2.0*s1;
|
||||
v = hpiby2_head - (p-q);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Use a temporary variable to prevent VC++ rearranging
|
||||
y + y*u
|
||||
into
|
||||
y * (1 + u)
|
||||
and getting an incorrectly rounded result */
|
||||
double tmp;
|
||||
tmp = y * u;
|
||||
v = y + tmp;
|
||||
}
|
||||
|
||||
if (xneg) return -v;
|
||||
else return v;
|
||||
}
|
151
sdk/lib/crt/math/libm_sse2/asinf.c
Normal file
151
sdk/lib/crt/math/libm_sse2/asinf.c
Normal file
|
@ -0,0 +1,151 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#define USE_NANF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_NANF_WITH_FLAGS
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(asinf)
|
||||
|
||||
|
||||
float FN_PROTOTYPE(asinf)(float x)
|
||||
{
|
||||
/* Computes arcsin(x).
|
||||
The argument is first reduced by noting that arcsin(x)
|
||||
is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
|
||||
For denormal and small arguments arcsin(x) = x to machine
|
||||
accuracy. Remaining argument ranges are handled as follows.
|
||||
For abs(x) <= 0.5 use
|
||||
arcsin(x) = x + x^3*R(x^2)
|
||||
where R(x^2) is a rational minimax approximation to
|
||||
(arcsin(x) - x)/x^3.
|
||||
For abs(x) > 0.5 exploit the identity:
|
||||
arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2)
|
||||
together with the above rational approximation, and
|
||||
reconstruct the terms carefully.
|
||||
*/
|
||||
|
||||
/* Some constants and split constants. */
|
||||
|
||||
static const float
|
||||
piby2_tail = 7.5497894159e-08F, /* 0x33a22168 */
|
||||
hpiby2_head = 7.8539812565e-01F, /* 0x3f490fda */
|
||||
piby2 = 1.5707963705e+00F; /* 0x3fc90fdb */
|
||||
float u, v, y, s = 0.0F, r;
|
||||
int xexp, xnan, transform = 0;
|
||||
|
||||
unsigned int ux, aux, xneg;
|
||||
GET_BITS_SP32(x, ux);
|
||||
aux = ux & ~SIGNBIT_SP32;
|
||||
xneg = (ux & SIGNBIT_SP32);
|
||||
xnan = (aux > PINFBITPATT_SP32);
|
||||
xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
if (xnan)
|
||||
{
|
||||
return _handle_errorf("asinf", OP_ASIN, ux|0x00400000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0F, 1);
|
||||
}
|
||||
else if (xexp < -14)
|
||||
/* y small enough that arcsin(x) = x */
|
||||
return valf_with_flags(x, AMD_F_INEXACT);
|
||||
else if (xexp >= 0)
|
||||
{
|
||||
/* abs(x) >= 1.0 */
|
||||
if (x == 1.0F)
|
||||
return valf_with_flags(piby2, AMD_F_INEXACT);
|
||||
else if (x == -1.0F)
|
||||
return valf_with_flags(-piby2, AMD_F_INEXACT);
|
||||
else
|
||||
return _handle_errorf("asinf", OP_ASIN, INDEFBITPATT_SP32, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, 0.0F, 1);
|
||||
}
|
||||
|
||||
if (xneg) y = -x;
|
||||
else y = x;
|
||||
|
||||
transform = (xexp >= -1); /* abs(x) >= 0.5 */
|
||||
|
||||
if (transform)
|
||||
{ /* Transform y into the range [0,0.5) */
|
||||
r = 0.5F*(1.0F - y);
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r)));
|
||||
y = s;
|
||||
}
|
||||
else
|
||||
r = y*y;
|
||||
|
||||
/* Use a rational approximation for [0.0, 0.5] */
|
||||
|
||||
u=r*(0.184161606965100694821398249421F +
|
||||
(-0.0565298683201845211985026327361F +
|
||||
(-0.0133819288943925804214011424456F -
|
||||
0.00396137437848476485201154797087F*r)*r)*r)/
|
||||
(1.10496961524520294485512696706F -
|
||||
0.836411276854206731913362287293F*r);
|
||||
|
||||
if (transform)
|
||||
{
|
||||
/* Reconstruct asin carefully in transformed region */
|
||||
float c, s1, p, q;
|
||||
unsigned int us;
|
||||
GET_BITS_SP32(s, us);
|
||||
PUT_BITS_SP32(0xffff0000 & us, s1);
|
||||
c = (r-s1*s1)/(s+s1);
|
||||
p = 2.0F*s*u - (piby2_tail-2.0F*c);
|
||||
q = hpiby2_head - 2.0F*s1;
|
||||
v = hpiby2_head - (p-q);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Use a temporary variable to prevent VC++ rearranging
|
||||
y + y*u
|
||||
into
|
||||
y * (1 + u)
|
||||
and getting an incorrectly rounded result */
|
||||
float tmp;
|
||||
tmp = y * u;
|
||||
v = y + tmp;
|
||||
}
|
||||
|
||||
if (xneg) return -v;
|
||||
else return v;
|
||||
}
|
132
sdk/lib/crt/math/libm_sse2/atan.c
Normal file
132
sdk/lib/crt/math/libm_sse2/atan.c
Normal file
|
@ -0,0 +1,132 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
#pragma function(atan)
|
||||
|
||||
double FN_PROTOTYPE(atan)(double x)
|
||||
{
|
||||
|
||||
/* Some constants and split constants. */
|
||||
|
||||
static double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
|
||||
double chi, clo, v, s, q, z;
|
||||
|
||||
/* Find properties of argument x. */
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
xneg = (ux != aux);
|
||||
|
||||
if (xneg) v = -x;
|
||||
else v = x;
|
||||
|
||||
/* Argument reduction to range [-7/16,7/16] */
|
||||
|
||||
if (aux > 0x4003800000000000) /* v > 39./16. */
|
||||
{
|
||||
|
||||
if (aux > PINFBITPATT_DP64)
|
||||
{
|
||||
/* x is NaN */
|
||||
return _handle_error("atan", OP_ATAN, ux|0x0008000000000000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0, 1);
|
||||
}
|
||||
else if (v > 0x4370000000000000)
|
||||
{ /* abs(x) > 2^56 => arctan(1/x) is
|
||||
insignificant compared to piby2 */
|
||||
if (xneg)
|
||||
return val_with_flags(-piby2, AMD_F_INEXACT);
|
||||
else
|
||||
return val_with_flags(piby2, AMD_F_INEXACT);
|
||||
}
|
||||
|
||||
x = -1.0/v;
|
||||
/* (chi + clo) = arctan(infinity) */
|
||||
chi = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
|
||||
clo = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */
|
||||
}
|
||||
else if (aux > 0x3ff3000000000000) /* 39./16. > v > 19./16. */
|
||||
{
|
||||
x = (v-1.5)/(1.0+1.5*v);
|
||||
/* (chi + clo) = arctan(1.5) */
|
||||
chi = 9.82793723247329054082e-01; /* 0x3fef730bd281f69b */
|
||||
clo = 1.39033110312309953701e-17; /* 0x3c7007887af0cbbc */
|
||||
}
|
||||
else if (aux > 0x3fe6000000000000) /* 19./16. > v > 11./16. */
|
||||
{
|
||||
x = (v-1.0)/(1.0+v);
|
||||
/* (chi + clo) = arctan(1.) */
|
||||
chi = 7.85398163397448278999e-01; /* 0x3fe921fb54442d18 */
|
||||
clo = 3.06161699786838240164e-17; /* 0x3c81a62633145c06 */
|
||||
}
|
||||
else if (aux > 0x3fdc000000000000) /* 11./16. > v > 7./16. */
|
||||
{
|
||||
x = (2.0*v-1.0)/(2.0+v);
|
||||
/* (chi + clo) = arctan(0.5) */
|
||||
chi = 4.63647609000806093515e-01; /* 0x3fddac670561bb4f */
|
||||
clo = 2.26987774529616809294e-17; /* 0x3c7a2b7f222f65e0 */
|
||||
}
|
||||
else /* v < 7./16. */
|
||||
{
|
||||
x = v;
|
||||
chi = 0.0;
|
||||
clo = 0.0;
|
||||
}
|
||||
|
||||
/* Core approximation: Remez(4,4) on [-7/16,7/16] */
|
||||
|
||||
s = x*x;
|
||||
q = x*s*
|
||||
(0.268297920532545909e0 +
|
||||
(0.447677206805497472e0 +
|
||||
(0.220638780716667420e0 +
|
||||
(0.304455919504853031e-1 +
|
||||
0.142316903342317766e-3*s)*s)*s)*s)/
|
||||
(0.804893761597637733e0 +
|
||||
(0.182596787737507063e1 +
|
||||
(0.141254259931958921e1 +
|
||||
(0.424602594203847109e0 +
|
||||
0.389525873944742195e-1*s)*s)*s)*s);
|
||||
|
||||
z = chi - ((q - clo) - x);
|
||||
|
||||
if (xneg) z = -z;
|
||||
return z;
|
||||
}
|
750
sdk/lib/crt/math/libm_sse2/atan2.c
Normal file
750
sdk/lib/crt/math/libm_sse2/atan2.c
Normal file
|
@ -0,0 +1,750 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_SCALEDOUBLE_2
|
||||
#define USE_SCALEUPDOUBLE1024
|
||||
#define USE_SCALEDOWNDOUBLE
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_SCALEDOUBLE_2
|
||||
#undef USE_SCALEUPDOUBLE1024
|
||||
#undef USE_SCALEDOWNDOUBLE
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
#pragma function(atan2)
|
||||
|
||||
double FN_PROTOTYPE(atan2)(double y, double x)
|
||||
{
|
||||
/* Arrays atan_jby256_lead and atan_jby256_tail contain
|
||||
leading and trailing parts respectively of precomputed
|
||||
values of atan(j/256), for j = 16, 17, ..., 256.
|
||||
atan_jby256_lead contains the first 21 bits of precision,
|
||||
and atan_jby256_tail contains a further 53 bits precision. */
|
||||
|
||||
static const double atan_jby256_lead[ 241] = {
|
||||
6.24187886714935302734e-02, /* 0x3faff55b00000000 */
|
||||
6.63088560104370117188e-02, /* 0x3fb0f99e00000000 */
|
||||
7.01969265937805175781e-02, /* 0x3fb1f86d00000000 */
|
||||
7.40829110145568847656e-02, /* 0x3fb2f71900000000 */
|
||||
7.79666304588317871094e-02, /* 0x3fb3f59f00000000 */
|
||||
8.18479657173156738281e-02, /* 0x3fb4f3fd00000000 */
|
||||
8.57268571853637695312e-02, /* 0x3fb5f23200000000 */
|
||||
8.96031260490417480469e-02, /* 0x3fb6f03b00000000 */
|
||||
9.34767723083496093750e-02, /* 0x3fb7ee1800000000 */
|
||||
9.73475575447082519531e-02, /* 0x3fb8ebc500000000 */
|
||||
1.01215422153472900391e-01, /* 0x3fb9e94100000000 */
|
||||
1.05080246925354003906e-01, /* 0x3fbae68a00000000 */
|
||||
1.08941912651062011719e-01, /* 0x3fbbe39e00000000 */
|
||||
1.12800359725952148438e-01, /* 0x3fbce07c00000000 */
|
||||
1.16655409336090087891e-01, /* 0x3fbddd2100000000 */
|
||||
1.20507001876831054688e-01, /* 0x3fbed98c00000000 */
|
||||
1.24354958534240722656e-01, /* 0x3fbfd5ba00000000 */
|
||||
1.28199219703674316406e-01, /* 0x3fc068d500000000 */
|
||||
1.32039666175842285156e-01, /* 0x3fc0e6ad00000000 */
|
||||
1.35876297950744628906e-01, /* 0x3fc1646500000000 */
|
||||
1.39708757400512695312e-01, /* 0x3fc1e1fa00000000 */
|
||||
1.43537282943725585938e-01, /* 0x3fc25f6e00000000 */
|
||||
1.47361397743225097656e-01, /* 0x3fc2dcbd00000000 */
|
||||
1.51181221008300781250e-01, /* 0x3fc359e800000000 */
|
||||
1.54996633529663085938e-01, /* 0x3fc3d6ee00000000 */
|
||||
1.58807516098022460938e-01, /* 0x3fc453ce00000000 */
|
||||
1.62613749504089355469e-01, /* 0x3fc4d08700000000 */
|
||||
1.66415214538574218750e-01, /* 0x3fc54d1800000000 */
|
||||
1.70211911201477050781e-01, /* 0x3fc5c98100000000 */
|
||||
1.74003481864929199219e-01, /* 0x3fc645bf00000000 */
|
||||
1.77790164947509765625e-01, /* 0x3fc6c1d400000000 */
|
||||
1.81571602821350097656e-01, /* 0x3fc73dbd00000000 */
|
||||
1.85347914695739746094e-01, /* 0x3fc7b97b00000000 */
|
||||
1.89118742942810058594e-01, /* 0x3fc8350b00000000 */
|
||||
1.92884206771850585938e-01, /* 0x3fc8b06e00000000 */
|
||||
1.96644186973571777344e-01, /* 0x3fc92ba300000000 */
|
||||
2.00398445129394531250e-01, /* 0x3fc9a6a800000000 */
|
||||
2.04147100448608398438e-01, /* 0x3fca217e00000000 */
|
||||
2.07889914512634277344e-01, /* 0x3fca9c2300000000 */
|
||||
2.11626768112182617188e-01, /* 0x3fcb169600000000 */
|
||||
2.15357661247253417969e-01, /* 0x3fcb90d700000000 */
|
||||
2.19082474708557128906e-01, /* 0x3fcc0ae500000000 */
|
||||
2.22801089286804199219e-01, /* 0x3fcc84bf00000000 */
|
||||
2.26513504981994628906e-01, /* 0x3fccfe6500000000 */
|
||||
2.30219483375549316406e-01, /* 0x3fcd77d500000000 */
|
||||
2.33919143676757812500e-01, /* 0x3fcdf11000000000 */
|
||||
2.37612247467041015625e-01, /* 0x3fce6a1400000000 */
|
||||
2.41298794746398925781e-01, /* 0x3fcee2e100000000 */
|
||||
2.44978547096252441406e-01, /* 0x3fcf5b7500000000 */
|
||||
2.48651623725891113281e-01, /* 0x3fcfd3d100000000 */
|
||||
2.52317905426025390625e-01, /* 0x3fd025fa00000000 */
|
||||
2.55977153778076171875e-01, /* 0x3fd061ee00000000 */
|
||||
2.59629487991333007812e-01, /* 0x3fd09dc500000000 */
|
||||
2.63274669647216796875e-01, /* 0x3fd0d97e00000000 */
|
||||
2.66912937164306640625e-01, /* 0x3fd1151a00000000 */
|
||||
2.70543813705444335938e-01, /* 0x3fd1509700000000 */
|
||||
2.74167299270629882812e-01, /* 0x3fd18bf500000000 */
|
||||
2.77783632278442382812e-01, /* 0x3fd1c73500000000 */
|
||||
2.81392335891723632812e-01, /* 0x3fd2025500000000 */
|
||||
2.84993648529052734375e-01, /* 0x3fd23d5600000000 */
|
||||
2.88587331771850585938e-01, /* 0x3fd2783700000000 */
|
||||
2.92173147201538085938e-01, /* 0x3fd2b2f700000000 */
|
||||
2.95751571655273437500e-01, /* 0x3fd2ed9800000000 */
|
||||
2.99322128295898437500e-01, /* 0x3fd3281800000000 */
|
||||
3.02884817123413085938e-01, /* 0x3fd3627700000000 */
|
||||
3.06439399719238281250e-01, /* 0x3fd39cb400000000 */
|
||||
3.09986352920532226562e-01, /* 0x3fd3d6d100000000 */
|
||||
3.13524961471557617188e-01, /* 0x3fd410cb00000000 */
|
||||
3.17055702209472656250e-01, /* 0x3fd44aa400000000 */
|
||||
3.20578098297119140625e-01, /* 0x3fd4845a00000000 */
|
||||
3.24092388153076171875e-01, /* 0x3fd4bdee00000000 */
|
||||
3.27598333358764648438e-01, /* 0x3fd4f75f00000000 */
|
||||
3.31095933914184570312e-01, /* 0x3fd530ad00000000 */
|
||||
3.34585189819335937500e-01, /* 0x3fd569d800000000 */
|
||||
3.38066101074218750000e-01, /* 0x3fd5a2e000000000 */
|
||||
3.41538190841674804688e-01, /* 0x3fd5dbc300000000 */
|
||||
3.45002174377441406250e-01, /* 0x3fd6148400000000 */
|
||||
3.48457098007202148438e-01, /* 0x3fd64d1f00000000 */
|
||||
3.51903676986694335938e-01, /* 0x3fd6859700000000 */
|
||||
3.55341434478759765625e-01, /* 0x3fd6bdea00000000 */
|
||||
3.58770608901977539062e-01, /* 0x3fd6f61900000000 */
|
||||
3.62190723419189453125e-01, /* 0x3fd72e2200000000 */
|
||||
3.65602254867553710938e-01, /* 0x3fd7660700000000 */
|
||||
3.69004726409912109375e-01, /* 0x3fd79dc600000000 */
|
||||
3.72398376464843750000e-01, /* 0x3fd7d56000000000 */
|
||||
3.75782966613769531250e-01, /* 0x3fd80cd400000000 */
|
||||
3.79158496856689453125e-01, /* 0x3fd8442200000000 */
|
||||
3.82525205612182617188e-01, /* 0x3fd87b4b00000000 */
|
||||
3.85882616043090820312e-01, /* 0x3fd8b24d00000000 */
|
||||
3.89230966567993164062e-01, /* 0x3fd8e92900000000 */
|
||||
3.92570018768310546875e-01, /* 0x3fd91fde00000000 */
|
||||
3.95900011062622070312e-01, /* 0x3fd9566d00000000 */
|
||||
3.99220705032348632812e-01, /* 0x3fd98cd500000000 */
|
||||
4.02532100677490234375e-01, /* 0x3fd9c31600000000 */
|
||||
4.05834197998046875000e-01, /* 0x3fd9f93000000000 */
|
||||
4.09126996994018554688e-01, /* 0x3fda2f2300000000 */
|
||||
4.12410259246826171875e-01, /* 0x3fda64ee00000000 */
|
||||
4.15684223175048828125e-01, /* 0x3fda9a9200000000 */
|
||||
4.18948888778686523438e-01, /* 0x3fdad00f00000000 */
|
||||
4.22204017639160156250e-01, /* 0x3fdb056400000000 */
|
||||
4.25449609756469726562e-01, /* 0x3fdb3a9100000000 */
|
||||
4.28685665130615234375e-01, /* 0x3fdb6f9600000000 */
|
||||
4.31912183761596679688e-01, /* 0x3fdba47300000000 */
|
||||
4.35129165649414062500e-01, /* 0x3fdbd92800000000 */
|
||||
4.38336372375488281250e-01, /* 0x3fdc0db400000000 */
|
||||
4.41534280776977539062e-01, /* 0x3fdc421900000000 */
|
||||
4.44722414016723632812e-01, /* 0x3fdc765500000000 */
|
||||
4.47900772094726562500e-01, /* 0x3fdcaa6800000000 */
|
||||
4.51069593429565429688e-01, /* 0x3fdcde5300000000 */
|
||||
4.54228639602661132812e-01, /* 0x3fdd121500000000 */
|
||||
4.57377910614013671875e-01, /* 0x3fdd45ae00000000 */
|
||||
4.60517644882202148438e-01, /* 0x3fdd791f00000000 */
|
||||
4.63647603988647460938e-01, /* 0x3fddac6700000000 */
|
||||
4.66767549514770507812e-01, /* 0x3fdddf8500000000 */
|
||||
4.69877958297729492188e-01, /* 0x3fde127b00000000 */
|
||||
4.72978591918945312500e-01, /* 0x3fde454800000000 */
|
||||
4.76069211959838867188e-01, /* 0x3fde77eb00000000 */
|
||||
4.79150056838989257812e-01, /* 0x3fdeaa6500000000 */
|
||||
4.82221126556396484375e-01, /* 0x3fdedcb600000000 */
|
||||
4.85282421112060546875e-01, /* 0x3fdf0ede00000000 */
|
||||
4.88333940505981445312e-01, /* 0x3fdf40dd00000000 */
|
||||
4.91375446319580078125e-01, /* 0x3fdf72b200000000 */
|
||||
4.94406938552856445312e-01, /* 0x3fdfa45d00000000 */
|
||||
4.97428894042968750000e-01, /* 0x3fdfd5e000000000 */
|
||||
5.00440597534179687500e-01, /* 0x3fe0039c00000000 */
|
||||
5.03442764282226562500e-01, /* 0x3fe01c3400000000 */
|
||||
5.06434917449951171875e-01, /* 0x3fe034b700000000 */
|
||||
5.09417057037353515625e-01, /* 0x3fe04d2500000000 */
|
||||
5.12389183044433593750e-01, /* 0x3fe0657e00000000 */
|
||||
5.15351772308349609375e-01, /* 0x3fe07dc300000000 */
|
||||
5.18304347991943359375e-01, /* 0x3fe095f300000000 */
|
||||
5.21246910095214843750e-01, /* 0x3fe0ae0e00000000 */
|
||||
5.24179458618164062500e-01, /* 0x3fe0c61400000000 */
|
||||
5.27101993560791015625e-01, /* 0x3fe0de0500000000 */
|
||||
5.30014991760253906250e-01, /* 0x3fe0f5e200000000 */
|
||||
5.32917976379394531250e-01, /* 0x3fe10daa00000000 */
|
||||
5.35810947418212890625e-01, /* 0x3fe1255d00000000 */
|
||||
5.38693904876708984375e-01, /* 0x3fe13cfb00000000 */
|
||||
5.41567325592041015625e-01, /* 0x3fe1548500000000 */
|
||||
5.44430732727050781250e-01, /* 0x3fe16bfa00000000 */
|
||||
5.47284126281738281250e-01, /* 0x3fe1835a00000000 */
|
||||
5.50127506256103515625e-01, /* 0x3fe19aa500000000 */
|
||||
5.52961349487304687500e-01, /* 0x3fe1b1dc00000000 */
|
||||
5.55785179138183593750e-01, /* 0x3fe1c8fe00000000 */
|
||||
5.58598995208740234375e-01, /* 0x3fe1e00b00000000 */
|
||||
5.61403274536132812500e-01, /* 0x3fe1f70400000000 */
|
||||
5.64197540283203125000e-01, /* 0x3fe20de800000000 */
|
||||
5.66981792449951171875e-01, /* 0x3fe224b700000000 */
|
||||
5.69756031036376953125e-01, /* 0x3fe23b7100000000 */
|
||||
5.72520732879638671875e-01, /* 0x3fe2521700000000 */
|
||||
5.75275897979736328125e-01, /* 0x3fe268a900000000 */
|
||||
5.78021049499511718750e-01, /* 0x3fe27f2600000000 */
|
||||
5.80756187438964843750e-01, /* 0x3fe2958e00000000 */
|
||||
5.83481788635253906250e-01, /* 0x3fe2abe200000000 */
|
||||
5.86197376251220703125e-01, /* 0x3fe2c22100000000 */
|
||||
5.88903427124023437500e-01, /* 0x3fe2d84c00000000 */
|
||||
5.91599464416503906250e-01, /* 0x3fe2ee6200000000 */
|
||||
5.94285964965820312500e-01, /* 0x3fe3046400000000 */
|
||||
5.96962928771972656250e-01, /* 0x3fe31a5200000000 */
|
||||
5.99629878997802734375e-01, /* 0x3fe3302b00000000 */
|
||||
6.02287292480468750000e-01, /* 0x3fe345f000000000 */
|
||||
6.04934692382812500000e-01, /* 0x3fe35ba000000000 */
|
||||
6.07573032379150390625e-01, /* 0x3fe3713d00000000 */
|
||||
6.10201358795166015625e-01, /* 0x3fe386c500000000 */
|
||||
6.12820148468017578125e-01, /* 0x3fe39c3900000000 */
|
||||
6.15428924560546875000e-01, /* 0x3fe3b19800000000 */
|
||||
6.18028640747070312500e-01, /* 0x3fe3c6e400000000 */
|
||||
6.20618820190429687500e-01, /* 0x3fe3dc1c00000000 */
|
||||
6.23198986053466796875e-01, /* 0x3fe3f13f00000000 */
|
||||
6.25770092010498046875e-01, /* 0x3fe4064f00000000 */
|
||||
6.28331184387207031250e-01, /* 0x3fe41b4a00000000 */
|
||||
6.30883216857910156250e-01, /* 0x3fe4303200000000 */
|
||||
6.33425712585449218750e-01, /* 0x3fe4450600000000 */
|
||||
6.35958671569824218750e-01, /* 0x3fe459c600000000 */
|
||||
6.38482093811035156250e-01, /* 0x3fe46e7200000000 */
|
||||
6.40995979309082031250e-01, /* 0x3fe4830a00000000 */
|
||||
6.43500804901123046875e-01, /* 0x3fe4978f00000000 */
|
||||
6.45996093750000000000e-01, /* 0x3fe4ac0000000000 */
|
||||
6.48482322692871093750e-01, /* 0x3fe4c05e00000000 */
|
||||
6.50959014892578125000e-01, /* 0x3fe4d4a800000000 */
|
||||
6.53426170349121093750e-01, /* 0x3fe4e8de00000000 */
|
||||
6.55884265899658203125e-01, /* 0x3fe4fd0100000000 */
|
||||
6.58332824707031250000e-01, /* 0x3fe5111000000000 */
|
||||
6.60772323608398437500e-01, /* 0x3fe5250c00000000 */
|
||||
6.63202762603759765625e-01, /* 0x3fe538f500000000 */
|
||||
6.65623664855957031250e-01, /* 0x3fe54cca00000000 */
|
||||
6.68035984039306640625e-01, /* 0x3fe5608d00000000 */
|
||||
6.70438766479492187500e-01, /* 0x3fe5743c00000000 */
|
||||
6.72832489013671875000e-01, /* 0x3fe587d800000000 */
|
||||
6.75216674804687500000e-01, /* 0x3fe59b6000000000 */
|
||||
6.77592277526855468750e-01, /* 0x3fe5aed600000000 */
|
||||
6.79958820343017578125e-01, /* 0x3fe5c23900000000 */
|
||||
6.82316303253173828125e-01, /* 0x3fe5d58900000000 */
|
||||
6.84664726257324218750e-01, /* 0x3fe5e8c600000000 */
|
||||
6.87004089355468750000e-01, /* 0x3fe5fbf000000000 */
|
||||
6.89334869384765625000e-01, /* 0x3fe60f0800000000 */
|
||||
6.91656589508056640625e-01, /* 0x3fe6220d00000000 */
|
||||
6.93969249725341796875e-01, /* 0x3fe634ff00000000 */
|
||||
6.96272850036621093750e-01, /* 0x3fe647de00000000 */
|
||||
6.98567867279052734375e-01, /* 0x3fe65aab00000000 */
|
||||
7.00854301452636718750e-01, /* 0x3fe66d6600000000 */
|
||||
7.03131675720214843750e-01, /* 0x3fe6800e00000000 */
|
||||
7.05400466918945312500e-01, /* 0x3fe692a400000000 */
|
||||
7.07660198211669921875e-01, /* 0x3fe6a52700000000 */
|
||||
7.09911346435546875000e-01, /* 0x3fe6b79800000000 */
|
||||
7.12153911590576171875e-01, /* 0x3fe6c9f700000000 */
|
||||
7.14387893676757812500e-01, /* 0x3fe6dc4400000000 */
|
||||
7.16613292694091796875e-01, /* 0x3fe6ee7f00000000 */
|
||||
7.18829631805419921875e-01, /* 0x3fe700a700000000 */
|
||||
7.21037864685058593750e-01, /* 0x3fe712be00000000 */
|
||||
7.23237514495849609375e-01, /* 0x3fe724c300000000 */
|
||||
7.25428581237792968750e-01, /* 0x3fe736b600000000 */
|
||||
7.27611064910888671875e-01, /* 0x3fe7489700000000 */
|
||||
7.29785442352294921875e-01, /* 0x3fe75a6700000000 */
|
||||
7.31950759887695312500e-01, /* 0x3fe76c2400000000 */
|
||||
7.34108448028564453125e-01, /* 0x3fe77dd100000000 */
|
||||
7.36257076263427734375e-01, /* 0x3fe78f6b00000000 */
|
||||
7.38397598266601562500e-01, /* 0x3fe7a0f400000000 */
|
||||
7.40530014038085937500e-01, /* 0x3fe7b26c00000000 */
|
||||
7.42654323577880859375e-01, /* 0x3fe7c3d300000000 */
|
||||
7.44770050048828125000e-01, /* 0x3fe7d52800000000 */
|
||||
7.46877670288085937500e-01, /* 0x3fe7e66c00000000 */
|
||||
7.48976707458496093750e-01, /* 0x3fe7f79e00000000 */
|
||||
7.51068115234375000000e-01, /* 0x3fe808c000000000 */
|
||||
7.53150939941406250000e-01, /* 0x3fe819d000000000 */
|
||||
7.55226135253906250000e-01, /* 0x3fe82ad000000000 */
|
||||
7.57292747497558593750e-01, /* 0x3fe83bbe00000000 */
|
||||
7.59351730346679687500e-01, /* 0x3fe84c9c00000000 */
|
||||
7.61402606964111328125e-01, /* 0x3fe85d6900000000 */
|
||||
7.63445377349853515625e-01, /* 0x3fe86e2500000000 */
|
||||
7.65480041503906250000e-01, /* 0x3fe87ed000000000 */
|
||||
7.67507076263427734375e-01, /* 0x3fe88f6b00000000 */
|
||||
7.69526004791259765625e-01, /* 0x3fe89ff500000000 */
|
||||
7.71537303924560546875e-01, /* 0x3fe8b06f00000000 */
|
||||
7.73540973663330078125e-01, /* 0x3fe8c0d900000000 */
|
||||
7.75536537170410156250e-01, /* 0x3fe8d13200000000 */
|
||||
7.77523994445800781250e-01, /* 0x3fe8e17a00000000 */
|
||||
7.79504299163818359375e-01, /* 0x3fe8f1b300000000 */
|
||||
7.81476497650146484375e-01, /* 0x3fe901db00000000 */
|
||||
7.83441066741943359375e-01, /* 0x3fe911f300000000 */
|
||||
7.85398006439208984375e-01}; /* 0x3fe921fb00000000 */
|
||||
|
||||
static const double atan_jby256_tail[ 241] = {
|
||||
2.13244638182005395671e-08, /* 0x3e56e59fbd38db2c */
|
||||
3.89093864761712760656e-08, /* 0x3e64e3aa54dedf96 */
|
||||
4.44780900009437454576e-08, /* 0x3e67e105ab1bda88 */
|
||||
1.15344768460112754160e-08, /* 0x3e48c5254d013fd0 */
|
||||
3.37271051945395312705e-09, /* 0x3e2cf8ab3ad62670 */
|
||||
2.40857608736109859459e-08, /* 0x3e59dca4bec80468 */
|
||||
1.85853810450623807768e-08, /* 0x3e53f4b5ec98a8da */
|
||||
5.14358299969225078306e-08, /* 0x3e6b9d49619d81fe */
|
||||
8.85023985412952486748e-09, /* 0x3e43017887460934 */
|
||||
1.59425154214358432060e-08, /* 0x3e511e3eca0b9944 */
|
||||
1.95139937737755753164e-08, /* 0x3e54f3f73c5a332e */
|
||||
2.64909755273544319715e-08, /* 0x3e5c71c8ae0e00a6 */
|
||||
4.43388037881231070144e-08, /* 0x3e67cde0f86fbdc7 */
|
||||
2.14757072421821274557e-08, /* 0x3e570f328c889c72 */
|
||||
2.61049792670754218852e-08, /* 0x3e5c07ae9b994efe */
|
||||
7.81439350674466302231e-09, /* 0x3e40c8021d7b1698 */
|
||||
3.60125207123751024094e-08, /* 0x3e635585edb8cb22 */
|
||||
6.15276238179343767917e-08, /* 0x3e70842567b30e96 */
|
||||
9.54387964641184285058e-08, /* 0x3e799e811031472e */
|
||||
3.02789566851502754129e-08, /* 0x3e6041821416bcee */
|
||||
1.16888650949870856331e-07, /* 0x3e7f6086e4dc96f4 */
|
||||
1.07580956468653338863e-08, /* 0x3e471a535c5f1b58 */
|
||||
8.33454265379535427653e-08, /* 0x3e765f743fe63ca1 */
|
||||
1.10790279272629526068e-07, /* 0x3e7dbd733472d014 */
|
||||
1.08394277896366207424e-07, /* 0x3e7d18cc4d8b0d1d */
|
||||
9.22176086126841098800e-08, /* 0x3e78c12553c8fb29 */
|
||||
7.90938592199048786990e-08, /* 0x3e753b49e2e8f991 */
|
||||
8.66445407164293125637e-08, /* 0x3e77422ae148c141 */
|
||||
1.40839973537092438671e-08, /* 0x3e4e3ec269df56a8 */
|
||||
1.19070438507307600689e-07, /* 0x3e7ff6754e7e0ac9 */
|
||||
6.40451663051716197071e-08, /* 0x3e7131267b1b5aad */
|
||||
1.08338682076343674522e-07, /* 0x3e7d14fa403a94bc */
|
||||
3.52999550187922736222e-08, /* 0x3e62f396c089a3d8 */
|
||||
1.05983273930043077202e-07, /* 0x3e7c731d78fa95bb */
|
||||
1.05486124078259553339e-07, /* 0x3e7c50f385177399 */
|
||||
5.82167732281776477773e-08, /* 0x3e6f41409c6f2c20 */
|
||||
1.08696483983403942633e-07, /* 0x3e7d2d90c4c39ec0 */
|
||||
4.47335086122377542835e-08, /* 0x3e680420696f2106 */
|
||||
1.26896287162615723528e-08, /* 0x3e4b40327943a2e8 */
|
||||
4.06534471589151404531e-08, /* 0x3e65d35e02f3d2a2 */
|
||||
3.84504846300557026690e-08, /* 0x3e64a498288117b0 */
|
||||
3.60715006404807269080e-08, /* 0x3e635da119afb324 */
|
||||
6.44725903165522722801e-08, /* 0x3e714e85cdb9a908 */
|
||||
3.63749249976409461305e-08, /* 0x3e638754e5547b9a */
|
||||
1.03901294413833913794e-07, /* 0x3e7be40ae6ce3246 */
|
||||
6.25379756302167880580e-08, /* 0x3e70c993b3bea7e7 */
|
||||
6.63984302368488828029e-08, /* 0x3e71d2dd89ac3359 */
|
||||
3.21844598971548278059e-08, /* 0x3e61476603332c46 */
|
||||
1.16030611712765830905e-07, /* 0x3e7f25901bac55b7 */
|
||||
1.17464622142347730134e-07, /* 0x3e7f881b7c826e28 */
|
||||
7.54604017965808996596e-08, /* 0x3e7441996d698d20 */
|
||||
1.49234929356206556899e-07, /* 0x3e8407ac521ea089 */
|
||||
1.41416924523217430259e-07, /* 0x3e82fb0c6c4b1723 */
|
||||
2.13308065617483489011e-07, /* 0x3e8ca135966a3e18 */
|
||||
5.04230937933302320146e-08, /* 0x3e6b1218e4d646e4 */
|
||||
5.45874922281655519035e-08, /* 0x3e6d4e72a350d288 */
|
||||
1.51849028914786868886e-07, /* 0x3e84617e2f04c329 */
|
||||
3.09004308703769273010e-08, /* 0x3e6096ec41e82650 */
|
||||
9.67574548184738317664e-08, /* 0x3e79f91f25773e6e */
|
||||
4.02508285529322212824e-08, /* 0x3e659c0820f1d674 */
|
||||
3.01222268096861091157e-08, /* 0x3e602bf7a2df1064 */
|
||||
2.36189860670079288680e-07, /* 0x3e8fb36bfc40508f */
|
||||
1.14095158111080887695e-07, /* 0x3e7ea08f3f8dc892 */
|
||||
7.42349089746573467487e-08, /* 0x3e73ed6254656a0e */
|
||||
5.12515583196230380184e-08, /* 0x3e6b83f5e5e69c58 */
|
||||
2.19290391828763918102e-07, /* 0x3e8d6ec2af768592 */
|
||||
3.83263512187553886471e-08, /* 0x3e6493889a226f94 */
|
||||
1.61513486284090523855e-07, /* 0x3e85ad8fa65279ba */
|
||||
5.09996743535589922261e-08, /* 0x3e6b615784d45434 */
|
||||
1.23694037861246766534e-07, /* 0x3e809a184368f145 */
|
||||
8.23367955351123783984e-08, /* 0x3e761a2439b0d91c */
|
||||
1.07591766213053694014e-07, /* 0x3e7ce1a65e39a978 */
|
||||
1.42789947524631815640e-07, /* 0x3e832a39a93b6a66 */
|
||||
1.32347123024711878538e-07, /* 0x3e81c3699af804e7 */
|
||||
2.17626067316598149229e-08, /* 0x3e575e0f4e44ede8 */
|
||||
2.34454866923044288656e-07, /* 0x3e8f77ced1a7a83b */
|
||||
2.82966370261766916053e-09, /* 0x3e284e7f0cb1b500 */
|
||||
2.29300919890907632975e-07, /* 0x3e8ec6b838b02dfe */
|
||||
1.48428270450261284915e-07, /* 0x3e83ebf4dfbeda87 */
|
||||
1.87937408574313982512e-07, /* 0x3e89397aed9cb475 */
|
||||
6.13685946813334055347e-08, /* 0x3e707937bc239c54 */
|
||||
1.98585022733583817493e-07, /* 0x3e8aa754553131b6 */
|
||||
7.68394131623752961662e-08, /* 0x3e74a05d407c45dc */
|
||||
1.28119052312436745644e-07, /* 0x3e8132231a206dd0 */
|
||||
7.02119104719236502733e-08, /* 0x3e72d8ecfdd69c88 */
|
||||
9.87954793820636301943e-08, /* 0x3e7a852c74218606 */
|
||||
1.72176752381034986217e-07, /* 0x3e871bf2baeebb50 */
|
||||
1.12877225146169704119e-08, /* 0x3e483d7db7491820 */
|
||||
5.33549829555851737993e-08, /* 0x3e6ca50d92b6da14 */
|
||||
2.13833275710816521345e-08, /* 0x3e56f5cde8530298 */
|
||||
1.16243518048290556393e-07, /* 0x3e7f343198910740 */
|
||||
6.29926408369055877943e-08, /* 0x3e70e8d241ccd80a */
|
||||
6.45429039328021963791e-08, /* 0x3e71535ac619e6c8 */
|
||||
8.64001922814281933403e-08, /* 0x3e77316041c36cd2 */
|
||||
9.50767572202325800240e-08, /* 0x3e7985a000637d8e */
|
||||
5.80851497508121135975e-08, /* 0x3e6f2f29858c0a68 */
|
||||
1.82350561135024766232e-07, /* 0x3e8879847f96d909 */
|
||||
1.98948680587390608655e-07, /* 0x3e8ab3d319e12e42 */
|
||||
7.83548663450197659846e-08, /* 0x3e75088162dfc4c2 */
|
||||
3.04374234486798594427e-08, /* 0x3e605749a1cd9d8c */
|
||||
2.76135725629797411787e-08, /* 0x3e5da65c6c6b8618 */
|
||||
4.32610105454203065470e-08, /* 0x3e6739bf7df1ad64 */
|
||||
5.17107515324127256994e-08, /* 0x3e6bc31252aa3340 */
|
||||
2.82398327875841444660e-08, /* 0x3e5e528191ad3aa8 */
|
||||
1.87482469524195595399e-07, /* 0x3e8929d93df19f18 */
|
||||
2.97481891662714096139e-08, /* 0x3e5ff11eb693a080 */
|
||||
9.94421570843584316402e-09, /* 0x3e455ae3f145a3a0 */
|
||||
1.07056210730391848428e-07, /* 0x3e7cbcd8c6c0ca82 */
|
||||
6.25589580466881163081e-08, /* 0x3e70cb04d425d304 */
|
||||
9.56641013869464593803e-08, /* 0x3e79adfcab5be678 */
|
||||
1.88056307148355440276e-07, /* 0x3e893d90c5662508 */
|
||||
8.38850689379557880950e-08, /* 0x3e768489bd35ff40 */
|
||||
5.01215865527674122924e-09, /* 0x3e3586ed3da2b7e0 */
|
||||
1.74166095998522089762e-07, /* 0x3e87604d2e850eee */
|
||||
9.96779574395363585849e-08, /* 0x3e7ac1d12bfb53d8 */
|
||||
5.98432026368321460686e-09, /* 0x3e39b3d468274740 */
|
||||
1.18362922366887577169e-07, /* 0x3e7fc5d68d10e53c */
|
||||
1.86086833284154215946e-07, /* 0x3e88f9e51884becb */
|
||||
1.97671457251348941011e-07, /* 0x3e8a87f0869c06d1 */
|
||||
1.42447160717199237159e-07, /* 0x3e831e7279f685fa */
|
||||
1.05504240785546574184e-08, /* 0x3e46a8282f9719b0 */
|
||||
3.13335218371639189324e-08, /* 0x3e60d2724a8a44e0 */
|
||||
1.96518418901914535399e-07, /* 0x3e8a60524b11ad4e */
|
||||
2.17692035039173536059e-08, /* 0x3e575fdf832750f0 */
|
||||
2.15613114426529981675e-07, /* 0x3e8cf06902e4cd36 */
|
||||
5.68271098300441214948e-08, /* 0x3e6e82422d4f6d10 */
|
||||
1.70331455823369124256e-08, /* 0x3e524a091063e6c0 */
|
||||
9.17590028095709583247e-08, /* 0x3e78a1a172dc6f38 */
|
||||
2.77266304112916566247e-07, /* 0x3e929b6619f8a92d */
|
||||
9.37041937614656939690e-08, /* 0x3e79274d9c1b70c8 */
|
||||
1.56116346368316796511e-08, /* 0x3e50c34b1fbb7930 */
|
||||
4.13967433808382727413e-08, /* 0x3e6639866c20eb50 */
|
||||
1.70164749185821616276e-07, /* 0x3e86d6d0f6832e9e */
|
||||
4.01708788545600086008e-07, /* 0x3e9af54def99f25e */
|
||||
2.59663539226050551563e-07, /* 0x3e916cfc52a00262 */
|
||||
2.22007487655027469542e-07, /* 0x3e8dcc1e83569c32 */
|
||||
2.90542250809644081369e-07, /* 0x3e937f7a551ed425 */
|
||||
4.67720537666628903341e-07, /* 0x3e9f6360adc98887 */
|
||||
2.79799803956772554802e-07, /* 0x3e92c6ec8d35a2c1 */
|
||||
2.07344552327432547723e-07, /* 0x3e8bd44df84cb036 */
|
||||
2.54705698692735196368e-07, /* 0x3e9117cf826e310e */
|
||||
4.26848589539548450728e-07, /* 0x3e9ca533f332cfc9 */
|
||||
2.52506723633552216197e-07, /* 0x3e90f208509dbc2e */
|
||||
2.14684129933849704964e-07, /* 0x3e8cd07d93c945de */
|
||||
3.20134822201596505431e-07, /* 0x3e957bdfd67e6d72 */
|
||||
9.93537565749855712134e-08, /* 0x3e7aab89c516c658 */
|
||||
3.70792944827917252327e-08, /* 0x3e63e823b1a1b8a0 */
|
||||
1.41772749369083698972e-07, /* 0x3e8307464a9d6d3c */
|
||||
4.22446601490198804306e-07, /* 0x3e9c5993cd438843 */
|
||||
4.11818433724801511540e-07, /* 0x3e9ba2fca02ab554 */
|
||||
1.19976381502605310519e-07, /* 0x3e801a5b6983a268 */
|
||||
3.43703078571520905265e-08, /* 0x3e6273d1b350efc8 */
|
||||
1.66128705555453270379e-07, /* 0x3e864c238c37b0c6 */
|
||||
5.00499610023283006540e-08, /* 0x3e6aded07370a300 */
|
||||
1.75105139941208062123e-07, /* 0x3e878091197eb47e */
|
||||
7.70807146729030327334e-08, /* 0x3e74b0f245e0dabc */
|
||||
2.45918607526895836121e-07, /* 0x3e9080d9794e2eaf */
|
||||
2.18359020958626199345e-07, /* 0x3e8d4ec242b60c76 */
|
||||
8.44342887976445333569e-09, /* 0x3e4221d2f940caa0 */
|
||||
1.07506148687888629299e-07, /* 0x3e7cdbc42b2bba5c */
|
||||
5.36544954316820904572e-08, /* 0x3e6cce37bb440840 */
|
||||
3.39109101518396596341e-07, /* 0x3e96c1d999cf1dd0 */
|
||||
2.60098720293920613340e-08, /* 0x3e5bed8a07eb0870 */
|
||||
8.42678991664621455827e-08, /* 0x3e769ed88f490e3c */
|
||||
5.36972237470183633197e-08, /* 0x3e6cd41719b73ef0 */
|
||||
4.28192558171921681288e-07, /* 0x3e9cbc4ac95b41b7 */
|
||||
2.71535491483955143294e-07, /* 0x3e9238f1b890f5d7 */
|
||||
7.84094998145075780203e-08, /* 0x3e750c4282259cc4 */
|
||||
3.43880599134117431863e-07, /* 0x3e9713d2de87b3e2 */
|
||||
1.32878065060366481043e-07, /* 0x3e81d5a7d2255276 */
|
||||
4.18046802627967629428e-07, /* 0x3e9c0dfd48227ac1 */
|
||||
2.65042411765766019424e-07, /* 0x3e91c964dab76753 */
|
||||
1.70383695347518643694e-07, /* 0x3e86de56d5704496 */
|
||||
1.54096497259613515678e-07, /* 0x3e84aeb71fd19968 */
|
||||
2.36543402412459813461e-07, /* 0x3e8fbf91c57b1918 */
|
||||
4.38416350106876736790e-07, /* 0x3e9d6bef7fbe5d9a */
|
||||
3.03892161339927775731e-07, /* 0x3e9464d3dc249066 */
|
||||
3.31136771605664899240e-07, /* 0x3e9638e2ec4d9073 */
|
||||
6.49494294526590682218e-08, /* 0x3e716f4a7247ea7c */
|
||||
4.10423429887181345747e-09, /* 0x3e31a0a740f1d440 */
|
||||
1.70831640869113847224e-07, /* 0x3e86edbb0114a33c */
|
||||
1.10811512657909180966e-07, /* 0x3e7dbee8bf1d513c */
|
||||
3.23677724749783611964e-07, /* 0x3e95b8bdb0248f73 */
|
||||
3.55662734259192678528e-07, /* 0x3e97de3d3f5eac64 */
|
||||
2.30102333489738219140e-07, /* 0x3e8ee24187ae448a */
|
||||
4.47429004000738629714e-07, /* 0x3e9e06c591ec5192 */
|
||||
7.78167135617329598659e-08, /* 0x3e74e3861a332738 */
|
||||
9.90345291908535415737e-08, /* 0x3e7a9599dcc2bfe4 */
|
||||
5.85800913143113728314e-08, /* 0x3e6f732fbad43468 */
|
||||
4.57859062410871843857e-07, /* 0x3e9eb9f573b727d9 */
|
||||
3.67993069723390929794e-07, /* 0x3e98b212a2eb9897 */
|
||||
2.90836464322977276043e-07, /* 0x3e9384884c167215 */
|
||||
2.51621574250131388318e-07, /* 0x3e90e2d363020051 */
|
||||
2.75789824740652815545e-07, /* 0x3e92820879fbd022 */
|
||||
3.88985776250314403593e-07, /* 0x3e9a1ab9893e4b30 */
|
||||
1.40214080183768019611e-07, /* 0x3e82d1b817a24478 */
|
||||
3.23451432223550478373e-08, /* 0x3e615d7b8ded4878 */
|
||||
9.15979180730608444470e-08, /* 0x3e78968f9db3a5e4 */
|
||||
3.44371402498640470421e-07, /* 0x3e971c4171fe135f */
|
||||
3.40401897215059498077e-07, /* 0x3e96d80f605d0d8c */
|
||||
1.06431813453707950243e-07, /* 0x3e7c91f043691590 */
|
||||
1.46204238932338846248e-07, /* 0x3e839f8a15fce2b2 */
|
||||
9.94610376972039046878e-09, /* 0x3e455beda9d94b80 */
|
||||
2.01711528092681771039e-07, /* 0x3e8b12c15d60949a */
|
||||
2.72027977986191568296e-07, /* 0x3e924167b312bfe3 */
|
||||
2.48402602511693757964e-07, /* 0x3e90ab8633070277 */
|
||||
1.58480011219249621715e-07, /* 0x3e854554ebbc80ee */
|
||||
3.00372828113368713281e-08, /* 0x3e60204aef5a4bb8 */
|
||||
3.67816204583541976394e-07, /* 0x3e98af08c679cf2c */
|
||||
2.46169793032343824291e-07, /* 0x3e90852a330ae6c8 */
|
||||
1.70080468270204253247e-07, /* 0x3e86d3eb9ec32916 */
|
||||
1.67806717763872914315e-07, /* 0x3e8685cb7fcbbafe */
|
||||
2.67715622006907942620e-07, /* 0x3e91f751c1e0bd95 */
|
||||
2.14411342550299170574e-08, /* 0x3e5705b1b0f72560 */
|
||||
4.11228221283669073277e-07, /* 0x3e9b98d8d808ca92 */
|
||||
3.52311752396749662260e-08, /* 0x3e62ea22c75cc980 */
|
||||
3.52718000397367821054e-07, /* 0x3e97aba62bca0350 */
|
||||
4.38857387992911129814e-07, /* 0x3e9d73833442278c */
|
||||
3.22574606753482540743e-07, /* 0x3e95a5ca1fb18bf9 */
|
||||
3.28730371182804296828e-08, /* 0x3e61a6092b6ecf28 */
|
||||
7.56672470607639279700e-08, /* 0x3e744fd049aac104 */
|
||||
3.26750155316369681821e-09, /* 0x3e2c114fd8df5180 */
|
||||
3.21724445362095284743e-07, /* 0x3e95972f130feae5 */
|
||||
1.06639427371776571151e-07, /* 0x3e7ca034a55fe198 */
|
||||
3.41020788139524715063e-07, /* 0x3e96e2b149990227 */
|
||||
1.00582838631232552824e-07, /* 0x3e7b00000294592c */
|
||||
3.68439433859276640065e-07, /* 0x3e98b9bdc442620e */
|
||||
2.20403078342388012027e-07, /* 0x3e8d94fdfabf3e4e */
|
||||
1.62841467098298142534e-07, /* 0x3e85db30b145ad9a */
|
||||
2.25325348296680733838e-07, /* 0x3e8e3e1eb95022b0 */
|
||||
4.37462238226421614339e-07, /* 0x3e9d5b8b45442bd6 */
|
||||
3.52055880555040706500e-07, /* 0x3e97a046231ecd2e */
|
||||
4.75614398494781776825e-07, /* 0x3e9feafe3ef55232 */
|
||||
3.60998399033215317516e-07, /* 0x3e9839e7bfd78267 */
|
||||
3.79292434611513945954e-08, /* 0x3e645cf49d6fa900 */
|
||||
1.29859015528549300061e-08, /* 0x3e4be3132b27f380 */
|
||||
3.15927546985474913188e-07, /* 0x3e9533980bb84f9f */
|
||||
2.28533679887379668031e-08, /* 0x3e5889e2ce3ba390 */
|
||||
1.17222541823553133877e-07, /* 0x3e7f7778c3ad0cc8 */
|
||||
1.51991208405464415857e-07, /* 0x3e846660cec4eba2 */
|
||||
1.56958239325240655564e-07}; /* 0x3e85110b4611a626 */
|
||||
|
||||
/* Some constants and split constants. */
|
||||
|
||||
static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */
|
||||
piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */
|
||||
piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
|
||||
three_piby4 = 2.3561944901923449e+00, /* 0x4002d97c7f3321d2 */
|
||||
pi_head = 3.1415926218032836e+00, /* 0x400921fb50000000 */
|
||||
pi_tail = 3.1786509547056392e-08, /* 0x3e6110b4611a6263 */
|
||||
piby2_head = 1.5707963267948965e+00, /* 0x3ff921fb54442d18 */
|
||||
piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */
|
||||
|
||||
double u, v, vbyu, q1, q2, s, u1, vu1, u2, vu2, uu, c, r;
|
||||
unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf;
|
||||
int m, xexp, yexp, diffexp;
|
||||
|
||||
/* Find properties of arguments x and y. */
|
||||
|
||||
unsigned long ux, ui, aux, xneg, uy, auy, yneg;
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
GET_BITS_DP64(y, uy);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
auy = uy & ~SIGNBIT_DP64;
|
||||
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
xneg = ux & SIGNBIT_DP64;
|
||||
yneg = uy & SIGNBIT_DP64;
|
||||
xzero = (aux == 0);
|
||||
yzero = (auy == 0);
|
||||
xnan = (aux > PINFBITPATT_DP64);
|
||||
ynan = (auy > PINFBITPATT_DP64);
|
||||
xinf = (aux == PINFBITPATT_DP64);
|
||||
yinf = (auy == PINFBITPATT_DP64);
|
||||
|
||||
diffexp = yexp - xexp;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
if (xnan)
|
||||
return _handle_error("atan2", OP_ATAN2, ux|0x0008000000000000, _DOMAIN, 0,
|
||||
EDOM, x, y, 2);
|
||||
else if (ynan)
|
||||
return _handle_error("atan2", OP_ATAN2, uy|0x0008000000000000, _DOMAIN, 0,
|
||||
EDOM, x, y, 2);
|
||||
else if (yzero)
|
||||
{ /* Zero y gives +-0 for positive x
|
||||
and +-pi for negative x */
|
||||
if (xneg)
|
||||
{
|
||||
if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
|
||||
else return val_with_flags(pi,AMD_F_INEXACT);
|
||||
}
|
||||
else return y;
|
||||
}
|
||||
else if (xzero)
|
||||
{ /* Zero x gives +- pi/2
|
||||
depending on sign of y */
|
||||
if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
|
||||
else val_with_flags(piby2,AMD_F_INEXACT);
|
||||
}
|
||||
|
||||
/* Scale up both x and y if they are both below 1/4.
|
||||
This avoids any possible later denormalised arithmetic. */
|
||||
|
||||
if ((xexp < 1021 && yexp < 1021))
|
||||
{
|
||||
scaleUpDouble1024(ux, &ux);
|
||||
scaleUpDouble1024(uy, &uy);
|
||||
PUT_BITS_DP64(ux, x);
|
||||
PUT_BITS_DP64(uy, y);
|
||||
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
diffexp = yexp - xexp;
|
||||
}
|
||||
|
||||
if (diffexp > 56)
|
||||
{ /* abs(y)/abs(x) > 2^56 => arctan(x/y)
|
||||
is insignificant compared to piby2 */
|
||||
if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
|
||||
else return val_with_flags(piby2,AMD_F_INEXACT);
|
||||
}
|
||||
else if (diffexp < -28 && (!xneg))
|
||||
{ /* x positive and dominant over y by a factor of 2^28.
|
||||
In this case atan(y/x) is y/x to machine accuracy. */
|
||||
|
||||
if (diffexp < -1074) /* Result underflows */
|
||||
{
|
||||
if (yneg)
|
||||
return val_with_flags(-0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
|
||||
else
|
||||
return val_with_flags(0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (diffexp < -1022)
|
||||
{
|
||||
/* Result will likely be denormalized */
|
||||
y = scaleDouble_1(y, 100);
|
||||
y /= x;
|
||||
/* Now y is 2^100 times the true result. Scale it back down. */
|
||||
GET_BITS_DP64(y, uy);
|
||||
scaleDownDouble(uy, 100, &uy);
|
||||
PUT_BITS_DP64(uy, y);
|
||||
if ((uy & EXPBITS_DP64) == 0)
|
||||
return val_with_flags(y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
|
||||
else
|
||||
return y;
|
||||
}
|
||||
else
|
||||
return y / x;
|
||||
}
|
||||
}
|
||||
else if (diffexp < -56 && xneg)
|
||||
{ /* abs(x)/abs(y) > 2^56 and x < 0 => arctan(y/x)
|
||||
is insignificant compared to pi */
|
||||
if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
|
||||
else return val_with_flags(pi,AMD_F_INEXACT);
|
||||
}
|
||||
else if (yinf && xinf)
|
||||
{ /* If abs(x) and abs(y) are both infinity
|
||||
return +-pi/4 or +- 3pi/4 according to
|
||||
signs. */
|
||||
if (xneg)
|
||||
{
|
||||
if (yneg) return val_with_flags(-three_piby4,AMD_F_INEXACT);
|
||||
else return val_with_flags(three_piby4,AMD_F_INEXACT);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (yneg) return val_with_flags(-piby4,AMD_F_INEXACT);
|
||||
else return val_with_flags(piby4,AMD_F_INEXACT);
|
||||
}
|
||||
}
|
||||
|
||||
/* General case: take absolute values of arguments */
|
||||
|
||||
u = x; v = y;
|
||||
if (xneg) u = -x;
|
||||
if (yneg) v = -y;
|
||||
|
||||
/* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
|
||||
|
||||
swap_vu = (u < v);
|
||||
if (swap_vu) { uu = u; u = v; v = uu; }
|
||||
vbyu = v/u;
|
||||
|
||||
if (vbyu > 0.0625)
|
||||
{ /* General values of v/u. Use a look-up
|
||||
table and series expansion. */
|
||||
|
||||
index = (int)(256*vbyu + 0.5);
|
||||
q1 = atan_jby256_lead[index-16];
|
||||
q2 = atan_jby256_tail[index-16];
|
||||
c = index*1./256;
|
||||
GET_BITS_DP64(u, ui);
|
||||
m = (int)((ui & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
|
||||
u = scaleDouble_2(u,-m);
|
||||
v = scaleDouble_2(v,-m);
|
||||
GET_BITS_DP64(u, ui);
|
||||
PUT_BITS_DP64(0xfffffffff8000000 & ui, u1); /* 26 leading bits of u */
|
||||
u2 = u - u1;
|
||||
|
||||
r = ((v-c*u1)-c*u2)/(u+c*v);
|
||||
|
||||
/* Polynomial approximation to atan(r) */
|
||||
|
||||
s = r*r;
|
||||
q2 = q2 + r - r*(s * (0.33333333333224095522 - s*(0.19999918038989143496)));
|
||||
}
|
||||
else if (vbyu < 1.e-8)
|
||||
{ /* v/u is small enough that atan(v/u) = v/u */
|
||||
q1 = 0.0;
|
||||
q2 = vbyu;
|
||||
}
|
||||
else /* vbyu <= 0.0625 */
|
||||
{
|
||||
/* Small values of v/u. Use a series expansion
|
||||
computed carefully to minimise cancellation */
|
||||
|
||||
GET_BITS_DP64(u, ui);
|
||||
PUT_BITS_DP64(0xffffffff00000000 & ui, u1);
|
||||
GET_BITS_DP64(vbyu, ui);
|
||||
PUT_BITS_DP64(0xffffffff00000000 & ui, vu1);
|
||||
u2 = u - u1;
|
||||
vu2 = vbyu - vu1;
|
||||
|
||||
q1 = 0.0;
|
||||
s = vbyu*vbyu;
|
||||
q2 = vbyu +
|
||||
((((v - u1*vu1) - u2*vu1) - u*vu2)/u -
|
||||
(vbyu*s*(0.33333333333333170500 -
|
||||
s*(0.19999999999393223405 -
|
||||
s*(0.14285713561807169030 -
|
||||
s*(0.11110736283514525407 -
|
||||
s*(0.90029810285449784439E-01)))))));
|
||||
}
|
||||
|
||||
/* Tidy-up according to which quadrant the arguments lie in */
|
||||
|
||||
if (swap_vu) {q1 = piby2_head - q1; q2 = piby2_tail - q2;}
|
||||
if (xneg) {q1 = pi_head - q1; q2 = pi_tail - q2;}
|
||||
q1 = q1 + q2;
|
||||
|
||||
if (yneg) q1 = - q1;
|
||||
|
||||
return q1;
|
||||
}
|
469
sdk/lib/crt/math/libm_sse2/atan2f.c
Normal file
469
sdk/lib/crt/math/libm_sse2/atan2f.c
Normal file
|
@ -0,0 +1,469 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_SCALEDOWNDOUBLE
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_SCALEDOWNDOUBLE
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(atan2f)
|
||||
|
||||
float FN_PROTOTYPE(atan2f)(float fy, float fx)
|
||||
{
|
||||
/* Array atan_jby256 contains precomputed values of atan(j/256),
|
||||
for j = 16, 17, ..., 256. */
|
||||
|
||||
static const double atan_jby256[ 241] = {
|
||||
6.24188099959573430842e-02, /* 0x3faff55bb72cfde9 */
|
||||
6.63088949198234745008e-02, /* 0x3fb0f99ea71d52a6 */
|
||||
7.01969710718705064423e-02, /* 0x3fb1f86dbf082d58 */
|
||||
7.40829225490337306415e-02, /* 0x3fb2f719318a4a9a */
|
||||
7.79666338315423007588e-02, /* 0x3fb3f59f0e7c559d */
|
||||
8.18479898030765457007e-02, /* 0x3fb4f3fd677292fb */
|
||||
8.57268757707448092464e-02, /* 0x3fb5f2324fd2d7b2 */
|
||||
8.96031774848717321724e-02, /* 0x3fb6f03bdcea4b0c */
|
||||
9.34767811585894559112e-02, /* 0x3fb7ee182602f10e */
|
||||
9.73475734872236708739e-02, /* 0x3fb8ebc54478fb28 */
|
||||
1.01215441667466668485e-01, /* 0x3fb9e94153cfdcf1 */
|
||||
1.05080273416329528224e-01, /* 0x3fbae68a71c722b8 */
|
||||
1.08941956989865793015e-01, /* 0x3fbbe39ebe6f07c3 */
|
||||
1.12800381201659388752e-01, /* 0x3fbce07c5c3cca32 */
|
||||
1.16655435441069349478e-01, /* 0x3fbddd21701eba6e */
|
||||
1.20507009691224548087e-01, /* 0x3fbed98c2190043a */
|
||||
1.24354994546761424279e-01, /* 0x3fbfd5ba9aac2f6d */
|
||||
1.28199281231298117811e-01, /* 0x3fc068d584212b3d */
|
||||
1.32039761614638734288e-01, /* 0x3fc0e6adccf40881 */
|
||||
1.35876328229701304195e-01, /* 0x3fc1646541060850 */
|
||||
1.39708874289163620386e-01, /* 0x3fc1e1fafb043726 */
|
||||
1.43537293701821222491e-01, /* 0x3fc25f6e171a535c */
|
||||
1.47361481088651630200e-01, /* 0x3fc2dcbdb2fba1ff */
|
||||
1.51181331798580037562e-01, /* 0x3fc359e8edeb99a3 */
|
||||
1.54996741923940972718e-01, /* 0x3fc3d6eee8c6626c */
|
||||
1.58807608315631065832e-01, /* 0x3fc453cec6092a9e */
|
||||
1.62613828597948567589e-01, /* 0x3fc4d087a9da4f17 */
|
||||
1.66415301183114927586e-01, /* 0x3fc54d18ba11570a */
|
||||
1.70211925285474380276e-01, /* 0x3fc5c9811e3ec269 */
|
||||
1.74003600935367680469e-01, /* 0x3fc645bfffb3aa73 */
|
||||
1.77790228992676047071e-01, /* 0x3fc6c1d4898933d8 */
|
||||
1.81571711160032150945e-01, /* 0x3fc73dbde8a7d201 */
|
||||
1.85347949995694760705e-01, /* 0x3fc7b97b4bce5b02 */
|
||||
1.89118848926083965578e-01, /* 0x3fc8350be398ebc7 */
|
||||
1.92884312257974643856e-01, /* 0x3fc8b06ee2879c28 */
|
||||
1.96644245190344985064e-01, /* 0x3fc92ba37d050271 */
|
||||
2.00398553825878511514e-01, /* 0x3fc9a6a8e96c8626 */
|
||||
2.04147145182116990236e-01, /* 0x3fca217e601081a5 */
|
||||
2.07889927202262986272e-01, /* 0x3fca9c231b403279 */
|
||||
2.11626808765629753628e-01, /* 0x3fcb1696574d780b */
|
||||
2.15357699697738047551e-01, /* 0x3fcb90d7529260a2 */
|
||||
2.19082510780057748701e-01, /* 0x3fcc0ae54d768466 */
|
||||
2.22801153759394493514e-01, /* 0x3fcc84bf8a742e6d */
|
||||
2.26513541356919617664e-01, /* 0x3fccfe654e1d5395 */
|
||||
2.30219587276843717927e-01, /* 0x3fcd77d5df205736 */
|
||||
2.33919206214733416127e-01, /* 0x3fcdf110864c9d9d */
|
||||
2.37612313865471241892e-01, /* 0x3fce6a148e96ec4d */
|
||||
2.41298826930858800743e-01, /* 0x3fcee2e1451d980c */
|
||||
2.44978663126864143473e-01, /* 0x3fcf5b75f92c80dd */
|
||||
2.48651741190513253521e-01, /* 0x3fcfd3d1fc40dbe4 */
|
||||
2.52317980886427151166e-01, /* 0x3fd025fa510665b5 */
|
||||
2.55977303013005474952e-01, /* 0x3fd061eea03d6290 */
|
||||
2.59629629408257511791e-01, /* 0x3fd09dc597d86362 */
|
||||
2.63274882955282396590e-01, /* 0x3fd0d97ee509acb3 */
|
||||
2.66912987587400396539e-01, /* 0x3fd1151a362431c9 */
|
||||
2.70543868292936529052e-01, /* 0x3fd150973a9ce546 */
|
||||
2.74167451119658789338e-01, /* 0x3fd18bf5a30bf178 */
|
||||
2.77783663178873208022e-01, /* 0x3fd1c735212dd883 */
|
||||
2.81392432649178403370e-01, /* 0x3fd2025567e47c95 */
|
||||
2.84993688779881237938e-01, /* 0x3fd23d562b381041 */
|
||||
2.88587361894077354396e-01, /* 0x3fd278372057ef45 */
|
||||
2.92173383391398755471e-01, /* 0x3fd2b2f7fd9b5fe2 */
|
||||
2.95751685750431536626e-01, /* 0x3fd2ed987a823cfe */
|
||||
2.99322202530807379706e-01, /* 0x3fd328184fb58951 */
|
||||
3.02884868374971361060e-01, /* 0x3fd362773707ebcb */
|
||||
3.06439619009630070945e-01, /* 0x3fd39cb4eb76157b */
|
||||
3.09986391246883430384e-01, /* 0x3fd3d6d129271134 */
|
||||
3.13525122985043869228e-01, /* 0x3fd410cbad6c7d32 */
|
||||
3.17055753209146973237e-01, /* 0x3fd44aa436c2af09 */
|
||||
3.20578221991156986359e-01, /* 0x3fd4845a84d0c21b */
|
||||
3.24092470489871664618e-01, /* 0x3fd4bdee586890e6 */
|
||||
3.27598440950530811477e-01, /* 0x3fd4f75f73869978 */
|
||||
3.31096076704132047386e-01, /* 0x3fd530ad9951cd49 */
|
||||
3.34585322166458920545e-01, /* 0x3fd569d88e1b4cd7 */
|
||||
3.38066122836825466713e-01, /* 0x3fd5a2e0175e0f4e */
|
||||
3.41538425296541714449e-01, /* 0x3fd5dbc3fbbe768d */
|
||||
3.45002177207105076295e-01, /* 0x3fd614840309cfe1 */
|
||||
3.48457327308122011278e-01, /* 0x3fd64d1ff635c1c5 */
|
||||
3.51903825414964732676e-01, /* 0x3fd685979f5fa6fd */
|
||||
3.55341622416168290144e-01, /* 0x3fd6bdeac9cbd76c */
|
||||
3.58770670270572189509e-01, /* 0x3fd6f61941e4def0 */
|
||||
3.62190922004212156882e-01, /* 0x3fd72e22d53aa2a9 */
|
||||
3.65602331706966821034e-01, /* 0x3fd7660752817501 */
|
||||
3.69004854528964421068e-01, /* 0x3fd79dc6899118d1 */
|
||||
3.72398446676754202311e-01, /* 0x3fd7d5604b63b3f7 */
|
||||
3.75783065409248884237e-01, /* 0x3fd80cd46a14b1d0 */
|
||||
3.79158669033441808605e-01, /* 0x3fd84422b8df95d7 */
|
||||
3.82525216899905096124e-01, /* 0x3fd87b4b0c1ebedb */
|
||||
3.85882669398073752109e-01, /* 0x3fd8b24d394a1b25 */
|
||||
3.89230987951320717144e-01, /* 0x3fd8e92916f5cde8 */
|
||||
3.92570135011828580396e-01, /* 0x3fd91fde7cd0c662 */
|
||||
3.95900074055262896078e-01, /* 0x3fd9566d43a34907 */
|
||||
3.99220769575252543149e-01, /* 0x3fd98cd5454d6b18 */
|
||||
4.02532187077682512832e-01, /* 0x3fd9c3165cc58107 */
|
||||
4.05834293074804064450e-01, /* 0x3fd9f93066168001 */
|
||||
4.09127055079168300278e-01, /* 0x3fda2f233e5e530b */
|
||||
4.12410441597387267265e-01, /* 0x3fda64eec3cc23fc */
|
||||
4.15684422123729413467e-01, /* 0x3fda9a92d59e98cf */
|
||||
4.18948967133552840902e-01, /* 0x3fdad00f5422058b */
|
||||
4.22204048076583571270e-01, /* 0x3fdb056420ae9343 */
|
||||
4.25449637370042266227e-01, /* 0x3fdb3a911da65c6c */
|
||||
4.28685708391625730496e-01, /* 0x3fdb6f962e737efb */
|
||||
4.31912235472348193799e-01, /* 0x3fdba473378624a5 */
|
||||
4.35129193889246812521e-01, /* 0x3fdbd9281e528191 */
|
||||
4.38336559857957774877e-01, /* 0x3fdc0db4c94ec9ef */
|
||||
4.41534310525166673322e-01, /* 0x3fdc42191ff11eb6 */
|
||||
4.44722423960939305942e-01, /* 0x3fdc76550aad71f8 */
|
||||
4.47900879150937292206e-01, /* 0x3fdcaa6872f3631b */
|
||||
4.51069655988523443568e-01, /* 0x3fdcde53432c1350 */
|
||||
4.54228735266762495559e-01, /* 0x3fdd121566b7f2ad */
|
||||
4.57378098670320809571e-01, /* 0x3fdd45aec9ec862b */
|
||||
4.60517728767271039558e-01, /* 0x3fdd791f5a1226f4 */
|
||||
4.63647609000806093515e-01, /* 0x3fddac670561bb4f */
|
||||
4.66767723680866497560e-01, /* 0x3fdddf85bb026974 */
|
||||
4.69878057975686880265e-01, /* 0x3fde127b6b0744af */
|
||||
4.72978597903265574054e-01, /* 0x3fde4548066cf51a */
|
||||
4.76069330322761219421e-01, /* 0x3fde77eb7f175a34 */
|
||||
4.79150242925822533735e-01, /* 0x3fdeaa65c7cf28c4 */
|
||||
4.82221324227853687105e-01, /* 0x3fdedcb6d43f8434 */
|
||||
4.85282563559221225002e-01, /* 0x3fdf0ede98f393cf */
|
||||
4.88333951056405479729e-01, /* 0x3fdf40dd0b541417 */
|
||||
4.91375477653101910835e-01, /* 0x3fdf72b221a4e495 */
|
||||
4.94407135071275316562e-01, /* 0x3fdfa45dd3029258 */
|
||||
4.97428915812172245392e-01, /* 0x3fdfd5e0175fdf83 */
|
||||
5.00440813147294050189e-01, /* 0x3fe0039c73c1a40b */
|
||||
5.03442821109336358099e-01, /* 0x3fe01c341e82422d */
|
||||
5.06434934483096732549e-01, /* 0x3fe034b709250488 */
|
||||
5.09417148796356245022e-01, /* 0x3fe04d25314342e5 */
|
||||
5.12389460310737621107e-01, /* 0x3fe0657e94db30cf */
|
||||
5.15351866012543347040e-01, /* 0x3fe07dc3324e9b38 */
|
||||
5.18304363603577900044e-01, /* 0x3fe095f30861a58f */
|
||||
5.21246951491958210312e-01, /* 0x3fe0ae0e1639866c */
|
||||
5.24179628782913242802e-01, /* 0x3fe0c6145b5b43da */
|
||||
5.27102395269579471204e-01, /* 0x3fe0de05d7aa6f7c */
|
||||
5.30015251423793132268e-01, /* 0x3fe0f5e28b67e295 */
|
||||
5.32918198386882147055e-01, /* 0x3fe10daa77307a0d */
|
||||
5.35811237960463593311e-01, /* 0x3fe1255d9bfbd2a8 */
|
||||
5.38694372597246617929e-01, /* 0x3fe13cfbfb1b056e */
|
||||
5.41567605391844897333e-01, /* 0x3fe1548596376469 */
|
||||
5.44430940071603086672e-01, /* 0x3fe16bfa6f5137e1 */
|
||||
5.47284380987436924748e-01, /* 0x3fe1835a88be7c13 */
|
||||
5.50127933104692989907e-01, /* 0x3fe19aa5e5299f99 */
|
||||
5.52961601994028217888e-01, /* 0x3fe1b1dc87904284 */
|
||||
5.55785393822313511514e-01, /* 0x3fe1c8fe7341f64f */
|
||||
5.58599315343562330405e-01, /* 0x3fe1e00babdefeb3 */
|
||||
5.61403373889889367732e-01, /* 0x3fe1f7043557138a */
|
||||
5.64197577362497537656e-01, /* 0x3fe20de813e823b1 */
|
||||
5.66981934222700489912e-01, /* 0x3fe224b74c1d192a */
|
||||
5.69756453482978431069e-01, /* 0x3fe23b71e2cc9e6a */
|
||||
5.72521144698072359525e-01, /* 0x3fe25217dd17e501 */
|
||||
5.75276017956117824426e-01, /* 0x3fe268a940696da6 */
|
||||
5.78021083869819540801e-01, /* 0x3fe27f261273d1b3 */
|
||||
5.80756353567670302596e-01, /* 0x3fe2958e59308e30 */
|
||||
5.83481838685214859730e-01, /* 0x3fe2abe21aded073 */
|
||||
5.86197551356360535557e-01, /* 0x3fe2c2215e024465 */
|
||||
5.88903504204738026395e-01, /* 0x3fe2d84c2961e48b */
|
||||
5.91599710335111383941e-01, /* 0x3fe2ee628406cbca */
|
||||
5.94286183324841177367e-01, /* 0x3fe30464753b090a */
|
||||
5.96962937215401501234e-01, /* 0x3fe31a52048874be */
|
||||
5.99629986503951384336e-01, /* 0x3fe3302b39b78856 */
|
||||
6.02287346134964152178e-01, /* 0x3fe345f01cce37bb */
|
||||
6.04935031491913965951e-01, /* 0x3fe35ba0b60eccce */
|
||||
6.07573058389022313541e-01, /* 0x3fe3713d0df6c503 */
|
||||
6.10201443063065118722e-01, /* 0x3fe386c52d3db11e */
|
||||
6.12820202165241245673e-01, /* 0x3fe39c391cd41719 */
|
||||
6.15429352753104952356e-01, /* 0x3fe3b198e5e2564a */
|
||||
6.18028912282561737612e-01, /* 0x3fe3c6e491c78dc4 */
|
||||
6.20618898599929469384e-01, /* 0x3fe3dc1c2a188504 */
|
||||
6.23199329934065904268e-01, /* 0x3fe3f13fb89e96f4 */
|
||||
6.25770224888563042498e-01, /* 0x3fe4064f47569f48 */
|
||||
6.28331602434009650615e-01, /* 0x3fe41b4ae06fea41 */
|
||||
6.30883481900321840818e-01, /* 0x3fe430328e4b26d5 */
|
||||
6.33425882969144482537e-01, /* 0x3fe445065b795b55 */
|
||||
6.35958825666321447834e-01, /* 0x3fe459c652badc7f */
|
||||
6.38482330354437466191e-01, /* 0x3fe46e727efe4715 */
|
||||
6.40996417725432032775e-01, /* 0x3fe4830aeb5f7bfd */
|
||||
6.43501108793284370968e-01, /* 0x3fe4978fa3269ee1 */
|
||||
6.45996424886771558604e-01, /* 0x3fe4ac00b1c71762 */
|
||||
6.48482387642300484032e-01, /* 0x3fe4c05e22de94e4 */
|
||||
6.50959018996812410762e-01, /* 0x3fe4d4a8023414e8 */
|
||||
6.53426341180761927063e-01, /* 0x3fe4e8de5bb6ec04 */
|
||||
6.55884376711170835605e-01, /* 0x3fe4fd013b7dd17e */
|
||||
6.58333148384755983962e-01, /* 0x3fe51110adc5ed81 */
|
||||
6.60772679271132590273e-01, /* 0x3fe5250cbef1e9fa */
|
||||
6.63202992706093175102e-01, /* 0x3fe538f57b89061e */
|
||||
6.65624112284960989250e-01, /* 0x3fe54ccaf0362c8f */
|
||||
6.68036061856020157990e-01, /* 0x3fe5608d29c70c34 */
|
||||
6.70438865514021320458e-01, /* 0x3fe5743c352b33b9 */
|
||||
6.72832547593763097282e-01, /* 0x3fe587d81f732fba */
|
||||
6.75217132663749830535e-01, /* 0x3fe59b60f5cfab9d */
|
||||
6.77592645519925151909e-01, /* 0x3fe5aed6c5909517 */
|
||||
6.79959111179481823228e-01, /* 0x3fe5c2399c244260 */
|
||||
6.82316554874748071313e-01, /* 0x3fe5d58987169b18 */
|
||||
6.84665002047148862907e-01, /* 0x3fe5e8c6941043cf */
|
||||
6.87004478341244895212e-01, /* 0x3fe5fbf0d0d5cc49 */
|
||||
6.89335009598845749323e-01, /* 0x3fe60f084b46e05e */
|
||||
6.91656621853199760075e-01, /* 0x3fe6220d115d7b8d */
|
||||
6.93969341323259825138e-01, /* 0x3fe634ff312d1f3b */
|
||||
6.96273194408023488045e-01, /* 0x3fe647deb8e20b8f */
|
||||
6.98568207680949848637e-01, /* 0x3fe65aabb6c07b02 */
|
||||
7.00854407884450081312e-01, /* 0x3fe66d663923e086 */
|
||||
7.03131821924453670469e-01, /* 0x3fe6800e4e7e2857 */
|
||||
7.05400476865049030906e-01, /* 0x3fe692a40556fb6a */
|
||||
7.07660399923197958039e-01, /* 0x3fe6a5276c4b0575 */
|
||||
7.09911618463524796141e-01, /* 0x3fe6b798920b3d98 */
|
||||
7.12154159993178659249e-01, /* 0x3fe6c9f7855c3198 */
|
||||
7.14388052156768926793e-01, /* 0x3fe6dc44551553ae */
|
||||
7.16613322731374569052e-01, /* 0x3fe6ee7f10204aef */
|
||||
7.18829999621624415873e-01, /* 0x3fe700a7c5784633 */
|
||||
7.21038110854851588272e-01, /* 0x3fe712be84295198 */
|
||||
7.23237684576317874097e-01, /* 0x3fe724c35b4fae7b */
|
||||
7.25428749044510712274e-01, /* 0x3fe736b65a172dff */
|
||||
7.27611332626510676214e-01, /* 0x3fe748978fba8e0f */
|
||||
7.29785463793429123314e-01, /* 0x3fe75a670b82d8d8 */
|
||||
7.31951171115916565668e-01, /* 0x3fe76c24dcc6c6c0 */
|
||||
7.34108483259739652560e-01, /* 0x3fe77dd112ea22c7 */
|
||||
7.36257428981428097003e-01, /* 0x3fe78f6bbd5d315e */
|
||||
7.38398037123989547936e-01, /* 0x3fe7a0f4eb9c19a2 */
|
||||
7.40530336612692630105e-01, /* 0x3fe7b26cad2e50fd */
|
||||
7.42654356450917929600e-01, /* 0x3fe7c3d311a6092b */
|
||||
7.44770125716075148681e-01, /* 0x3fe7d528289fa093 */
|
||||
7.46877673555587429099e-01, /* 0x3fe7e66c01c114fd */
|
||||
7.48977029182941400620e-01, /* 0x3fe7f79eacb97898 */
|
||||
7.51068221873802288613e-01, /* 0x3fe808c03940694a */
|
||||
7.53151280962194302759e-01, /* 0x3fe819d0b7158a4c */
|
||||
7.55226235836744863583e-01, /* 0x3fe82ad036000005 */
|
||||
7.57293115936992444759e-01, /* 0x3fe83bbec5cdee22 */
|
||||
7.59351950749757920178e-01, /* 0x3fe84c9c7653f7ea */
|
||||
7.61402769805578416573e-01, /* 0x3fe85d69576cc2c5 */
|
||||
7.63445602675201784315e-01, /* 0x3fe86e2578f87ae5 */
|
||||
7.65480478966144461950e-01, /* 0x3fe87ed0eadc5a2a */
|
||||
7.67507428319308182552e-01, /* 0x3fe88f6bbd023118 */
|
||||
7.69526480405658186434e-01, /* 0x3fe89ff5ff57f1f7 */
|
||||
7.71537664922959498526e-01, /* 0x3fe8b06fc1cf3dfe */
|
||||
7.73541011592573490852e-01, /* 0x3fe8c0d9145cf49d */
|
||||
7.75536550156311621507e-01, /* 0x3fe8d13206f8c4ca */
|
||||
7.77524310373347682379e-01, /* 0x3fe8e17aa99cc05d */
|
||||
7.79504322017186335181e-01, /* 0x3fe8f1b30c44f167 */
|
||||
7.81476614872688268854e-01, /* 0x3fe901db3eeef187 */
|
||||
7.83441218733151756304e-01, /* 0x3fe911f35199833b */
|
||||
7.85398163397448278999e-01}; /* 0x3fe921fb54442d18 */
|
||||
|
||||
/* Some constants. */
|
||||
|
||||
static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */
|
||||
piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */
|
||||
piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
|
||||
three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */
|
||||
|
||||
double u, v, vbyu, q, s, uu, r;
|
||||
unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf;
|
||||
int xexp, yexp, diffexp;
|
||||
|
||||
double x = fx;
|
||||
double y = fy;
|
||||
|
||||
/* Find properties of arguments x and y. */
|
||||
|
||||
unsigned long ux, aux, xneg, uy, auy, yneg;
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
GET_BITS_DP64(y, uy);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
auy = uy & ~SIGNBIT_DP64;
|
||||
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
xneg = ux & SIGNBIT_DP64;
|
||||
yneg = uy & SIGNBIT_DP64;
|
||||
xzero = (aux == 0);
|
||||
yzero = (auy == 0);
|
||||
xnan = (aux > PINFBITPATT_DP64);
|
||||
ynan = (auy > PINFBITPATT_DP64);
|
||||
xinf = (aux == PINFBITPATT_DP64);
|
||||
yinf = (auy == PINFBITPATT_DP64);
|
||||
|
||||
diffexp = yexp - xexp;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
if (xnan)
|
||||
{
|
||||
unsigned int ufx;
|
||||
GET_BITS_SP32(fx, ufx);
|
||||
return _handle_errorf("atan2f", OP_ATAN2, ufx|0x00400000, _DOMAIN, 0,
|
||||
EDOM, fx, fy, 2);
|
||||
}
|
||||
else if (ynan)
|
||||
{
|
||||
unsigned int ufy;
|
||||
GET_BITS_SP32(fy, ufy);
|
||||
return _handle_errorf("atan2f", OP_ATAN2, ufy|0x00400000, _DOMAIN, 0,
|
||||
EDOM, fx, fy, 2);
|
||||
}
|
||||
else if (yzero)
|
||||
{ /* Zero y gives +-0 for positive x
|
||||
and +-pi for negative x */
|
||||
if (xneg)
|
||||
{
|
||||
if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
|
||||
else return valf_with_flags((float)pi, AMD_F_INEXACT);
|
||||
}
|
||||
else return (float)y;
|
||||
}
|
||||
else if (xzero)
|
||||
{ /* Zero x gives +- pi/2
|
||||
depending on sign of y */
|
||||
if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
|
||||
else valf_with_flags((float)piby2, AMD_F_INEXACT);
|
||||
}
|
||||
|
||||
if (diffexp > 26)
|
||||
{ /* abs(y)/abs(x) > 2^26 => arctan(x/y)
|
||||
is insignificant compared to piby2 */
|
||||
if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
|
||||
else return valf_with_flags((float)piby2, AMD_F_INEXACT);
|
||||
}
|
||||
else if (diffexp < -13 && (!xneg))
|
||||
{ /* x positive and dominant over y by a factor of 2^13.
|
||||
In this case atan(y/x) is y/x to machine accuracy. */
|
||||
|
||||
if (diffexp < -150) /* Result underflows */
|
||||
{
|
||||
if (yneg)
|
||||
return valf_with_flags(-0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
|
||||
else
|
||||
return valf_with_flags(0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (diffexp < -126)
|
||||
{
|
||||
/* Result will likely be denormalized */
|
||||
y = scaleDouble_1(y, 100);
|
||||
y /= x;
|
||||
/* Now y is 2^100 times the true result. Scale it back down. */
|
||||
GET_BITS_DP64(y, uy);
|
||||
scaleDownDouble(uy, 100, &uy);
|
||||
PUT_BITS_DP64(uy, y);
|
||||
if ((uy & EXPBITS_DP64) == 0)
|
||||
return valf_with_flags((float)y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
|
||||
else
|
||||
return (float)y;
|
||||
}
|
||||
else
|
||||
return (float)(y / x);
|
||||
}
|
||||
}
|
||||
else if (diffexp < -26 && xneg)
|
||||
{ /* abs(x)/abs(y) > 2^56 and x < 0 => arctan(y/x)
|
||||
is insignificant compared to pi */
|
||||
if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
|
||||
else return valf_with_flags((float)pi, AMD_F_INEXACT);
|
||||
}
|
||||
else if (yinf && xinf)
|
||||
{ /* If abs(x) and abs(y) are both infinity
|
||||
return +-pi/4 or +- 3pi/4 according to
|
||||
signs. */
|
||||
if (xneg)
|
||||
{
|
||||
if (yneg) return valf_with_flags((float)-three_piby4, AMD_F_INEXACT);
|
||||
else return valf_with_flags((float)three_piby4, AMD_F_INEXACT);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (yneg) return valf_with_flags((float)-piby4, AMD_F_INEXACT);
|
||||
else return valf_with_flags((float)piby4, AMD_F_INEXACT);
|
||||
}
|
||||
}
|
||||
|
||||
/* General case: take absolute values of arguments */
|
||||
|
||||
u = x; v = y;
|
||||
if (xneg) u = -x;
|
||||
if (yneg) v = -y;
|
||||
|
||||
/* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
|
||||
|
||||
swap_vu = (u < v);
|
||||
if (swap_vu) { uu = u; u = v; v = uu; }
|
||||
vbyu = v/u;
|
||||
|
||||
if (vbyu > 0.0625)
|
||||
{ /* General values of v/u. Use a look-up
|
||||
table and series expansion. */
|
||||
|
||||
index = (int)(256*vbyu + 0.5);
|
||||
r = (256*v-index*u)/(256*u+index*v);
|
||||
|
||||
/* Polynomial approximation to atan(vbyu) */
|
||||
|
||||
s = r*r;
|
||||
q = atan_jby256[index-16] + r - r*s*0.33333333333224095522;
|
||||
}
|
||||
else if (vbyu < 1.e-4)
|
||||
{ /* v/u is small enough that atan(v/u) = v/u */
|
||||
q = vbyu;
|
||||
}
|
||||
else /* vbyu <= 0.0625 */
|
||||
{
|
||||
/* Small values of v/u. Use a series expansion */
|
||||
|
||||
s = vbyu*vbyu;
|
||||
q = vbyu -
|
||||
vbyu*s*(0.33333333333333170500 -
|
||||
s*(0.19999999999393223405 -
|
||||
s*0.14285713561807169030));
|
||||
}
|
||||
|
||||
/* Tidy-up according to which quadrant the arguments lie in */
|
||||
|
||||
if (swap_vu) {q = piby2 - q;}
|
||||
if (xneg) {q = pi - q;}
|
||||
if (yneg) q = - q;
|
||||
return (float)q;
|
||||
}
|
135
sdk/lib/crt/math/libm_sse2/atanf.c
Normal file
135
sdk/lib/crt/math/libm_sse2/atanf.c
Normal file
|
@ -0,0 +1,135 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(atanf)
|
||||
|
||||
float FN_PROTOTYPE(atanf)(float fx)
|
||||
{
|
||||
|
||||
/* Some constants and split constants. */
|
||||
|
||||
static double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
|
||||
|
||||
double c, v, s, q, z;
|
||||
unsigned int xnan;
|
||||
|
||||
double x = fx;
|
||||
|
||||
/* Find properties of argument fx. */
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
xneg = ux & SIGNBIT_DP64;
|
||||
|
||||
v = x;
|
||||
if (xneg) v = -x;
|
||||
|
||||
/* Argument reduction to range [-7/16,7/16] */
|
||||
|
||||
if (aux < 0x3fdc000000000000) /* v < 7./16. */
|
||||
{
|
||||
x = v;
|
||||
c = 0.0;
|
||||
}
|
||||
else if (aux < 0x3fe6000000000000) /* v < 11./16. */
|
||||
{
|
||||
x = (2.0*v-1.0)/(2.0+v);
|
||||
/* c = arctan(0.5) */
|
||||
c = 4.63647609000806093515e-01; /* 0x3fddac670561bb4f */
|
||||
}
|
||||
else if (aux < 0x3ff3000000000000) /* v < 19./16. */
|
||||
{
|
||||
x = (v-1.0)/(1.0+v);
|
||||
/* c = arctan(1.) */
|
||||
c = 7.85398163397448278999e-01; /* 0x3fe921fb54442d18 */
|
||||
}
|
||||
else if (aux < 0x4003800000000000) /* v < 39./16. */
|
||||
{
|
||||
x = (v-1.5)/(1.0+1.5*v);
|
||||
/* c = arctan(1.5) */
|
||||
c = 9.82793723247329054082e-01; /* 0x3fef730bd281f69b */
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
xnan = (aux > PINFBITPATT_DP64);
|
||||
|
||||
if (xnan)
|
||||
{
|
||||
/* x is NaN */
|
||||
unsigned int uhx;
|
||||
GET_BITS_SP32(fx, uhx);
|
||||
return _handle_errorf("atanf", OP_ATAN, uhx|0x00400000, _DOMAIN,
|
||||
0, EDOM, fx, 0.0F, 1);
|
||||
}
|
||||
else if (v > 0x4c80000000000000)
|
||||
{ /* abs(x) > 2^26 => arctan(1/x) is
|
||||
insignificant compared to piby2 */
|
||||
if (xneg)
|
||||
return valf_with_flags((float)-piby2, AMD_F_INEXACT);
|
||||
else
|
||||
return valf_with_flags((float)piby2, AMD_F_INEXACT);
|
||||
}
|
||||
|
||||
x = -1.0/v;
|
||||
/* c = arctan(infinity) */
|
||||
c = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
|
||||
}
|
||||
|
||||
/* Core approximation: Remez(2,2) on [-7/16,7/16] */
|
||||
|
||||
s = x*x;
|
||||
q = x*s*
|
||||
(0.296528598819239217902158651186e0 +
|
||||
(0.192324546402108583211697690500e0 +
|
||||
0.470677934286149214138357545549e-2*s)*s)/
|
||||
(0.889585796862432286486651434570e0 +
|
||||
(0.111072499995399550138837673349e1 +
|
||||
0.299309699959659728404442796915e0*s)*s);
|
||||
|
||||
z = c - (q - x);
|
||||
|
||||
if (xneg) z = -z;
|
||||
return (float)z;
|
||||
}
|
34
sdk/lib/crt/math/libm_sse2/cabs.c
Normal file
34
sdk/lib/crt/math/libm_sse2/cabs.c
Normal file
|
@ -0,0 +1,34 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
|
||||
/*
 * _cabs - magnitude of a complex number.
 *
 * z.x and z.y are passed to _hypot as the two components and the value
 * _hypot returns is handed back unchanged.
 */
double __cdecl _cabs(COMPLEX z)
{
    const double re = z.x;
    const double im = z.y;

    return _hypot(re, im);
}
|
35
sdk/lib/crt/math/libm_sse2/cabsf.c
Normal file
35
sdk/lib/crt/math/libm_sse2/cabsf.c
Normal file
|
@ -0,0 +1,35 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
|
||||
float _hypotf(float,float);
|
||||
float _cabsf(COMPLEX z)
|
||||
{
|
||||
/* Returns the absolute value of a complex number z
|
||||
with real part a and complex part b. */
|
||||
return _hypotf((float)z.x, (float)z.y);
|
||||
}
|
88
sdk/lib/crt/math/libm_sse2/ceil.c
Normal file
88
sdk/lib/crt/math/libm_sse2/ceil.c
Normal file
|
@ -0,0 +1,88 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#include "libm_errno.h"
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(ceil)
|
||||
|
||||
double FN_PROTOTYPE(ceil)(double x)
|
||||
{
|
||||
double r;
|
||||
long rexp, xneg;
|
||||
unsigned long ux, ax, ur, mask;
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
ax = ux & (~SIGNBIT_DP64);
|
||||
xneg = (ux != ax);
|
||||
|
||||
if (ax >= 0x4340000000000000)
|
||||
{
|
||||
/* abs(x) is either NaN, infinity, or >= 2^53 */
|
||||
if (ax > 0x7ff0000000000000)
|
||||
/* x is NaN */
|
||||
return _handle_error("ceil", OP_CEIL, ux|0x0008000000000000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0, 1);
|
||||
else
|
||||
return x;
|
||||
}
|
||||
else if (ax < 0x3ff0000000000000) /* abs(x) < 1.0 */
|
||||
{
|
||||
if (ax == 0x0000000000000000)
|
||||
/* x is +zero or -zero; return the same zero */
|
||||
return x;
|
||||
else if (xneg) /* x < 0.0 */
|
||||
{
|
||||
PUT_BITS_DP64(SIGNBIT_DP64, r); /* return -0.0 */
|
||||
return r;
|
||||
}
|
||||
else
|
||||
return 1.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
rexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
|
||||
/* Mask out the bits of r that we don't want */
|
||||
mask = 1;
|
||||
mask = (mask << (EXPSHIFTBITS_DP64 - rexp)) - 1;
|
||||
ur = (ux & ~mask);
|
||||
PUT_BITS_DP64(ur, r);
|
||||
if (xneg || (ur == ux))
|
||||
return r;
|
||||
else
|
||||
/* We threw some bits away and x was positive */
|
||||
return r + 1.0;
|
||||
}
|
||||
|
||||
}
|
86
sdk/lib/crt/math/libm_sse2/ceilf.c
Normal file
86
sdk/lib/crt/math/libm_sse2/ceilf.c
Normal file
|
@ -0,0 +1,86 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#include "libm_errno.h"
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(ceilf)
|
||||
|
||||
float FN_PROTOTYPE(ceilf)(float x)
|
||||
{
|
||||
float r;
|
||||
int rexp, xneg;
|
||||
unsigned int ux, ax, ur, mask;
|
||||
|
||||
GET_BITS_SP32(x, ux);
|
||||
ax = ux & (~SIGNBIT_SP32);
|
||||
xneg = (ux != ax);
|
||||
|
||||
if (ax >= 0x4b800000)
|
||||
{
|
||||
/* abs(x) is either NaN, infinity, or >= 2^24 */
|
||||
if (ax > 0x7f800000)
|
||||
/* x is NaN */
|
||||
return _handle_errorf("ceilf", OP_CEIL, ux, _DOMAIN, 0, EDOM, x,
|
||||
0.0F, 1);
|
||||
else
|
||||
return x;
|
||||
}
|
||||
else if (ax < 0x3f800000) /* abs(x) < 1.0 */
|
||||
{
|
||||
if (ax == 0x00000000)
|
||||
/* x is +zero or -zero; return the same zero */
|
||||
return x;
|
||||
else if (xneg) /* x < 0.0 */
|
||||
{
|
||||
PUT_BITS_SP32(SIGNBIT_SP32, r); /* return -0.0 */
|
||||
return r;
|
||||
}
|
||||
else
|
||||
return 1.0F;
|
||||
}
|
||||
else
|
||||
{
|
||||
rexp = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
|
||||
/* Mask out the bits of r that we don't want */
|
||||
mask = (1 << (EXPSHIFTBITS_SP32 - rexp)) - 1;
|
||||
ur = (ux & ~mask);
|
||||
PUT_BITS_SP32(ur, r);
|
||||
|
||||
if (xneg || (ux == ur)) return r;
|
||||
else
|
||||
/* We threw some bits away and x was positive */
|
||||
return r + 1.0F;
|
||||
}
|
||||
}
|
533
sdk/lib/crt/math/libm_sse2/cos.asm
Normal file
533
sdk/lib/crt/math/libm_sse2/cos.asm
Normal file
|
@ -0,0 +1,533 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;
|
||||
; An implementation of the cos function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; double cos(double x);
|
||||
;
|
||||
; Computes cos(x).
|
||||
; It will provide proper C99 return values,
|
||||
; but may not raise floating point status bits properly.
|
||||
; Based on the NAG C implementation.
|
||||
;
|
||||
; If FMA3 hardware is available, an FMA3 implementation of cos will be used.
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
|
||||
DQ 0
|
||||
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
|
||||
DQ 0
|
||||
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
|
||||
DQ 0
|
||||
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
|
||||
DQ 0
|
||||
|
||||
ALIGN 16
|
||||
L_one DQ 03FF0000000000000h, 03FF0000000000000h
|
||||
L_signbit DQ 08000000000000000h, 00000000000000000h
|
||||
L_int_one DQ 00000000000000001h, 00000000000000000h ; integer 1; as a double this bit pattern is the smallest subnormal (added to 1.0 in Lcos_sse2_x_smaller to set inexact)
|
||||
L_int_two DQ 00000000000000002h, 00000000000000000h
|
||||
|
||||
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
|
||||
L_one_half DQ 03FE0000000000000h ; .5
|
||||
L_neg_one_half DQ 0bfe0000000000000h ; - 0.5
|
||||
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
|
||||
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
|
||||
L_piby4 DQ 03FE921FB54442D18h ; pi/4
|
||||
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
|
||||
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
|
||||
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
|
||||
|
||||
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
|
||||
|
||||
|
||||
|
||||
EXTRN __Lcosarray:QWORD
|
||||
EXTRN __Lsinarray:QWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
; local storage offsets
|
||||
p_temp EQU 020h ; temporary for get/put bits operation
|
||||
p_temp1 EQU 030h ; temporary for get/put bits operation
|
||||
dummy_space EQU 040h
|
||||
stack_size EQU 068h
|
||||
|
||||
include fm.inc
|
||||
|
||||
fname TEXTEQU <cos>
|
||||
fname_special TEXTEQU <_cos_special>
|
||||
|
||||
;Define name and any external functions being called
|
||||
EXTERN __remainder_piby2_forAsm : PROC
|
||||
EXTERN __remainder_piby2_fma3 : PROC
|
||||
EXTERN __remainder_piby2_fma3_bdl : PROC
|
||||
EXTERN fname_special : PROC
|
||||
|
||||
.code
|
||||
|
||||
PUBLIC fname
|
||||
;-----------------------------------------------------------------------
; double cos(double x)          (fname is TEXTEQU'd to cos in this file)
;
; In:    xmm0 = x
; Out:   xmm0 = cos(x)
; Uses:  rax, rcx, rdx, r8, r9, r10, xmm0-xmm5, flags, plus whatever the
;        called helpers modify.
; Stack: StackAllocate / StackDeallocate (macros from fm.inc) reserve and
;        release stack_size bytes; p_temp and p_temp1 are scratch slots
;        addressed relative to rsp inside that area.
;
; __use_fma3_lib selects the implementation: zero runs the SSE2 code at
; Lcos_sse2, non-zero runs the FMA3/AVX code at L_cos_fma3.  Both follow
; the same plan:
;   * |x| < pi/4 : evaluate the cosine polynomial directly, with cheaper
;     shortcuts below 2^-13 and 2^-27.
;   * otherwise  : reduce |x| to r + rr in [-pi/4, pi/4] plus a quadrant
;     number ("region").  An even region evaluates the cosine polynomial,
;     an odd region the sine polynomial; the result is negated when
;     ((region + 1) & 2) is non-zero.
;   * Inf / NaN  : handed to fname_special (_cos_special).
;-----------------------------------------------------------------------
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
.ENDPROLOG
|
||||
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne L_cos_fma3
|
||||
|
||||
Lcos_sse2:
|
||||
movd rdx, xmm0 ; rdx <-- bits of x (kept untouched for the rest of the SSE2 path)
|
||||
xorpd xmm2, xmm2 ; zeroed out for later use
|
||||
|
||||
mov r10, rdx
|
||||
btr r10, 63 ; r10 <-- |x|
|
||||
cmp r10, L_piby4
|
||||
jb Lcos_sse2_absx_lt_piby4
|
||||
|
||||
Lcos_absx_nlt_piby4: ; common case
|
||||
|
||||
; Here rdx has x, r10 has |x|
|
||||
movd xmm0, r10 ; xmm0 <-- |x|
|
||||
|
||||
cmp r10, QWORD PTR L_small_arg_cw
|
||||
jae Lcos_reduce_precise ; Note NaN/Inf will branch
|
||||
|
||||
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
|
||||
; Note that if |x| were too large, conversion of npi2 to integer would fail.
|
||||
; We reduce the argument to be in a range from -pi/4 to +pi/4
|
||||
; by subtracting multiples of pi/2
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, L_2_by_pi
|
||||
movapd xmm4, xmm0
|
||||
|
||||
; xexp = ax >> EXPSHIFTBITS_DP64;
|
||||
mov r9, r10
|
||||
shr r9, 52 ; >>EXPSHIFTBITS_DP64
|
||||
|
||||
; How many pi/2 is |x| a multiple of?
|
||||
; npi2 = (int)(x * twobypi + 0.5);
|
||||
addsd xmm2, L_one_half ; npi2
|
||||
|
||||
movsd xmm3, L_real_piby2_1
|
||||
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer
|
||||
movsd xmm1, L_real_piby2_1tail
|
||||
cvtdq2pd xmm2, xmm0 ; and back to double.
|
||||
|
||||
; Subtract the multiple from x to get an extra-precision remainder
|
||||
; rhead = x - npi2 * piby2_1;
|
||||
mulsd xmm3, xmm2
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_1tail;
|
||||
mulsd xmm1, xmm2 ; rtail
|
||||
movd eax, xmm0 ; eax <-- npi2
|
||||
|
||||
; GET_BITS_DP64(rhead-rtail, uy);
|
||||
; originally only rhead
|
||||
movapd xmm0, xmm4
|
||||
subsd xmm0, xmm1
|
||||
|
||||
movsd xmm3, L_real_piby2_2
|
||||
movd rcx, xmm0 ; rcx <-- rhead - rtail
|
||||
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
|
||||
|
||||
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
|
||||
; xmm4=rhead xmm5= temp for calc
|
||||
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
; expdiff measures how close rhead - rtail is to |x|
|
||||
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
|
||||
shl rcx, 1 ; strip any sign bit
|
||||
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
|
||||
sub r9, rcx ; expdiff
|
||||
|
||||
;; if (expdiff > 15)
|
||||
cmp r9, 15
|
||||
jle Lcos_sse2_cw_reduction_done
|
||||
|
||||
; Here the remainder is pretty small compared with x, which
|
||||
; implies that x is a near multiple of pi/2
|
||||
; (x matches the multiple to at least 15 bits)
|
||||
; So we do another stage of argument reduction.
|
||||
|
||||
; t = rhead;
|
||||
movapd xmm1, xmm4
|
||||
|
||||
; rtail = npi2 * piby2_2;
|
||||
mulsd xmm3, xmm2
|
||||
|
||||
; rhead = t - rtail;
|
||||
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
subsd xmm1, xmm4 ; t - rhead
|
||||
subsd xmm1, xmm3 ; -rtail
|
||||
subsd xmm5, xmm1 ; rtail
|
||||
|
||||
; r = rhead - rtail;
|
||||
movapd xmm0, xmm4
|
||||
|
||||
;HARSHA
|
||||
;xmm1=rtail
|
||||
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
|
||||
subsd xmm0, xmm5
|
||||
|
||||
; xmm0=r, xmm4=rhead, xmm1=rtail
|
||||
Lcos_sse2_cw_reduction_done:
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; if the input was close to a pi/2 multiple
|
||||
; The original NAG code missed this trick.
|
||||
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
|
||||
; then the cos is either ~ 1.0 or ~r, to within 53 bits.
|
||||
|
||||
; NOTE: Unfortunately, this introduces two jcc instructions close to each
|
||||
; other and to other branches. As r < 2^-13 should be rather uncommon,
|
||||
; the problems for branch prediction outweigh the computational savings. - WAT
|
||||
; (Consequently no such shortcut is taken here; the code falls straight
; through to the polynomial evaluation.)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region = npi2 & 3;
|
||||
subsd xmm4, xmm0 ; rhead-r
|
||||
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
|
||||
|
||||
Lcos_piby4:
|
||||
; Here xmm0 = r, xmm4 = rr (low-order part of the reduced argument),
; eax = region.
; perform taylor series to calc sinx or cosx
|
||||
; x2 = r * r;
|
||||
|
||||
;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the cos path
|
||||
;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
|
||||
movapd xmm3, xmm0
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ;x2
|
||||
|
||||
bt eax,0 ; CF <-- low bit of region
|
||||
jnc Lcos_sse2_calc_cos ; even region: cosine polynomial
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region 1 or 3 do a sin calculation
|
||||
movsd xmm3, __Lsinarray+50h ; s6
|
||||
mulsd xmm3, xmm2 ; x2s6
|
||||
movsd xmm5, __Lsinarray+20h ; s3
|
||||
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
|
||||
movapd xmm1, xmm2 ; move for x4
|
||||
mulsd xmm1, xmm2 ; x4
|
||||
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
|
||||
mulsd xmm5, xmm2 ; x2s3
|
||||
movapd xmm4, xmm0 ; move for x3
|
||||
addsd xmm3, __Lsinarray+40h ; s5+x2s6
|
||||
mulsd xmm1, xmm2 ; x6
|
||||
mulsd xmm3, xmm2 ; x2(s5+x2s6)
|
||||
mulsd xmm4, xmm2 ; x3
|
||||
addsd xmm5, __Lsinarray+10h ; s2+x2s3
|
||||
mulsd xmm5, xmm2 ; x2(s2+x2s3)
|
||||
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
|
||||
mulsd xmm2, L_one_half ; 0.5 *x2
|
||||
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
|
||||
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
|
||||
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
|
||||
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
|
||||
addsd xmm3, xmm5 ; zs
|
||||
mulsd xmm4, xmm3 ; *x3
|
||||
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
|
||||
addsd xmm0, xmm4 ; +xx
|
||||
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
|
||||
|
||||
jmp Lcos_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lcos_sse2_calc_cos:
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region 0 or 2 - do a cos calculation
|
||||
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
|
||||
mulsd xmm4, xmm0 ; x*xx
|
||||
movsd xmm5, L_one_half
|
||||
movsd xmm1, __Lcosarray+50h ; c6
|
||||
movsd xmm0, __Lcosarray+20h ; c3
|
||||
mulsd xmm5, xmm2 ; r = 0.5 *x2
|
||||
movapd xmm3, xmm2 ; copy of x2
|
||||
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
|
||||
mulsd xmm1, xmm2 ; c6*x2
|
||||
mulsd xmm0, xmm2 ; c3*x2
|
||||
subsd xmm5, L_one ; -t=r-1.0, trash r
|
||||
mulsd xmm3, xmm2 ; x4
|
||||
addsd xmm1, __Lcosarray+40h ; c5+x2c6
|
||||
addsd xmm0, __Lcosarray+10h ; c2+x2C3
|
||||
addsd xmm5, L_one ; 1 + (-t), trash t
|
||||
mulsd xmm3, xmm2 ; x6
|
||||
mulsd xmm1, xmm2 ; x2(c5+x2c6)
|
||||
mulsd xmm0, xmm2 ; x2(c2+x2C3)
|
||||
movapd xmm4, xmm2 ; copy of x2
|
||||
mulsd xmm4, L_one_half ; r recalculate
|
||||
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
|
||||
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
|
||||
mulsd xmm2, xmm2 ; x4 recalculate
|
||||
subsd xmm5, xmm4 ; (1 + (-t)) - r
|
||||
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
|
||||
addsd xmm0, xmm1 ; zc
|
||||
subsd xmm4, L_one ; t recalculate
|
||||
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
|
||||
mulsd xmm0, xmm2 ; x4 * zc
|
||||
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
|
||||
subsd xmm0, xmm4 ; result - (-t)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
Lcos_sse2_adjust_region:
|
||||
; switch (region)
|
||||
add eax, 1
|
||||
and eax, 2 ; non-zero exactly when region mod 4 is 1 or 2
|
||||
jz Lcos_sse2_cleanup
|
||||
|
||||
;; if the original region 1 or 2 then we negate the result.
|
||||
movapd xmm2, xmm0
|
||||
xorpd xmm0, xmm0
|
||||
subsd xmm0, xmm2 ; xmm0 <-- 0.0 - result
|
||||
|
||||
ALIGN 16
|
||||
Lcos_sse2_cleanup:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Lcos_sse2_absx_lt_piby4:
|
||||
; Here xmm0 still holds x and r10 holds |x|.
; cos = cos_piby4(x, 0.0);
|
||||
|
||||
; x2 = r * r;
|
||||
cmp r10, L_two_to_neg_13
|
||||
jb Lcos_sse2_x_small
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; x2
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region 0 - do a cos calculation
|
||||
; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
|
||||
movsd xmm1, __Lcosarray+10h ; c2
|
||||
movapd xmm4, xmm2 ; move for x4
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm3, __Lcosarray+30h ; c4
|
||||
mulsd xmm1, xmm2 ; c2x2
|
||||
movsd xmm5, __Lcosarray+50h ; c6
|
||||
mulsd xmm3, xmm2 ; c4x2
|
||||
movapd xmm0, xmm4 ; move for x8
|
||||
mulsd xmm5, xmm2 ; c6x2
|
||||
mulsd xmm0, xmm4 ; x8
|
||||
addsd xmm1, __Lcosarray ; c1 + c2x2
|
||||
mulsd xmm1, xmm4 ; c1x4 + c2x6
|
||||
addsd xmm3, __Lcosarray+20h ; c3 + c4x2
|
||||
mulsd xmm2, L_neg_one_half ; -0.5x2, destroy xmm2
|
||||
addsd xmm5, __Lcosarray+40h ; c5 + c6x2
|
||||
mulsd xmm3, xmm0 ; c3x8 + c4x10
|
||||
mulsd xmm4, xmm0 ; x12
|
||||
mulsd xmm4, xmm5 ; c5x12 + c6x14
|
||||
|
||||
movsd xmm0, L_one
|
||||
addsd xmm1, xmm3 ; c1x4 + c2x6 + c3x8 + c4x10
|
||||
movapd xmm3, xmm2 ; preserve -0.5x2
|
||||
addsd xmm2, xmm0 ; t = 1 - 0.5x2
|
||||
subsd xmm0, xmm2 ; 1-t
|
||||
addsd xmm0, xmm3 ; (1-t) - r
|
||||
addsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
|
||||
addsd xmm0, xmm1 ; (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
|
||||
addsd xmm0, xmm2 ; 1 - 0.5x2 + above
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lcos_sse2_x_small:
|
||||
; |x| < 2^-13: cos(x) ~ 1 - 0.5*x*x, or just 1 below 2^-27
movsd xmm2, xmm0
|
||||
movsd xmm0, L_one
|
||||
cmp r10, L_two_to_neg_27
|
||||
jb Lcos_sse2_x_smaller
|
||||
mulsd xmm2, xmm2
|
||||
mulsd xmm2, L_one_half
|
||||
subsd xmm0, xmm2 ; xmm0 <-- 1.0 - 0.5*x*x
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lcos_sse2_x_smaller:
|
||||
movsd xmm0, L_one
|
||||
addsd xmm0, L_int_one ; really adding smallest subnormal; set inexact
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lcos_reduce_precise:
|
||||
; Here xmm0 = |x|, r10 = |x| bits, rdx = original x bits.
; Reduce x into range [-pi/4, pi/4]
|
||||
cmp r10, L__inf_mask_64
|
||||
jae Lcos_sse2_x_naninf ; Inf/NaN: restore the original x before calling the handler
|
||||
call __remainder_piby2_forAsm
|
||||
|
||||
; At this point xmm0 has r, xmm1 has rr, rax has region
|
||||
|
||||
movapd xmm4, xmm1 ; xmm4 <-- rr
|
||||
jmp Lcos_piby4
|
||||
|
||||
; xmm0 = x, xmm4 = xx, eax= region
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Lcos_sse2_x_naninf:
|
||||
; The SSE2 path replaced xmm0 with |x| at Lcos_absx_nlt_piby4.  The FMA3
; path reaches Lcos_x_naninf with the caller's x still in xmm0, so reload
; x from rdx here to hand the special-case routine the same argument
; (sign bit included) on both paths.
movd xmm0, rdx ; xmm0 <-- original x
|
||||
Lcos_x_naninf:
|
||||
; xmm0 = x (Inf or NaN)
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; From this point we assume that FMA3 and AVX hardware are present.
|
||||
|
||||
ALIGN 16
|
||||
L_cos_fma3:
|
||||
vmovq r9,xmm0
|
||||
mov rax,r9 ; rax <-- bits of x (used for the Inf/NaN test below)
|
||||
and r9,L_sign_mask ; clear sign
|
||||
|
||||
Lcos_early_exit_s_1: ;; unused label
|
||||
cmp r9,L_piby4
|
||||
jg Lcos_early_exit_s ; Note that NaN will branch
|
||||
cmp r9,L_two_to_neg_13
|
||||
jge Lcompute_cos_pyby_4
|
||||
cmp r9,L_two_to_neg_27
|
||||
jge Lcompute_1_xx_5
|
||||
vmovq xmm0,L_one ; for tiniest args, cos is 1
|
||||
jmp Lreturn_no_restore
|
||||
|
||||
Lcompute_1_xx_5:
|
||||
vmulsd xmm1,xmm0,L_one_half ; xmm1l <-- .5*x
|
||||
vfnmadd213sd xmm0,xmm1,L_one ; xmm0l <-- 1.0 - (.5*x)*x
|
||||
jmp Lreturn_no_restore
|
||||
|
||||
Lcompute_cos_pyby_4:
|
||||
; make sure this is accurate enough
|
||||
; note that x^2 can't be all that close to 1 here
|
||||
; Horner evaluation in xx; each vfmadd213sd computes xmm0 <-- xmm0*xx + next coefficient
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- xx = x*x
|
||||
vmovapd xmm0,__Lcosarray+050h ; xmm0 <-- c5
|
||||
vfmadd213sd xmm0,xmm3,__Lcosarray+040h ; xmm0 <-- c5*xx + c4
|
||||
vfmadd213sd xmm0,xmm3,__Lcosarray+030h ; xmm0 <-- (c5*xx + c4)*xx + c3
|
||||
vfmadd213sd xmm0,xmm3,__Lcosarray+020h ; xmm0 <-- xmm0*xx + c2
|
||||
vfmadd213sd xmm0,xmm3,__Lcosarray+010h ; xmm0 <-- xmm0*xx + c1
|
||||
vfmadd213sd xmm0,xmm3,__Lcosarray ; xmm0 <-- xmm0*xx + c0
|
||||
vfmsub213sd xmm0,xmm3,L_one_half ; xmm0 <-- xmm0*xx - 0.5
|
||||
vfmadd213sd xmm0,xmm3,L_one ; xmm0 <-- xmm0*xx + 1.0
|
||||
|
||||
jmp Lreturn_no_restore
|
||||
|
||||
Lcos_early_exit_s:
|
||||
; |x| > pi/4: separate Inf/NaN (all exponent bits set) from finite x
mov r8,L__inf_mask_64
|
||||
and rax,r8
|
||||
cmp rax, r8
|
||||
jz Lcos_x_naninf
|
||||
|
||||
Lrange_reduce:
|
||||
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||||
cmp r9,L_small_arg_bdl
|
||||
jae Lcos_remainder_piby2
|
||||
|
||||
; For __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
|
||||
; on input
|
||||
; x is in xmm0
|
||||
; on output
|
||||
; r is in xmm0
|
||||
; rr is in xmm1
|
||||
; region is in rax
|
||||
|
||||
; Boldo-Daumas-Li reduction for reasonably small |x|
|
||||
call __remainder_piby2_fma3_bdl
|
||||
|
||||
;; if region is 0 or 2 do a cos calc.
|
||||
;; if region is 1 or 3 do a sin calc.
|
||||
Lcos_exit_s:
|
||||
bt rax,0
|
||||
jc Lsin_piby4_compute
|
||||
|
||||
Lcos_piby4_compute: ;; unused label
|
||||
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
|
||||
vmovapd xmm2,L_one
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x * x
|
||||
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
|
||||
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
|
||||
vsubsd xmm2,xmm2,xmm4 ; 1-t
|
||||
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
|
||||
vmovapd xmm5,__Lcosarray+040h
|
||||
vfnmadd231sd xmm2,xmm0,xmm1 ; xmm2 <-- ((1.0 - t) - r) - x * xx
|
||||
vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1
|
||||
vfmadd231sd xmm5,xmm3,__Lcosarray+050h ; xmm5 <-- [+40h] + x2*[+50h]
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray+030h ; Horner steps: xmm5 <-- xmm5*x2 + next coefficient
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray+020h
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray+010h
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray
|
||||
vfmadd213sd xmm5,xmm1,xmm2 ; xmm5 <-- x4*poly + (((1-t) - r) - x*xx)
|
||||
vaddsd xmm0,xmm5,xmm4 ; xmm0 <-- that + t
|
||||
|
||||
jmp Lcos_exit_s_1
|
||||
|
||||
ALIGN 16
|
||||
Lsin_piby4_compute:
|
||||
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
|
||||
; On entry xmm0 = x (= r) and xmm1 = xx (= rr).
vmovapd xmm5,__Lsinarray+040h
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
|
||||
vfmadd231sd xmm5,xmm3,__Lsinarray+050h ; xmm5 <-- [+40h] + x2*[+50h]
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+030h ; Horner steps: xmm5 <-- xmm5*x2 + next coefficient
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
|
||||
|
||||
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
|
||||
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r
|
||||
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*xx
|
||||
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*xx - x*x*x*r
|
||||
vmulsd xmm2,xmm3,xmm2 ; xmm2 <-- x2*(.5*xx - x3*r)
|
||||
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- x2*(.5*xx - x3*r) - xx
|
||||
vfnmadd231sd xmm2, xmm4,__Lsinarray ; xmm2 <-- xmm2 - x3*[__Lsinarray]
|
||||
vsubsd xmm0,xmm0,xmm2 ; xmm0 <-- x - xmm2
|
||||
|
||||
Lcos_exit_s_1:
|
||||
; Flip the sign of the result when ((region + 1) & 2) != 0.
xor r8,r8
|
||||
add eax, 1
|
||||
and eax, 2
|
||||
cmovnz r8, L_signbit ; r8 <-- sign-bit mask, or stays 0
|
||||
vmovq xmm3,r8
|
||||
vxorpd xmm0,xmm0,xmm3
|
||||
|
||||
Lreturn_restore_regs:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Lreturn_no_restore:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lcos_remainder_piby2:
|
||||
; argument reduction for general x
|
||||
call __remainder_piby2_fma3
|
||||
jmp Lcos_exit_s
|
||||
|
||||
|
||||
fname endp
|
||||
END
|
525
sdk/lib/crt/math/libm_sse2/cosf.asm
Normal file
525
sdk/lib/crt/math/libm_sse2/cosf.asm
Normal file
|
@ -0,0 +1,525 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the cosf function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; float cosf(float x);
|
||||
;
|
||||
; Computes cosf(x).
|
||||
; Based on the NAG C implementation.
|
||||
; It will provide proper C99 return values,
|
||||
; but may not raise floating point status bits properly.
|
||||
; Original Author: Harsha Jagasia
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
L_real_one DQ 03ff0000000000000h ; 1.0
|
||||
DQ 0 ; for alignment
|
||||
L_one_half DQ 03fe0000000000000h ; 0.5
|
||||
DQ 0
|
||||
L_2bypi DQ 03fe45f306dc9c883h ; 2./pi
|
||||
DQ 0
|
||||
L_one_sixth DQ 03fc5555555555555h ; 0.166666666666
|
||||
DQ 0
|
||||
L_piby2 DQ 03fe921fb54442d18h ; NOTE: despite the name this bit pattern is pi/4 (the same value the SSE2 path uses for its abs(x) <= pi/4 test)
|
||||
DQ 0
|
||||
L_piby2_1 DQ 03ff921fb54400000h ; piby2_1
|
||||
DQ 0
|
||||
L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
|
||||
DQ 0
|
||||
L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
|
||||
DQ 0
|
||||
L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
|
||||
DQ 0
|
||||
L_large_x_sse2 DQ 0411E848000000000h ; 5e5
|
||||
DQ 0
|
||||
L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9
|
||||
DQ 0
|
||||
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
|
||||
DQ 07FFFFFFFFFFFFFFFh
|
||||
L__int_three DQ 00000000000000003h
|
||||
DQ 00000000000000003h
|
||||
L__min_norm_double DQ 00010000000000000h
|
||||
DQ 00010000000000000h
|
||||
L_two_to_neg_7 DQ 03f80000000000000h
|
||||
DQ 0
|
||||
L_two_to_neg_13 DQ 03f20000000000000h
|
||||
DQ 0
|
||||
L_inf_mask_32 DD 07F800000h
|
||||
DQ 0
|
||||
|
||||
fname TEXTEQU <cosf>
|
||||
fname_special TEXTEQU <_cosf_special>
|
||||
|
||||
;Define name and any external functions being called
|
||||
EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR
|
||||
EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR
|
||||
EXTERN __remainder_piby2_fma3 : PROC ; NEAR
|
||||
EXTERN fname_special : PROC
|
||||
EXTERN _set_statfp : PROC
|
||||
|
||||
|
||||
EXTRN __Lcosfarray:QWORD
|
||||
EXTRN __Lsinfarray:QWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
; define local variable storage offsets
|
||||
p_temp equ 020h ; temporary for get/put bits operation
|
||||
p_temp1 equ 030h ; temporary for get/put bits operation
|
||||
dummy_space EQU 040h
|
||||
stack_size EQU 068h
|
||||
|
||||
include fm.inc
|
||||
|
||||
.code
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC fname
|
||||
;-----------------------------------------------------------------------
; float cosf(float x)          (fname is TEXTEQU'd to cosf in this file)
;
; In:    xmm0 = x (single precision)
; Out:   xmm0 = cosf(x) (single precision)
; Uses:  rax, rcx, rdx, r8, r9, r10, xmm0-xmm5, flags, plus whatever the
;        called helpers modify.
; Stack: StackAllocate / StackDeallocate (macros from fm.inc) reserve and
;        release stack_size bytes; p_temp and p_temp1 are scratch slots
;        addressed relative to rsp inside that area.
;
; __use_fma3_lib selects the implementation: zero runs the SSE2 code at
; Lcosf_sse2, non-zero runs the FMA3/AVX code at Lcosf_fma3.  Both test
; for Inf/NaN first (handed to fname_special, i.e. _cosf_special), then
; widen x to double, compute in double precision, and narrow the result
; back to single precision on exit.
;-----------------------------------------------------------------------
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
.ENDPROLOG
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Lcosf_fma3
|
||||
|
||||
Lcosf_sse2:
|
||||
|
||||
xorpd xmm2, xmm2 ; zeroed out for later use
|
||||
|
||||
;; if NaN or inf
|
||||
movd edx, xmm0 ; edx <-- bits of the float x
|
||||
mov eax, 07f800000h ; single-precision exponent mask
|
||||
mov r10d, eax
|
||||
and r10d, edx
|
||||
cmp r10d, eax ; all exponent bits set ==> Inf or NaN
|
||||
jz Lcosf_sse2_naninf
|
||||
|
||||
cvtss2sd xmm0, xmm0 ; widen x to double
|
||||
movd rdx, xmm0 ; rdx <-- bits of (double)x
|
||||
|
||||
; ax = (ux & ~SIGNBIT_DP64);
|
||||
mov r10, rdx
|
||||
btr r10, 63 ; r10 <-- |x|
|
||||
mov r8d, 1 ; for determining region later on
|
||||
|
||||
movapd xmm1, xmm0 ; xmm1 <-- copy of x
|
||||
|
||||
|
||||
;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */
|
||||
mov rax, 03fe921fb54442d18h
|
||||
cmp r10, rax
|
||||
jg Lcosf_sse2_absx_gt_piby4
|
||||
|
||||
; *c = cos_piby4(x, 0.0);
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm2 ;x^2
|
||||
xor eax, eax ; region <-- 0
|
||||
mov rdx, r10
|
||||
movsd xmm5, QWORD PTR L_one_half
|
||||
jmp Lcosf_sse2_calc_sincosf_piby4 ; done
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_absx_gt_piby4:
|
||||
; reduce the argument to be in a range from -pi/4 to +pi/4
|
||||
; by subtracting multiples of pi/2
|
||||
; xneg = (ax != ux);
|
||||
movd xmm0, r10 ; xmm0 <-- |x|
|
||||
cmp r10, QWORD PTR L_large_x_sse2
|
||||
jae Lcosf_sse2_reduce_precise
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; xmm0=abs(x), xmm1=x
|
||||
;/* How many pi/2 is x a multiple of? */
|
||||
|
||||
movapd xmm2, xmm0
|
||||
movsd xmm3, QWORD PTR L_2bypi
|
||||
movapd xmm4, xmm0
|
||||
movsd xmm5, QWORD PTR L_one_half
|
||||
mulsd xmm2, xmm3
|
||||
|
||||
; movsd xmm5, QWORD PTR L_one_half
|
||||
; movapd xmm2, xmm0
|
||||
; mulsd xmm2, QWORD PTR L_2bypi
|
||||
; movapd xmm4, xmm0
|
||||
|
||||
mov r9, r10
|
||||
shr r9, 52 ; r9 <-- biased exponent of x
|
||||
|
||||
; npi2 = (int)(x * twobypi + 0.5);
|
||||
addsd xmm2, xmm5 ; npi2
|
||||
|
||||
movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1
|
||||
cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2
|
||||
movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail
|
||||
cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2
|
||||
|
||||
; Subtract the multiple from x to get an extra-precision remainder
|
||||
; rhead = x - npi2 * piby2_1;
|
||||
|
||||
mulsd xmm3, xmm2 ; use piby2_1
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_1tail;
|
||||
mulsd xmm1, xmm2 ; rtail
|
||||
movd eax, xmm0 ; eax <-- npi2
|
||||
|
||||
; GET_BITS_DP64(rhead-rtail, uy);
|
||||
; originally only rhead
|
||||
movapd xmm0, xmm4
|
||||
subsd xmm0, xmm1
|
||||
|
||||
movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2
|
||||
movd rcx, xmm0 ; rcx <-- rhead-rtail
|
||||
movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail
|
||||
|
||||
; region = npi2 & 3;
|
||||
; and eax, 3
|
||||
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
shl rcx, 1 ; strip any sign bit
|
||||
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
|
||||
sub r9, rcx ; expdiff
|
||||
|
||||
;; if (expdiff > 15)
|
||||
cmp r9, 15
|
||||
jle Lcosf_sse2_expdiff_le_15
|
||||
|
||||
; The remainder is pretty small compared with x, which
|
||||
; implies that x is a near multiple of pi/2
|
||||
; (x matches the multiple to at least 15 bits)
|
||||
; t = rhead;
|
||||
movapd xmm1, xmm4
|
||||
|
||||
; rtail = npi2 * piby2_2;
|
||||
mulsd xmm3, xmm2
|
||||
|
||||
; rhead = t - rtail;
|
||||
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
subsd xmm1, xmm4 ; t - rhead
|
||||
subsd xmm1, xmm3 ; -rtail
|
||||
subsd xmm5, xmm1 ; rtail
|
||||
|
||||
; r = rhead - rtail;
|
||||
movapd xmm0, xmm4
|
||||
|
||||
;HARSHA
|
||||
;xmm1=rtail
|
||||
movapd xmm1, xmm5
|
||||
subsd xmm0, xmm5
|
||||
|
||||
; xmm0=r, xmm4=rhead, xmm1=rtail
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
Lcosf_sse2_expdiff_le_15:
|
||||
; rcx still holds the biased exponent taken from the first-stage
; rhead-rtail above; the second-stage reduction does not update it.
cmp rcx, 03f2h ; is r < 2^-13 ?
|
||||
jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not
|
||||
cmp rcx, 03deh ; is r < 2^-33 ?
|
||||
jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r
|
||||
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; xmm2 <-- x^2
|
||||
|
||||
;; if region is 1 or 3 do a sinf calc.
|
||||
and r8d, eax ; r8d was 1, so this leaves the low bit of region
|
||||
jz Lcosf_sse2_r_small_calc_sin ; even region (see the note at that label)
|
||||
|
||||
Lcosf_sse2_r_small_calc_cos:
|
||||
; NOTE: despite the label name, this is the odd-region branch and it
; evaluates the short SINE approximation.
; region 1 or 3
|
||||
; use simply polynomial
|
||||
; *s = x - x*x*x*0.166666666666666666;
|
||||
movsd xmm3, QWORD PTR L_one_sixth
|
||||
mulsd xmm3, xmm0 ; * x
|
||||
mulsd xmm3, xmm2 ; * x^2
|
||||
subsd xmm0, xmm3 ; xs
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_r_small_calc_sin:
|
||||
; NOTE: despite the label name, this is the even-region branch and it
; evaluates the short COSINE approximation.
; region 0 or 2
|
||||
; cos = 1.0 - x*x*0.5;
|
||||
movsd xmm0, QWORD PTR L_real_one ; 1.0
|
||||
mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2
|
||||
subsd xmm0, xmm2
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_r_very_small:
|
||||
; then sin(r) = r
|
||||
; if region is 1 or 3 do a sin calc.
|
||||
and r8d, eax
|
||||
jnz Lcosf_sse2_adjust_region ; odd region: xmm0 already holds r
|
||||
|
||||
movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1
|
||||
; By this point, calculations should already have set inexact
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
ALIGN 16
|
||||
Lcosf_sse2_reduce_precise:
|
||||
; Reduce abs(x) into range [-pi/4, pi/4]
|
||||
; remainder_piby2d2f(ax, &r, &region);
|
||||
mov QWORD PTR p_temp[rsp], rdx ; save ux for use later
|
||||
mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later
|
||||
|
||||
call __remainder_piby2d2f_forAsm
|
||||
mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later
|
||||
mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later
|
||||
mov r8d, 1 ; for determining region later
|
||||
|
||||
; Reduced argument is in xmm0. No second word; after all, we started in
|
||||
; single precision. Region is in rax.
|
||||
movapd xmm1, xmm0
|
||||
movsd xmm5, QWORD PTR L_one_half
|
||||
|
||||
jmp Lcosf_sse2_calc_sincosf_piby4
|
||||
|
||||
|
||||
; done with reducing the argument. Now perform the sin/cos calculations.
|
||||
ALIGN 16
|
||||
Lcosf_sse2_calc_sincosf_piby4:
|
||||
; Here xmm0 = reduced argument, eax = region, r8d = 1.
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; x^2
|
||||
|
||||
;; if region is 0 or 2, do a cosf calc
|
||||
and r8d, eax
|
||||
jz Lcosf_sse2_do_cosf_calc
|
||||
; region is 1 or 3: do a sinf calc.
|
||||
Lcosf_sse2_do_sinf_calc:
|
||||
movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4
|
||||
mulsd xmm1, xmm2 ; s4x2
|
||||
movsd xmm4, xmm2 ; move for x4
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2
|
||||
mulsd xmm5, xmm2 ; s2x2
|
||||
movsd xmm3, xmm0 ; move for x3
|
||||
mulsd xmm3, xmm2 ; x3
|
||||
addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
|
||||
mulsd xmm1, xmm4 ; s3x4+s4x6
|
||||
addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
|
||||
addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6
|
||||
mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)
|
||||
addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)
|
||||
jmp Lcosf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_do_cosf_calc:
|
||||
; region 0 or 2 - do a cos calculation
|
||||
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
|
||||
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
|
||||
movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4
|
||||
movsd xmm4, xmm2 ; move for x4
|
||||
mulsd xmm1, xmm2 ; c4x2
|
||||
movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm0, QWORD PTR __Lcosfarray ; c0
|
||||
mulsd xmm3, xmm2 ; c2x2
|
||||
mulsd xmm0, xmm2 ; c0x2 (=-0.5x2)
|
||||
addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2
|
||||
mulsd xmm1, xmm4 ; c3x4 + c4x6
|
||||
addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2
|
||||
addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6
|
||||
mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10
|
||||
addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2
|
||||
addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
|
||||
|
||||
Lcosf_sse2_adjust_region:
|
||||
; xmm0 holds the cos or sin value produced by the section that jumped
; or fell through to here
|
||||
; switch (region)
|
||||
add eax, 1
|
||||
and eax, 2 ; non-zero exactly when region mod 4 is 1 or 2
|
||||
jz Lcosf_sse2_cleanup
|
||||
;; if region 1 or 2 then we negate the result.
|
||||
xorpd xmm2, xmm2
|
||||
subsd xmm2, xmm0
|
||||
movapd xmm0, xmm2 ; xmm0 <-- 0.0 - result
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_sse2_cleanup:
|
||||
cvtsd2ss xmm0, xmm0 ; narrow the double result back to float
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
Lcosf_sse2_naninf:
|
||||
; xmm0 still holds the caller's float x here
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_fma3:
|
||||
vmovd eax,xmm0 ; eax <-- bits of the float x
|
||||
mov r8d,L_inf_mask_32
|
||||
and eax,r8d
|
||||
cmp eax, r8d ; all exponent bits set ==> Inf or NaN
|
||||
jz Lcosf_fma3_naninf
|
||||
|
||||
vcvtss2sd xmm5,xmm0,xmm0 ; xmm5 <-- (double)x
|
||||
vmovq r9,xmm5
|
||||
btr r9,63 ;clear sign
|
||||
|
||||
cmp r9,L_piby2 ; NOTE: L_piby2 holds the bit pattern of pi/4
|
||||
jg Lcosf_fma3_range_reduce
|
||||
cmp r9,L_two_to_neg_7
|
||||
jge Lcosf_fma3_compute_cosf_piby_4
|
||||
cmp r9,L_two_to_neg_13
|
||||
jge Lcosf_fma3_compute_1_xx_5
|
||||
|
||||
vmovq xmm0,QWORD PTR L_real_one
|
||||
; Here we need to set inexact
|
||||
vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact
|
||||
jmp Lcosf_fma3_return
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_fma3_compute_1_xx_5:
|
||||
vmulsd xmm0,xmm5,QWORD PTR L_one_half ; xmm0 <-- 0.5*x
|
||||
vfnmadd213sd xmm0,xmm5,L_real_one ; xmm0 <-- 1.0 - (0.5*x)*x
|
||||
jmp Lcosf_fma3_return
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_fma3_compute_cosf_piby_4:
|
||||
movsd xmm0,xmm5 ; xmm0 <-- (double)x
|
||||
vmovapd xmm2,L_real_one
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
|
||||
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
|
||||
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- 1.0 - r
|
||||
vmovsd xmm1,__Lcosfarray+018h
|
||||
vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; xmm1 <-- [+18h] + x^2*[+20h]
|
||||
vfmadd213sd xmm1,xmm3,__Lcosfarray+010h ; Horner steps: xmm1 <-- xmm1*x^2 + next coefficient
|
||||
vfmadd213sd xmm1,xmm3,__Lcosfarray+008h
|
||||
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
|
||||
vmovdqa xmm0,xmm2
|
||||
vfmadd231sd xmm0,xmm1,xmm3 ; xmm0 <-- (1.0 - r) + poly*x^4
|
||||
jmp Lcosf_fma3_return
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_fma3_range_reduce:
|
||||
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||||
cmp r9,L_large_x_fma3
|
||||
jge Lcosf_reduce_precise
|
||||
|
||||
;cosff_range_e_5_s:
|
||||
vandpd xmm1,xmm0,L_sign_mask
|
||||
vmovapd xmm2,L_2bypi
|
||||
vfmadd213sd xmm2,xmm1,L_one_half ; xmm2 <-- |x|*(2/pi) + 0.5
|
||||
vcvttpd2dq xmm2,xmm2 ; truncate to integer: npi2
|
||||
vpmovsxdq xmm1,xmm2
|
||||
vandpd xmm4,xmm1,L__int_three ; region xmm4
|
||||
vshufps xmm1 ,xmm1,xmm1,8
|
||||
vcvtdq2pd xmm1,xmm1 ; xmm1 <-- (double)npi2
|
||||
vmovdqa xmm2,xmm0
|
||||
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
|
||||
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
|
||||
vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0
|
||||
vsubsd xmm2,xmm2,xmm0 ; rhead - r
|
||||
vsubsd xmm1,xmm2,xmm3 ; xmm1 <-- (rhead - r) - rtail
|
||||
vmovq rax,xmm4 ; rax <-- region
|
||||
jmp Lcosf_exit_s
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_reduce_precise:
|
||||
|
||||
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||||
; NOTE(review): the only jump to this label is the jge above, taken on the
; same comparison, so the jge below is always taken and the
; __remainder_piby2_fma3_bdl call further down looks unreachable.
cmp r9,L_large_x_fma3
|
||||
jge Lcos_remainder_piby2
|
||||
|
||||
; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
|
||||
; have the following conventions:
|
||||
; on input
|
||||
; x is in xmm0
|
||||
; on output
|
||||
; r is in xmm0
|
||||
; rr is in xmm1
|
||||
; region is in rax
|
||||
; The _bdl routine is guaranteed not to touch r10
|
||||
|
||||
Lcos_remainder_piby2_small: ;; unused label
|
||||
; Boldo-Daumas-Li reduction for reasonably small |x|
|
||||
call __remainder_piby2_fma3_bdl
|
||||
jmp Lcosf_exit_s
|
||||
|
||||
ALIGN 16
|
||||
Lcos_remainder_piby2:
|
||||
; argument reduction for general x
|
||||
call __remainder_piby2_fma3
|
||||
Lcosf_exit_s:
|
||||
; Here xmm0 = r and rax = region; the low bit of region picks the polynomial.
bt rax,0
|
||||
jnc Lcosf_piby4_compute
|
||||
|
||||
;sinf_piby4_compute:
|
||||
; vmovapd xmm1,__Lsinfarray+010h
|
||||
vmovsd xmm1,__Lsinfarray+010h
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
|
||||
vfmadd231sd xmm1,xmm3,__Lsinfarray+018h ; xmm1 <-- [+10h] + x^2*[+18h]
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray+008h ; Horner steps: xmm1 <-- xmm1*x^2 + next coefficient
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray
|
||||
vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3
|
||||
vfmadd231sd xmm0,xmm1,xmm3 ; xmm0 <-- x + poly*x^3
|
||||
jmp Lcosf_fma3_adjust_sign
|
||||
|
||||
ALIGN 16
|
||||
Lcosf_piby4_compute:
|
||||
vmovapd xmm2,L_real_one
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
|
||||
vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r
|
||||
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- 1.0 - r
|
||||
vmovsd xmm1,__Lcosfarray+018h
|
||||
vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h ; xmm1 <-- [+18h] + x^2*[+20h]
|
||||
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h ; Horner steps: xmm1 <-- xmm1*x^2 + next coefficient
|
||||
vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h
|
||||
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
|
||||
vmovdqa xmm0, xmm2
|
||||
vfmadd231sd xmm0 ,xmm1,xmm3 ; xmm0 <-- (1.0 - r) + poly*x^4
|
||||
|
||||
Lcosf_fma3_adjust_sign:
|
||||
; assuming FMA3 ==> AVX ==> SSE4.1
|
||||
; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one
|
||||
; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two
|
||||
; vorpd xmm3,xmm2,xmm1
|
||||
|
||||
; vandpd xmm3,xmm3,L_signbit
|
||||
|
||||
; Build a sign-bit mask from the region: after the three instructions
; below, bit 63 of rax equals bit 1 of (region + 1).
add rax,1 ; 1,2 --> 2,3
|
||||
shr rax,1 ; 2,3 --> 1
|
||||
shl rax,63 ; 1 --> sign bit
|
||||
vmovq xmm3,rax
|
||||
|
||||
vxorpd xmm0,xmm0,xmm3 ; flip the sign of the result when the mask is set
|
||||
|
||||
Lcosf_fma3_return:
|
||||
vcvtsd2ss xmm0,xmm0,xmm0 ; narrow the double result back to float
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Lcosf_fma3_naninf:
|
||||
; xmm0 still holds the caller's float x here
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
fname endp
|
||||
END
|
344
sdk/lib/crt/math/libm_sse2/cosh.c
Normal file
344
sdk/lib/crt/math/libm_sse2/cosh.c
Normal file
|
@ -0,0 +1,344 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_SPLITEXP
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_SCALEDOUBLE_2
|
||||
#define USE_INFINITY_WITH_FLAGS
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_SPLITEXP
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_SCALEDOUBLE_2
|
||||
#undef USE_INFINITY_WITH_FLAGS
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
|
||||
#pragma function(cosh)
|
||||
double cosh(double x)
|
||||
{
|
||||
/*
|
||||
Derived from sinh subroutine
|
||||
|
||||
After dealing with special cases the computation is split into
|
||||
regions as follows:
|
||||
|
||||
abs(x) >= max_cosh_arg:
|
||||
cosh(x) = sign(x)*Inf
|
||||
|
||||
abs(x) >= small_threshold:
|
||||
cosh(x) = sign(x)*exp(abs(x))/2 computed using the
|
||||
splitexp and scaleDouble functions as for exp_amd().
|
||||
|
||||
abs(x) < small_threshold:
|
||||
compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
|
||||
cosh(x) is then sign(x)*z. */
|
||||
|
||||
static const double
|
||||
max_cosh_arg = 7.10475860073943977113e+02, /* 0x408633ce8fb9f87e */
|
||||
thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
|
||||
log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
|
||||
log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
|
||||
// small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
|
||||
small_threshold = 20.0;
|
||||
/* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */
|
||||
|
||||
/* Lead and tail tabulated values of sinh(i) and cosh(i)
|
||||
for i = 0,...,36. The lead part has 26 leading bits. */
|
||||
|
||||
static const double sinh_lead[ 37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
1.17520117759704589844e+00, /* 0x3ff2cd9fc0000000 */
|
||||
3.62686038017272949219e+00, /* 0x400d03cf60000000 */
|
||||
1.00178747177124023438e+01, /* 0x40240926e0000000 */
|
||||
2.72899169921875000000e+01, /* 0x403b4a3800000000 */
|
||||
7.42032089233398437500e+01, /* 0x40528d0160000000 */
|
||||
2.01713153839111328125e+02, /* 0x406936d228000000 */
|
||||
5.48316116333007812500e+02, /* 0x4081228768000000 */
|
||||
1.49047882080078125000e+03, /* 0x409749ea50000000 */
|
||||
4.05154187011718750000e+03, /* 0x40afa71570000000 */
|
||||
1.10132326660156250000e+04, /* 0x40c5829dc8000000 */
|
||||
2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
|
||||
8.13773945312500000000e+04, /* 0x40f3de1650000000 */
|
||||
2.21206695312500000000e+05, /* 0x410b00b590000000 */
|
||||
6.01302140625000000000e+05, /* 0x412259ac48000000 */
|
||||
1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
|
||||
4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
|
||||
1.20774762500000000000e+07, /* 0x4167093488000000 */
|
||||
3.28299845000000000000e+07, /* 0x417f4f2208000000 */
|
||||
8.92411500000000000000e+07, /* 0x419546d8f8000000 */
|
||||
2.42582596000000000000e+08, /* 0x41aceb0888000000 */
|
||||
6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
|
||||
1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
|
||||
4.87240166400000000000e+09, /* 0x41f226af30000000 */
|
||||
1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
|
||||
3.60024494080000000000e+10, /* 0x4220c3d390000000 */
|
||||
9.78648043520000000000e+10, /* 0x4236c93268000000 */
|
||||
2.66024116224000000000e+11, /* 0x424ef822f0000000 */
|
||||
7.23128516608000000000e+11, /* 0x42650bba30000000 */
|
||||
1.96566712320000000000e+12, /* 0x427c9aae40000000 */
|
||||
5.34323724288000000000e+12, /* 0x4293704708000000 */
|
||||
1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
|
||||
3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
|
||||
1.07321789251584000000e+14, /* 0x42d866f348000000 */
|
||||
2.91730863685632000000e+14, /* 0x42f0953e28000000 */
|
||||
7.93006722514944000000e+14, /* 0x430689e220000000 */
|
||||
2.15561576592179200000e+15}; /* 0x431ea215a0000000 */
|
||||
|
||||
static const double sinh_tail[ 37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
1.60467555584448807892e-08, /* 0x3e513ae6096a0092 */
|
||||
2.76742892754807136947e-08, /* 0x3e5db70cfb79a640 */
|
||||
2.09697499555224576530e-07, /* 0x3e8c2526b66dc067 */
|
||||
2.04940252448908240062e-07, /* 0x3e8b81b18647f380 */
|
||||
1.65444891522700935932e-06, /* 0x3ebbc1cdd1e1eb08 */
|
||||
3.53116789999998198721e-06, /* 0x3ecd9f201534fb09 */
|
||||
6.94023870987375490695e-06, /* 0x3edd1c064a4e9954 */
|
||||
4.98876893611587449271e-06, /* 0x3ed4eca65d06ea74 */
|
||||
3.19656024605152215752e-05, /* 0x3f00c259bcc0ecc5 */
|
||||
2.08687768377236501204e-04, /* 0x3f2b5a6647cf9016 */
|
||||
4.84668088325403796299e-05, /* 0x3f09691adefb0870 */
|
||||
1.17517985422733832468e-03, /* 0x3f53410fc29cde38 */
|
||||
6.90830086959560562415e-04, /* 0x3f46a31a50b6fb3c */
|
||||
1.45697262451506548420e-03, /* 0x3f57defc71805c40 */
|
||||
2.99859023684906737806e-02, /* 0x3f9eb49fd80e0bab */
|
||||
1.02538800507941396667e-02, /* 0x3f84fffc7bcd5920 */
|
||||
1.26787628407699110022e-01, /* 0x3fc03a93b6c63435 */
|
||||
6.86652479544033744752e-02, /* 0x3fb1940bb255fd1c */
|
||||
4.81593627621056619148e-01, /* 0x3fded26e14260b50 */
|
||||
1.70489513795397629181e+00, /* 0x3ffb47401fc9f2a2 */
|
||||
1.12416073482258713767e+01, /* 0x40267bb3f55634f1 */
|
||||
7.06579578070110514432e+00, /* 0x401c435ff8194ddc */
|
||||
5.91244512999659974639e+01, /* 0x404d8fee052ba63a */
|
||||
1.68921736147050694399e+02, /* 0x40651d7edccde3f6 */
|
||||
2.60692936262073658327e+02, /* 0x40704b1644557d1a */
|
||||
3.62419382134885609048e+02, /* 0x4076a6b5ca0a9dc4 */
|
||||
4.07689930834187271103e+03, /* 0x40afd9cc72249aba */
|
||||
1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
|
||||
2.53720210371943067003e+04, /* 0x40d8c70158ac6363 */
|
||||
4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
|
||||
1.81871712615542812273e+05, /* 0x4106337db36fc718 */
|
||||
5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
|
||||
6.41374032312148716301e+05, /* 0x412392bc108b37cc */
|
||||
7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
|
||||
3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
|
||||
7.63580561355670914054e+06}; /* 0x415d20d76744835c */
|
||||
|
||||
static const double cosh_lead[ 37] = {
|
||||
1.00000000000000000000e+00, /* 0x3ff0000000000000 */
|
||||
1.54308062791824340820e+00, /* 0x3ff8b07550000000 */
|
||||
3.76219564676284790039e+00, /* 0x400e18fa08000000 */
|
||||
1.00676617622375488281e+01, /* 0x402422a490000000 */
|
||||
2.73082327842712402344e+01, /* 0x403b4ee858000000 */
|
||||
7.42099475860595703125e+01, /* 0x40528d6fc8000000 */
|
||||
2.01715633392333984375e+02, /* 0x406936e678000000 */
|
||||
5.48317031860351562500e+02, /* 0x4081228948000000 */
|
||||
1.49047915649414062500e+03, /* 0x409749eaa8000000 */
|
||||
4.05154199218750000000e+03, /* 0x40afa71580000000 */
|
||||
1.10132329101562500000e+04, /* 0x40c5829dd0000000 */
|
||||
2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
|
||||
8.13773945312500000000e+04, /* 0x40f3de1650000000 */
|
||||
2.21206695312500000000e+05, /* 0x410b00b590000000 */
|
||||
6.01302140625000000000e+05, /* 0x412259ac48000000 */
|
||||
1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
|
||||
4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
|
||||
1.20774762500000000000e+07, /* 0x4167093488000000 */
|
||||
3.28299845000000000000e+07, /* 0x417f4f2208000000 */
|
||||
8.92411500000000000000e+07, /* 0x419546d8f8000000 */
|
||||
2.42582596000000000000e+08, /* 0x41aceb0888000000 */
|
||||
6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
|
||||
1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
|
||||
4.87240166400000000000e+09, /* 0x41f226af30000000 */
|
||||
1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
|
||||
3.60024494080000000000e+10, /* 0x4220c3d390000000 */
|
||||
9.78648043520000000000e+10, /* 0x4236c93268000000 */
|
||||
2.66024116224000000000e+11, /* 0x424ef822f0000000 */
|
||||
7.23128516608000000000e+11, /* 0x42650bba30000000 */
|
||||
1.96566712320000000000e+12, /* 0x427c9aae40000000 */
|
||||
5.34323724288000000000e+12, /* 0x4293704708000000 */
|
||||
1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
|
||||
3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
|
||||
1.07321789251584000000e+14, /* 0x42d866f348000000 */
|
||||
2.91730863685632000000e+14, /* 0x42f0953e28000000 */
|
||||
7.93006722514944000000e+14, /* 0x430689e220000000 */
|
||||
2.15561576592179200000e+15}; /* 0x431ea215a0000000 */
|
||||
|
||||
static const double cosh_tail[ 37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
6.89700037027478056904e-09, /* 0x3e3d9f5504c2bd28 */
|
||||
4.43207835591715833630e-08, /* 0x3e67cb66f0a4c9fd */
|
||||
2.33540217013828929694e-07, /* 0x3e8f58617928e588 */
|
||||
5.17452463948269748331e-08, /* 0x3e6bc7d000c38d48 */
|
||||
9.38728274131605919153e-07, /* 0x3eaf7f9d4e329998 */
|
||||
2.73012191010840495544e-06, /* 0x3ec6e6e464885269 */
|
||||
3.29486051438996307950e-06, /* 0x3ecba3a8b946c154 */
|
||||
4.75803746362771416375e-06, /* 0x3ed3f4e76110d5a4 */
|
||||
3.33050940471947692369e-05, /* 0x3f017622515a3e2b */
|
||||
9.94707313972136215365e-06, /* 0x3ee4dc4b528af3d0 */
|
||||
6.51685096227860253398e-05, /* 0x3f11156278615e10 */
|
||||
1.18132406658066663359e-03, /* 0x3f535ad50ed821f5 */
|
||||
6.93090416366541877541e-04, /* 0x3f46b61055f2935c */
|
||||
1.45780415323416845386e-03, /* 0x3f57e2794a601240 */
|
||||
2.99862082708111758744e-02, /* 0x3f9eb4b45f6aadd3 */
|
||||
1.02539925859688602072e-02, /* 0x3f85000b967b3698 */
|
||||
1.26787669807076286421e-01, /* 0x3fc03a940fadc092 */
|
||||
6.86652631843830962843e-02, /* 0x3fb1940bf3bf874c */
|
||||
4.81593633223853068159e-01, /* 0x3fded26e1a2a2110 */
|
||||
1.70489514001513020602e+00, /* 0x3ffb4740205796d6 */
|
||||
1.12416073489841270572e+01, /* 0x40267bb3f55cb85d */
|
||||
7.06579578098005001152e+00, /* 0x401c435ff81e18ac */
|
||||
5.91244513000686140458e+01, /* 0x404d8fee052bdea4 */
|
||||
1.68921736147088438429e+02, /* 0x40651d7edccde926 */
|
||||
2.60692936262087528121e+02, /* 0x40704b1644557e0e */
|
||||
3.62419382134890611269e+02, /* 0x4076a6b5ca0a9e1c */
|
||||
4.07689930834187453002e+03, /* 0x40afd9cc72249abe */
|
||||
1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
|
||||
2.53720210371943103382e+04, /* 0x40d8c70158ac6364 */
|
||||
4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
|
||||
1.81871712615542812273e+05, /* 0x4106337db36fc718 */
|
||||
5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
|
||||
6.41374032312148716301e+05, /* 0x412392bc108b37cc */
|
||||
7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
|
||||
3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
|
||||
7.63580561355670914054e+06}; /* 0x415d20d76744835c */
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
double y, z, z1, z2;
|
||||
int m;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
if (aux < 0x3e30000000000000) /* |x| small enough that cosh(x) = 1 */
|
||||
{
|
||||
if (aux == 0)
|
||||
/* with no inexact */
|
||||
return 1.0;
|
||||
else
|
||||
return val_with_flags(1.0, AMD_F_INEXACT);
|
||||
}
|
||||
else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */
|
||||
{
|
||||
if (aux > PINFBITPATT_DP64) /* x is NaN */
|
||||
return _handle_error("cosh", OP_COSH, ux|0x0008000000000000,_DOMAIN,
|
||||
0,EDOM, x, 0.0, 1);
|
||||
else /* x is infinity */
|
||||
return infinity_with_flags(0);
|
||||
}
|
||||
|
||||
xneg = (aux != ux);
|
||||
|
||||
y = x;
|
||||
if (xneg) y = -x;
|
||||
|
||||
if (y >= max_cosh_arg)
|
||||
{
|
||||
return _handle_error("cosh", OP_COSH, PINFBITPATT_DP64,_OVERFLOW,
|
||||
AMD_F_INEXACT|AMD_F_OVERFLOW,ERANGE, x, 0.0, 1);
|
||||
|
||||
// z = infinity_with_flags(AMD_F_OVERFLOW);
|
||||
}
|
||||
else if (y >= small_threshold)
|
||||
{
|
||||
/* In this range y is large enough so that
|
||||
the negative exponential is negligible,
|
||||
so cosh(y) is approximated by sign(x)*exp(y)/2. The
|
||||
code below is an inlined version of that from
|
||||
exp() with two changes (it operates on
|
||||
y instead of x, and the division by 2 is
|
||||
done by reducing m by 1). */
|
||||
|
||||
splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
|
||||
log2_by_32_tail, &m, &z1, &z2);
|
||||
m -= 1;
|
||||
|
||||
if (m >= EMIN_DP64 && m <= EMAX_DP64)
|
||||
z = scaleDouble_1((z1+z2),m);
|
||||
else
|
||||
z = scaleDouble_2((z1+z2),m);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* In this range we find the integer part y0 of y
|
||||
and the increment dy = y - y0. We then compute
|
||||
|
||||
z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
|
||||
z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
|
||||
|
||||
where sinh(y0) and cosh(y0) are tabulated above. */
|
||||
|
||||
int ind;
|
||||
double dy, dy2, sdy, cdy;
|
||||
|
||||
ind = (int)y;
|
||||
dy = y - ind;
|
||||
|
||||
dy2 = dy*dy;
|
||||
sdy = dy*dy2*(0.166666666666666667013899e0 +
|
||||
(0.833333333333329931873097e-2 +
|
||||
(0.198412698413242405162014e-3 +
|
||||
(0.275573191913636406057211e-5 +
|
||||
(0.250521176994133472333666e-7 +
|
||||
(0.160576793121939886190847e-9 +
|
||||
0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
cdy = dy2*(0.500000000000000005911074e0 +
|
||||
(0.416666666666660876512776e-1 +
|
||||
(0.138888888889814854814536e-2 +
|
||||
(0.248015872460622433115785e-4 +
|
||||
(0.275573350756016588011357e-6 +
|
||||
(0.208744349831471353536305e-8 +
|
||||
0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
/* At this point sinh(dy) is approximated by dy + sdy, and cosh(dy) is approximated by 1 + cdy.
|
||||
Shift some significant bits from dy to cdy. */
|
||||
#if 0
|
||||
double sdy1,sdy2;
|
||||
GET_BITS_DP64(dy, ux);
|
||||
ux &= 0xfffffffff8000000;
|
||||
PUT_BITS_DP64(ux, sdy1); // sdy1 is upper 53-27=26 significant bits of dy.
|
||||
sdy2 = sdy + (dy - sdy1); // sdy2 is sdy + lower bits of dy
|
||||
|
||||
z = ((((((cosh_tail[ind]*cdy + sinh_tail[ind]*sdy2)
|
||||
+ sinh_tail[ind]*sdy1) + cosh_tail[ind])
|
||||
+ cosh_lead[ind]*cdy) + sinh_lead[ind]*sdy2)
|
||||
+ sinh_lead[ind]*sdy1) + cosh_lead[ind];
|
||||
#else
|
||||
z = ((((((cosh_tail[ind]*cdy + sinh_tail[ind]*sdy)
|
||||
+ sinh_tail[ind]*dy) + cosh_tail[ind])
|
||||
+ cosh_lead[ind]*cdy) + sinh_lead[ind]*sdy)
|
||||
+ sinh_lead[ind]*dy) + cosh_lead[ind];
|
||||
#endif
|
||||
}
|
||||
|
||||
return z;
|
||||
}
|
247
sdk/lib/crt/math/libm_sse2/coshf.c
Normal file
247
sdk/lib/crt/math/libm_sse2/coshf.c
Normal file
|
@ -0,0 +1,247 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_SPLITEXP
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_SCALEDOUBLE_2
|
||||
#define USE_INFINITYF_WITH_FLAGS
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_SPLITEXP
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_SCALEDOUBLE_2
|
||||
#undef USE_INFINITYF_WITH_FLAGS
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(coshf)
|
||||
|
||||
float coshf(float fx)
|
||||
{
|
||||
/*
|
||||
After dealing with special cases the computation is split into
|
||||
regions as follows:
|
||||
|
||||
abs(x) >= max_cosh_arg:
|
||||
cosh(x) = sign(x)*Inf
|
||||
|
||||
abs(x) >= small_threshold:
|
||||
cosh(x) = sign(x)*exp(abs(x))/2 computed using the
|
||||
splitexp and scaleDouble functions as for exp_amd().
|
||||
|
||||
abs(x) < small_threshold:
|
||||
compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
|
||||
cosh(x) is then sign(x)*z. */
|
||||
|
||||
static const double
|
||||
/* The max argument of coshf, but stored as a double */
|
||||
max_cosh_arg = 8.94159862922329438106e+01, /* 0x40565a9f84f82e63 */
|
||||
thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
|
||||
log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
|
||||
log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
|
||||
|
||||
small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
|
||||
// small_threshold = 20.0;
|
||||
/* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */
|
||||
|
||||
/* Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. */
|
||||
|
||||
static const double sinh_lead[ 37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
1.17520119364380137839e+00, /* 0x3ff2cd9fc44eb982 */
|
||||
3.62686040784701857476e+00, /* 0x400d03cf63b6e19f */
|
||||
1.00178749274099008204e+01, /* 0x40240926e70949ad */
|
||||
2.72899171971277496596e+01, /* 0x403b4a3803703630 */
|
||||
7.42032105777887522891e+01, /* 0x40528d0166f07374 */
|
||||
2.01713157370279219549e+02, /* 0x406936d22f67c805 */
|
||||
5.48316123273246489589e+02, /* 0x408122876ba380c9 */
|
||||
1.49047882578955000099e+03, /* 0x409749ea514eca65 */
|
||||
4.05154190208278987484e+03, /* 0x40afa7157430966f */
|
||||
1.10132328747033916443e+04, /* 0x40c5829dced69991 */
|
||||
2.99370708492480553105e+04, /* 0x40dd3c4488cb48d6 */
|
||||
8.13773957064298447222e+04, /* 0x40f3de1654d043f0 */
|
||||
2.21206696003330085659e+05, /* 0x410b00b5916a31a5 */
|
||||
6.01302142081972560845e+05, /* 0x412259ac48bef7e3 */
|
||||
1.63450868623590236530e+06, /* 0x4138f0ccafad27f6 */
|
||||
4.44305526025387924165e+06, /* 0x4150f2ebd0a7ffe3 */
|
||||
1.20774763767876271158e+07, /* 0x416709348c0ea4ed */
|
||||
3.28299845686652474105e+07, /* 0x417f4f22091940bb */
|
||||
8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
|
||||
2.42582597704895108938e+08, /* 0x41aceb088b68e803 */
|
||||
6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
|
||||
1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
|
||||
4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
|
||||
1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
|
||||
3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
|
||||
9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
|
||||
2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
|
||||
7.23128532145737548828e+11, /* 0x42650bba3796379a */
|
||||
1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
|
||||
5.34323729076223046875e+12, /* 0x429370470aec28ec */
|
||||
1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
|
||||
3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
|
||||
1.07321789892958031250e+14, /* 0x42d866f34a725782 */
|
||||
2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
|
||||
7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
|
||||
2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */
|
||||
|
||||
static const double cosh_lead[ 37] = {
|
||||
1.00000000000000000000e+00, /* 0x3ff0000000000000 */
|
||||
1.54308063481524371241e+00, /* 0x3ff8b07551d9f550 */
|
||||
3.76219569108363138810e+00, /* 0x400e18fa0df2d9bc */
|
||||
1.00676619957777653269e+01, /* 0x402422a497d6185e */
|
||||
2.73082328360164865444e+01, /* 0x403b4ee858de3e80 */
|
||||
7.42099485247878334349e+01, /* 0x40528d6fcbeff3a9 */
|
||||
2.01715636122455890700e+02, /* 0x406936e67db9b919 */
|
||||
5.48317035155212010977e+02, /* 0x4081228949ba3a8b */
|
||||
1.49047916125217807348e+03, /* 0x409749eaa93f4e76 */
|
||||
4.05154202549259389343e+03, /* 0x40afa715845d8894 */
|
||||
1.10132329201033226127e+04, /* 0x40c5829dd053712d */
|
||||
2.99370708659497577173e+04, /* 0x40dd3c4489115627 */
|
||||
8.13773957125740562333e+04, /* 0x40f3de1654d6b543 */
|
||||
2.21206696005590405548e+05, /* 0x410b00b5916b6105 */
|
||||
6.01302142082804115489e+05, /* 0x412259ac48bf13ca */
|
||||
1.63450868623620807193e+06, /* 0x4138f0ccafad2d17 */
|
||||
4.44305526025399193168e+06, /* 0x4150f2ebd0a8005c */
|
||||
1.20774763767876680940e+07, /* 0x416709348c0ea503 */
|
||||
3.28299845686652623117e+07, /* 0x417f4f22091940bf */
|
||||
8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
|
||||
2.42582597704895138741e+08, /* 0x41aceb088b68e804 */
|
||||
6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
|
||||
1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
|
||||
4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
|
||||
1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
|
||||
3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
|
||||
9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
|
||||
2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
|
||||
7.23128532145737548828e+11, /* 0x42650bba3796379a */
|
||||
1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
|
||||
5.34323729076223046875e+12, /* 0x429370470aec28ec */
|
||||
1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
|
||||
3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
|
||||
1.07321789892958031250e+14, /* 0x42d866f34a725782 */
|
||||
2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
|
||||
7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
|
||||
2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
unsigned int uhx;
|
||||
double x = fx, y, z, z1, z2;
|
||||
int m;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
if (aux < 0x3f10000000000000) /* |x| small enough that cosh(x) = 1 */
|
||||
{
|
||||
if (aux == 0) return (float)1.0; /* with no inexact */
|
||||
if (LAMBDA_DP64 + x > 1.0) return valf_with_flags((float)1.0, AMD_F_INEXACT); /* with inexact */
|
||||
}
|
||||
else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */
|
||||
if (aux > PINFBITPATT_DP64) /* x is NaN */
|
||||
{
|
||||
GET_BITS_SP32(fx, uhx);
|
||||
return _handle_errorf("coshf",OP_COSH,uhx|0x00400000,_DOMAIN, 0,
|
||||
EDOM, fx, 0.0, 1);
|
||||
}
|
||||
else /* x is infinity */
|
||||
return infinityf_with_flags(0);
|
||||
xneg = (aux != ux);
|
||||
|
||||
y = x;
|
||||
if (xneg) y = -x;
|
||||
|
||||
if (y >= max_cosh_arg)
|
||||
/* Return +infinity with overflow flag. */
|
||||
return _handle_errorf("coshf",OP_COSH,PINFBITPATT_SP32,_OVERFLOW,
|
||||
AMD_F_INEXACT|AMD_F_OVERFLOW,ERANGE, fx, 0.0, 1);
|
||||
// z = infinity_with_flags(AMD_F_OVERFLOW);
|
||||
else if (y >= small_threshold)
|
||||
{
|
||||
/* In this range y is large enough so that
|
||||
the negative exponential is negligible,
|
||||
so cosh(y) is approximated by sign(x)*exp(y)/2. The
|
||||
code below is an inlined version of that from
|
||||
exp() with two changes (it operates on
|
||||
y instead of x, and the division by 2 is
|
||||
done by reducing m by 1). */
|
||||
|
||||
splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
|
||||
log2_by_32_tail, &m, &z1, &z2);
|
||||
m -= 1;
|
||||
|
||||
/* scaleDouble_1 is always safe because the argument x was
|
||||
float, rather than double */
|
||||
z = scaleDouble_1((z1+z2),m);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* In this range we find the integer part y0 of y
|
||||
and the increment dy = y - y0. We then compute
|
||||
|
||||
z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
|
||||
z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
|
||||
|
||||
where sinh(y0) and cosh(y0) are tabulated above. */
|
||||
|
||||
int ind;
|
||||
double dy, dy2, sdy, cdy;
|
||||
|
||||
ind = (int)y;
|
||||
dy = y - ind;
|
||||
|
||||
dy2 = dy*dy;
|
||||
|
||||
sdy = dy + dy*dy2*(0.166666666666666667013899e0 +
|
||||
(0.833333333333329931873097e-2 +
|
||||
(0.198412698413242405162014e-3 +
|
||||
(0.275573191913636406057211e-5 +
|
||||
(0.250521176994133472333666e-7 +
|
||||
(0.160576793121939886190847e-9 +
|
||||
0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
cdy = 1 + dy2*(0.500000000000000005911074e0 +
|
||||
(0.416666666666660876512776e-1 +
|
||||
(0.138888888889814854814536e-2 +
|
||||
(0.248015872460622433115785e-4 +
|
||||
(0.275573350756016588011357e-6 +
|
||||
(0.208744349831471353536305e-8 +
|
||||
0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
z = cosh_lead[ind]*cdy + sinh_lead[ind]*sdy;
|
||||
}
|
||||
|
||||
// if (xneg) z = - z;
|
||||
return (float)z;
|
||||
}
|
439
sdk/lib/crt/math/libm_sse2/exp.asm
Normal file
439
sdk/lib/crt/math/libm_sse2/exp.asm
Normal file
|
@ -0,0 +1,439 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; exp.asm
|
||||
;
|
||||
; An implementation of the exp libm function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; double exp(double x);
|
||||
;
|
||||
|
||||
;
|
||||
; Algorithm:
|
||||
;
|
||||
; e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
|
||||
;
|
||||
; x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
|
||||
; n = 64*m + j, 0 <= j < 64
|
||||
;
|
||||
; e^x = 2^((64*m + j + f)/64)
|
||||
; = (2^m) * (2^(j/64)) * 2^(f/64)
|
||||
; = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
|
||||
;
|
||||
; f = x*(64/ln(2)) - n
|
||||
; r = f*(ln(2)/64) = x - n*(ln(2)/64)
|
||||
;
|
||||
; e^x = (2^m) * (2^(j/64)) * e^r
|
||||
;
|
||||
; (2^(j/64)) is precomputed
|
||||
;
|
||||
; e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^6)/6!
|
||||
; e^r = 1 + q
|
||||
;
|
||||
; q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^6)/6!
|
||||
;
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
; these codes and the ones in the corresponding .c file have to match
|
||||
__flag_x_nan DD 00000001
|
||||
__flag_y_zero DD 00000002
|
||||
__flag_y_inf DD 00000003
|
||||
|
||||
ALIGN 16
|
||||
|
||||
L__real_1_by_720 DQ 03f56c16c16c16c17h
|
||||
DQ 03f56c16c16c16c17h ; 1/720
|
||||
L__real_1_by_120 DQ 03f81111111111111h
|
||||
DQ 03f81111111111111h ; 1/120
|
||||
L__real_1_by_6 DQ 03fc5555555555555h
|
||||
DQ 03fc5555555555555h ; 1/6
|
||||
L__real_1_by_2 DQ 03fe0000000000000h
|
||||
DQ 03fe0000000000000h ; 1/2
|
||||
L__real_1_by_24 DQ 03fa5555555555555h
|
||||
DQ 03fa5555555555555h ; 1/24
|
||||
|
||||
ALIGN 16
|
||||
L__log2_by_64_mtail_mhead DQ 0bf862e42fefa0000h, 0bd1cf79abc9e3b39h
|
||||
L__ln_of_smallest_normal DQ 0C086232BDD7ABCD2h
|
||||
L__zero DQ 00000000000000000h
|
||||
L__max_exp_arg DQ 040862e42fefa39efh ; 709.78271289338397
|
||||
L__denormal_tiny_threshold DQ 0c0874046dfefd9d0h ; -744.03460681327306
|
||||
L__min_exp_arg DQ 0c0874910d52d3051h ; -745.13321910194111
|
||||
L__real_64_by_log2 DQ 040571547652b82feh ; 64/ln(2)
|
||||
L__positive_infinity DQ 07ff0000000000000h
|
||||
L__negative_infinity DQ 0fff0000000000000h
|
||||
L__real_qnanbit DQ 0008000000000000h ; qnan set bit
|
||||
L__real_x_near0_threshold DQ 3c00000000000000h
|
||||
L__log2_by_64_mhead DQ 0bf862e42fefa0000h
|
||||
L__log2_by_64_mtail DQ 0bd1cf79abc9e3b39h
|
||||
L__real_smallest_denormal DQ 00000000000000001h
|
||||
L__real_one DQ 03ff0000000000000h
|
||||
L__2_to_neg_26 DQ 03E50000000000000h ; 2^-26
|
||||
L__min_normal DQ 00010000000000000h ; smallest normal
|
||||
|
||||
|
||||
EXTRN __two_to_jby64_table:QWORD
|
||||
EXTRN __two_to_jby64_head_table:QWORD
|
||||
EXTRN __two_to_jby64_tail_table:QWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
; make room for fname_special to save things
|
||||
dummy_space EQU 020h
|
||||
stack_size EQU 038h
|
||||
|
||||
include fm.inc
|
||||
|
||||
fname TEXTEQU <exp>
|
||||
fname_special TEXTEQU <_exp_special>
|
||||
|
||||
;Define name and any external functions being called
|
||||
EXTERN fname_special : PROC
|
||||
|
||||
.code
|
||||
PUBLIC fname
|
||||
; ---------------------------------------------------------------------------
; double exp(double x)
;
; In:    xmm0 = x
; Out:   xmm0 = e^x
; Uses:  rax, rcx, rdx, r8, r10, r11, xmm1-xmm4, flags
;
; The routine dispatches on __use_fma3_lib: zero selects the SSE2 code,
; non-zero selects the FMA3 code at Lexp_fma3.  Both paths implement the
; algorithm described at the top of this file and share the special cases:
;
;   x is NaN                  -> fname_special(x, x | qnan bit, __flag_x_nan)
;   x = +Inf                  -> x returned unchanged
;   x = -Inf                  -> +0, no call to fname_special
;   x > L__max_exp_arg        -> fname_special(x, +Inf, __flag_y_inf)
;   x <= L__min_exp_arg       -> fname_special(x, +0,   __flag_y_zero)
;   L__min_exp_arg < x < L__denormal_tiny_threshold
;                             -> smallest denormal
;   |x| <= 2^-26              -> 1 + x (1 + smallest normal when |x| is no
;                                larger than the smallest normal)
;
; NOTE(review): StackAllocate/StackDeallocate come from fm.inc, which is not
; visible here; stack_size (038h) presumably covers the 020h bytes of home
; space that fname_special may use plus alignment padding - confirm.
; ---------------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; We need to avoid unwanted exceptions from a NaN argument, so NaN and
    ; infinity are detected with integer compares on the bit pattern before
    ; any floating-point comparison touches x.
    movd        rdx, xmm0
    btr         rdx, 63                         ; rdx <-- |x| (bit pattern); kept live for the near-zero test
    cmp         rdx, L__positive_infinity
    jge         Lexp_x_is_nan_or_inf            ; signed compare is safe: bit 63 is clear

    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Lexp_fma3

    movapd      xmm2, xmm0
    movapd      xmm3, xmm0

    ; Some hardware has problems with too many branches in a single
    ; 16- or 32-byte window, so let's peel off the common case into
    ; a single branch.
    cmplesd     xmm2, L__max_exp_arg            ; xmm2 <-- all ones if x is not too big positive
    cmpnltsd    xmm3, L__denormal_tiny_threshold ; xmm3 <-- all ones if x is not too big negative
    andps       xmm2, xmm3                      ; xmm2 <-- all ones if x is in range, 0 otherwise
    ucomisd     xmm2, xmm2                      ; all ones is a NaN pattern, so PF is set for in-range x
    jp          Lexp_y_is_finite

    ucomisd     xmm0, L__max_exp_arg
    ja          Lexp_y_is_inf
    ; Since we peeled off the cases with normal result,
    ; there is only one possibility remaining:
    jmp         Lexp_y_is_denormal_or_zero

    ALIGN 16
Lexp_y_is_finite:
    movapd      xmm1, xmm0
    ; rdx still holds |x| from the entry check (nothing in between writes it)
    cmp         rdx, L__2_to_neg_26
    jbe         Lexp_return_1_plus_x

    ; x * (64/ln(2))
    mulsd       xmm1, L__real_64_by_log2

    ; n = int( x * (64/ln(2)) )
    cvttpd2dq   xmm2, xmm1                      ; xmm2 = (int)n
    cvtdq2pd    xmm1, xmm2                      ; xmm1 = (double)n
    movd        ecx, xmm2
    movapd      xmm2, xmm1

    ; r1 = x - n * ln(2)/64 head
    mulsd       xmm1, L__log2_by_64_mhead

    ; j = n & 0x3f
    mov         rax, 03fh
    and         eax, ecx                        ; eax = j
    ; m = (n - j) / 64
    sar         ecx, 6                          ; ecx = m

    ; r2 = - n * ln(2)/64 tail
    mulsd       xmm2, L__log2_by_64_mtail
    addsd       xmm0, xmm1                      ; xmm0 = r1

    ; r1+r2
    addsd       xmm2, xmm0                      ; xmm2 = r

    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
    ; The polynomial is evaluated as two interleaved partial sums so the
    ; multiplies of one can overlap the adds of the other.
    movapd      xmm3, L__real_1_by_720          ; xmm3 = 1/720
    mulsd       xmm3, xmm2                      ; xmm3 = r*1/720
    movapd      xmm0, L__real_1_by_6            ; xmm0 = 1/6
    movapd      xmm1, xmm2                      ; xmm1 = r
    mulsd       xmm0, xmm2                      ; xmm0 = r*1/6
    addsd       xmm3, L__real_1_by_120          ; xmm3 = 1/120 + (r*1/720)
    mulsd       xmm1, xmm2                      ; xmm1 = r*r
    addsd       xmm0, L__real_1_by_2            ; xmm0 = 1/2 + (r*1/6)
    movapd      xmm4, xmm1                      ; xmm4 = r*r
    mulsd       xmm4, xmm1                      ; xmm4 = (r*r) * (r*r)
    mulsd       xmm3, xmm2                      ; xmm3 = r * (1/120 + (r*1/720))
    mulsd       xmm0, xmm1                      ; xmm0 = (r*r)*(1/2 + (r*1/6))
    addsd       xmm3, L__real_1_by_24           ; xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
    addsd       xmm0, xmm2                      ; xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
    mulsd       xmm3, xmm4                      ; xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
    addsd       xmm0, xmm3                      ; xmm0 = q

    ; (f)*(q) + f2 + f1
    ; The flags set by this cmp are consumed by the jle below; the lea and
    ; SSE arithmetic instructions in between do not modify them.
    cmp         ecx, 0fffffc02h                 ; -1022
    lea         rdx, __two_to_jby64_table
    lea         r11, __two_to_jby64_tail_table
    lea         r10, __two_to_jby64_head_table
    mulsd       xmm0, QWORD PTR [rdx+rax * 8 ]
    addsd       xmm0, QWORD PTR [r11+rax * 8 ]
    addsd       xmm0, QWORD PTR [r10+rax * 8 ]

    jle         Lexp_process_denormal
Lexp_process_normal:
    ; Scale by 2^m by adding m directly to the exponent field.
    shl         rcx, 52
    movd        xmm2, rcx
    paddq       xmm0, xmm2
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexp_process_denormal:
    ; Flags are still those of "cmp ecx, -1022".
    jl          Lexp_process_true_denormal
    ucomisd     xmm0, L__real_one
    jae         Lexp_process_normal
Lexp_process_true_denormal:
    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
    ; Build the denormal whose bit pattern is 1 << (m + 1074) and
    ; multiply by it instead of editing the exponent field.
    add         ecx, 1074
    mov         rax, 1
    shl         rax, cl
    movd        xmm2, rax
    mulsd       xmm0, xmm2
    jmp         Lexp_finish

    ALIGN 16
Lexp_x_is_nan_or_inf:
    movd        rax, xmm0                       ; reload x with its sign bit intact
    cmp         rax, L__positive_infinity
    je          Lexp_finish                     ; exp(+Inf) = +Inf
    cmp         rax, L__negative_infinity
    je          Lexp_return_zero_without_exception
    or          rax, L__real_qnanbit            ; quieten the NaN
    movd        xmm1, rax
    mov         r8d, __flag_x_nan
    call        fname_special
    jmp         Lexp_finish

    ALIGN 16
Lexp_y_is_inf:
    mov         rax, 07ff0000000000000h
    movd        xmm1, rax
    mov         r8d, __flag_y_inf
    call        fname_special
    jmp         Lexp_finish

    ALIGN 16
Lexp_y_is_denormal_or_zero:
    ucomisd     xmm0, L__min_exp_arg
    jbe         Lexp_y_is_zero
    ; Scalar load: the constant is a single 8-byte value, so a 16-byte
    ; aligned movapd must not be used on it.
    movsd       xmm0, L__real_smallest_denormal
    jmp         Lexp_finish

    ALIGN 16
Lexp_y_is_zero:
    pxor        xmm1, xmm1
    mov         r8d, __flag_y_zero
    call        fname_special
    jmp         Lexp_finish

    ALIGN 16
Lexp_return_1_plus_x:
    cmp         rdx, L__min_normal
    jbe         Lexp_return_1_plus_eps
    addsd       xmm0, L__real_one
    StackDeallocate stack_size
    ret

; Some hardware really does not like subnormals. Try to avoid them.
    ALIGN 16
Lexp_return_1_plus_eps:
    movsd       xmm0, L__real_one
    addsd       xmm0, L__min_normal             ; make sure inexact is set
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexp_return_zero_without_exception:
    pxor        xmm0, xmm0
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexp_finish:
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexp_fma3:
    ; Some hardware has problems with too many branches in a single
    ; 16- or 32-byte window, so let's peel off the common case into
    ; a single branch.
    vcmplesd    xmm2, xmm0, L__max_exp_arg      ; xmm2 <-- all ones if x is not too big positive
    vcmpnltsd   xmm3, xmm0, L__denormal_tiny_threshold ; xmm3 <-- all ones if x is not too big negative
    vandps      xmm2, xmm3, xmm2                ; xmm2 <-- all ones if x is in range, 0 otherwise
    vucomisd    xmm2, xmm2                      ; all ones is a NaN pattern, so PF is set for in-range x
    jp          Lexp_fma3_y_is_finite

    vucomisd    xmm0, L__max_exp_arg
    ja          Lexp_fma3_y_is_inf

    ; Remaining case: the result is subnormal or zero.  Handle it exactly
    ; like the SSE2 path so the result does not depend on the CPU.
    vucomisd    xmm0, L__min_exp_arg
    jbe         Lexp_fma3_y_is_zero
    vmovsd      xmm0, L__real_smallest_denormal
    jmp         Lexp_fma3_finish

    ALIGN 16
Lexp_fma3_y_is_finite:
    ; rdx still holds |x| from the entry check (nothing in between writes it)
    cmp         rdx, L__2_to_neg_26
    jbe         Lexp_fma3_return_1_plus_x

    ; x * (64/ln(2))
    vmulsd      xmm1, xmm0, L__real_64_by_log2

    ; n = int( x * (64/ln(2)) )
    vcvttpd2dq  xmm2, xmm1                      ; xmm2 = (int)n
    vcvtdq2pd   xmm1, xmm2                      ; xmm1 = (double)n ; can use round
    vmovd       ecx, xmm2

    ; r1 = x - n * ln(2)/64 head
    ; r2 =   - n * ln(2)/64 tail
    ; r  = r1+r2
    ; Both products are formed by one packed FMA: the low lane adds x,
    ; the high lane adds 0, and the horizontal add sums the two lanes.
    vmovlhps    xmm1, xmm1, xmm1                ; xmm1 = (double)n, (double)n
    vmovq       xmm0, xmm0                      ; xmm0 = 0, x ; zero out the upper part
    vfmadd132pd xmm1, xmm0, L__log2_by_64_mtail_mhead
    vhaddpd     xmm2, xmm1, xmm1                ; xmm2 = r, r

    ; j = n & 03fh
    mov         rax, 03fh
    and         eax, ecx                        ; eax = j
    ; m = (n - j) / 64
    sar         ecx, 6                          ; ecx = m

    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
    vmovapd     xmm3, L__real_1_by_720
    vfmadd213sd xmm3, xmm2, L__real_1_by_120
    vfmadd213sd xmm3, xmm2, L__real_1_by_24
    vfmadd213sd xmm3, xmm2, L__real_1_by_6
    vfmadd213sd xmm3, xmm2, L__real_1_by_2
    vmulsd      xmm0, xmm2, xmm2
    vfmadd213sd xmm0, xmm3, xmm2                ; xmm0 = q

    ; (f)*(q) + f2 + f1
    ; The flags set by this cmp are consumed by the jle below; the lea and
    ; vector arithmetic instructions in between do not modify them.
    cmp         ecx, 0fffffc02h                 ; -1022
    lea         rdx, __two_to_jby64_table
    lea         r11, __two_to_jby64_tail_table
    lea         r10, __two_to_jby64_head_table
    vmulsd      xmm2, xmm0, QWORD PTR [rdx + rax * 8]
    vaddsd      xmm1, xmm2, QWORD PTR [r11 + rax * 8]
    vaddsd      xmm0, xmm1, QWORD PTR [r10 + rax * 8]

    jle         Lexp_fma3_process_denormal
Lexp_fma3_process_normal:
    ; Scale by 2^m by adding m directly to the exponent field.
    shl         rcx, 52
    vmovq       xmm2, rcx
    vpaddq      xmm0, xmm0, xmm2
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexp_fma3_process_denormal:
    ; Flags are still those of "cmp ecx, -1022".
    jl          Lexp_fma3_process_true_denormal
    vucomisd    xmm0, L__real_one
    jae         Lexp_fma3_process_normal
Lexp_fma3_process_true_denormal:
    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
    add         ecx, 1074
    mov         rax, 1
    shl         rax, cl
    vmovq       xmm2, rax
    vmulsd      xmm0, xmm0, xmm2
    jmp         Lexp_fma3_finish

    ALIGN 16
Lexp_fma3_y_is_inf:
    mov         rax, 07ff0000000000000h
    vmovq       xmm1, rax
    mov         r8d, __flag_y_inf
    call        fname_special
    jmp         Lexp_fma3_finish

    ALIGN 16
Lexp_fma3_return_1_plus_x:
    cmp         rdx, L__min_normal
    jbe         Lexp_fma3_return_1_plus_eps
    vaddsd      xmm0, xmm0, L__real_one
    StackDeallocate stack_size
    ret

; Some hardware really does not like subnormals. Try to avoid them.
    ALIGN 16
Lexp_fma3_return_1_plus_eps:
    vmovsd      xmm0, L__real_one
    vaddsd      xmm0, xmm0, L__min_normal       ; make sure inexact is set
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexp_fma3_y_is_zero:
    vpxor       xmm1, xmm1, xmm1
    mov         r8d, __flag_y_zero
    call        fname_special
    jmp         Lexp_fma3_finish

    ALIGN 16
Lexp_fma3_finish:
    StackDeallocate stack_size
    ret

fname endp
|
||||
END
|
162
sdk/lib/crt/math/libm_sse2/exp2.c
Normal file
162
sdk/lib/crt/math/libm_sse2/exp2.c
Normal file
|
@ -0,0 +1,162 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_SPLITEXP
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_SCALEDOUBLE_2
|
||||
#define USE_ZERO_WITH_FLAGS
|
||||
#define USE_INFINITY_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_ZERO_WITH_FLAGS
|
||||
#undef USE_SPLITEXP
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_SCALEDOUBLE_2
|
||||
#undef USE_INFINITY_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
/* exp2 is only provided for use by powf under Windows, so give
|
||||
it a leading underscore. */
|
||||
double FN_PROTOTYPE(_exp2)(double x)
|
||||
{
|
||||
static const double
|
||||
max_exp2_arg = 1024.0, /* 0x4090000000000000 */
|
||||
min_exp2_arg = -1074.0, /* 0xc090c80000000000 */
|
||||
log2 = 6.931471805599453094178e-01, /* 0x3fe62e42fefa39ef */
|
||||
log2_lead = 6.93147167563438415527E-01, /* 0x3fe62e42f8000000 */
|
||||
log2_tail = 1.29965068938898869640E-08, /* 0x3e4be8e7bcd5e4f1 */
|
||||
one_by_32_lead = 0.03125;
|
||||
|
||||
double y, z1, z2, z, hx, tx, y1, y2;
|
||||
int m;
|
||||
unsigned long ux, ax;
|
||||
|
||||
/*
|
||||
Computation of exp2(x).
|
||||
|
||||
We compute the values m, z1, and z2 such that
|
||||
exp2(x) = 2**m * (z1 + z2), where exp2(x) is 2**x.
|
||||
|
||||
Computations needed in order to obtain m, z1, and z2
|
||||
involve three steps.
|
||||
|
||||
First, we reduce the argument x to the form
|
||||
x = n/32 + remainder,
|
||||
where n has the value of an integer and |remainder| <= 1/64.
|
||||
The value of n = x * 32 rounded to the nearest integer and
|
||||
the remainder = x - n/32.
|
||||
|
||||
Second, we approximate exp2(r1 + r2) - 1 where r1 is the leading
|
||||
part of the remainder and r2 is the trailing part of the remainder.
|
||||
|
||||
Third, we reconstruct exp2(x) so that
|
||||
exp2(x) = 2**m * (z1 + z2).
|
||||
*/
|
||||
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
ax = ux & (~SIGNBIT_DP64);
|
||||
|
||||
if (ax >= 0x4090000000000000) /* abs(x) >= 1024.0 */
|
||||
{
|
||||
if(ax >= 0x7ff0000000000000)
|
||||
{
|
||||
/* x is either NaN or infinity */
|
||||
if (ux & MANTBITS_DP64)
|
||||
/* x is NaN */
|
||||
return _handle_error("exp2", OP_EXP, ux|0x0008000000000000, _DOMAIN,
|
||||
0, EDOM, x, 0.0, 1);
|
||||
else if (ux & SIGNBIT_DP64)
|
||||
/* x is negative infinity; return 0.0 with no flags. */
|
||||
return 0.0;
|
||||
else
|
||||
/* x is positive infinity */
|
||||
return x;
|
||||
}
|
||||
if (x > max_exp2_arg)
|
||||
/* Return +infinity with overflow flag */
|
||||
return _handle_error("exp2", OP_EXP, PINFBITPATT_DP64, _OVERFLOW,
|
||||
AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, 0.0, 1);
|
||||
else if (x < min_exp2_arg)
|
||||
/* x is negative. Return +zero with underflow and inexact flags */
|
||||
return _handle_error("exp2", OP_EXP, 0, _UNDERFLOW,
|
||||
AMD_F_UNDERFLOW | AMD_F_INEXACT, ERANGE, x, 0.0, 1);
|
||||
}
|
||||
|
||||
|
||||
/* Handle small arguments separately */
|
||||
if (ax < 0x3fb7154764ee6c2f) /* abs(x) < 1/(16*log2) */
|
||||
{
|
||||
if (ax < 0x3c00000000000000) /* abs(x) < 2^(-63) */
|
||||
return 1.0 + x; /* Raises inexact if x is non-zero */
|
||||
else
|
||||
{
|
||||
/* Split x into hx (head) and tx (tail). */
|
||||
unsigned long u;
|
||||
hx = x;
|
||||
GET_BITS_DP64(hx, u);
|
||||
u &= 0xfffffffff8000000;
|
||||
PUT_BITS_DP64(u, hx);
|
||||
tx = x - hx;
|
||||
/* Carefully multiply x by log2. y1 is the most significant
|
||||
part of the result, and y2 the least significant part */
|
||||
y1 = x * log2_lead;
|
||||
y2 = (((hx * log2_lead - y1) + hx * log2_tail) +
|
||||
tx * log2_lead) + tx * log2_tail;
|
||||
|
||||
y = y1 + y2;
|
||||
z = (9.99564649780173690e-1 +
|
||||
(1.61251249355268050e-5 +
|
||||
(2.37986978239838493e-2 +
|
||||
2.68724774856111190e-7*y)*y)*y)/
|
||||
(9.99564649780173692e-1 +
|
||||
(-4.99766199765151309e-1 +
|
||||
(1.070876894098586184e-1 +
|
||||
(-1.189773642681502232e-2 +
|
||||
5.9480622371960190616e-4*y)*y)*y)*y);
|
||||
z = ((z * y1) + (z * y2)) + 1.0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Find m, z1 and z2 such that exp2(x) = 2**m * (z1 + z2) */
|
||||
|
||||
splitexp(x, log2, 32.0, one_by_32_lead, 0.0, &m, &z1, &z2);
|
||||
|
||||
/* Scale (z1 + z2) by 2.0**m */
|
||||
if (m > EMIN_DP64 && m < EMAX_DP64)
|
||||
z = scaleDouble_1((z1+z2),m);
|
||||
else
|
||||
z = scaleDouble_2((z1+z2),m);
|
||||
}
|
||||
return z;
|
||||
}
|
101
sdk/lib/crt/math/libm_sse2/exp_special.c
Normal file
101
sdk/lib/crt/math/libm_sse2/exp_special.c
Normal file
|
@ -0,0 +1,101 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <fpieee.h>
|
||||
#include <excpt.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "libm_new.h"
|
||||
|
||||
// y = expf(x)
|
||||
// y = exp(x)
|
||||
|
||||
// these codes and the ones in the related .asm files have to match
|
||||
#define EXP_X_NAN 1
|
||||
#define EXP_Y_ZERO 2
|
||||
#define EXP_Y_INF 3
|
||||
|
||||
/*
 * Special-case reporter for expf(), called from expf.asm.
 *
 * x    - the argument that was passed to expf
 * y    - the result chosen by the caller for this special case
 * code - one of EXP_X_NAN, EXP_Y_ZERO or EXP_Y_INF; any other value
 *        reports nothing
 *
 * Always returns y; the value returned by _handle_errorf is not used.
 */
float _expf_special(float x, float y, U32 code)
{
    /* The float result is handed to _handle_errorf in the low 32 bits of
       a zeroed 64-bit value. */
    UT64 ym;
    ym.u64 = 0;
    ym.f32[0] = y;

    if (code == EXP_X_NAN)
    {
        _handle_errorf("expf", _FpCodeExp, ym.u64, _DOMAIN, 0, EDOM, x, 0.0, 1);
    }
    else if (code == EXP_Y_ZERO)
    {
        _handle_errorf("expf", _FpCodeExp, ym.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, 0.0, 1);
    }
    else if (code == EXP_Y_INF)
    {
        _handle_errorf("expf", _FpCodeExp, ym.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
    }

    return y;
}
|
||||
|
||||
/*
 * Special-case reporter for exp(), called from exp.asm.
 *
 * x    - the argument that was passed to exp
 * y    - the result chosen by the caller for this special case
 * code - one of EXP_X_NAN, EXP_Y_ZERO or EXP_Y_INF; any other value
 *        reports nothing
 *
 * Always returns y; the value returned by _handle_error is not used.
 */
double _exp_special(double x, double y, U32 code)
{
    /* _handle_error takes the result as a raw 64-bit pattern. */
    UT64 ym;
    ym.f64 = y;

    if (code == EXP_X_NAN)
    {
        _handle_error("exp", _FpCodeExp, ym.u64, _DOMAIN, 0, EDOM, x, 0.0, 1);
    }
    else if (code == EXP_Y_ZERO)
    {
        _handle_error("exp", _FpCodeExp, ym.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, 0.0, 1);
    }
    else if (code == EXP_Y_INF)
    {
        _handle_error("exp", _FpCodeExp, ym.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
    }

    return y;
}
|
303
sdk/lib/crt/math/libm_sse2/expf.asm
Normal file
303
sdk/lib/crt/math/libm_sse2/expf.asm
Normal file
|
@ -0,0 +1,303 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; expf.asm
|
||||
;
|
||||
; An implementation of the expf libm function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; float expf(float x);
|
||||
;
|
||||
|
||||
;
|
||||
; Algorithm:
|
||||
; Similar to the one presented in exp.asm
|
||||
;
|
||||
; If FMA3 hardware is available, an FMA3 implementation of expf will be used.
|
||||
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
|
||||
__real_inf DD 7f800000h
|
||||
DD 0
|
||||
DQ 0
|
||||
|
||||
__real_ninf DD 0ff800000h
|
||||
DD 0
|
||||
DQ 0
|
||||
|
||||
__real_qnanbit DD 00400000h
|
||||
DD 0
|
||||
DQ 0
|
||||
|
||||
__real_zero DD 00000000h
|
||||
DD 0
|
||||
DQ 0
|
||||
|
||||
__real_p8192 DQ 40c0000000000000h
|
||||
DQ 0
|
||||
__real_m9600 DQ 0c0c2c00000000000h
|
||||
DQ 0
|
||||
|
||||
__real_64_by_log2 DQ 40571547652b82feh ; 64/ln(2)
|
||||
DQ 0
|
||||
__real_log2_by_64 DQ 3f862e42fefa39efh ; log2_by_64
|
||||
DQ 0
|
||||
|
||||
__real_1_by_6 DQ 3fc5555555555555h ; 1/6
|
||||
DQ 0
|
||||
__real_1_by_2 DQ 3fe0000000000000h ; 1/2
|
||||
DQ 0
|
||||
|
||||
; these codes and the ones in the corresponding .c file have to match
|
||||
__flag_x_nan DD 00000001
|
||||
__flag_y_zero DD 00000002
|
||||
__flag_y_inf DD 00000003
|
||||
|
||||
EXTRN __two_to_jby64_table:QWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
fname TEXTEQU <expf>
|
||||
fname_special TEXTEQU <_expf_special>
|
||||
|
||||
; define local variable storage offsets
|
||||
|
||||
; make room for fname_special to save things
|
||||
dummy_space EQU 020h
|
||||
stack_size EQU 038h
|
||||
|
||||
include fm.inc
|
||||
|
||||
; external function
|
||||
EXTERN fname_special:PROC
|
||||
|
||||
.code
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC fname
|
||||
; ---------------------------------------------------------------------------
; float expf(float x)
;
; In:    xmm0 = x
; Out:   xmm0 = e^x
; Uses:  rax, rcx, rdx, r8, r10, xmm1-xmm4, flags
;
; x is widened to double, e^x is computed in double precision with a
; table lookup and a cubic polynomial, and the result is narrowed back
; to float.  __use_fma3_lib selects between the SSE2 and FMA3 code.
;
; Register roles for the whole routine:
;   edx = raw bit pattern of x, sign included.  It must stay intact:
;         the special-case handlers rebuild x from it.
;   eax = |x| bit pattern during the entry check, later j = n & 3fh
;   ecx = n, then m
;
; Special cases:
;   x is NaN                 -> fname_special(x, x | qnan bit, __flag_x_nan)
;   x = +Inf                 -> x returned unchanged
;   x = -Inf                 -> +0
;   x*(64/ln(2)) >= 8192     -> fname_special(x, +Inf, __flag_y_inf)
;   x*(64/ln(2)) <  -9600    -> fname_special(x, +0,   __flag_y_zero)
;
; NOTE(review): StackAllocate/StackDeallocate come from fm.inc, which is not
; visible here; stack_size (038h) presumably covers the 020h bytes of home
; space that fname_special may use plus alignment padding - confirm.
; ---------------------------------------------------------------------------
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; Do this to avoid possible exceptions from a NaN argument.
    ; The magnitude test is done on a copy in eax so that edx keeps the
    ; sign of x: the handlers below compare edx against both +Inf and
    ; -Inf and pass it on as the original argument.
    movd        edx, xmm0                       ; edx <-- x (raw bits)
    mov         eax, edx
    btr         eax, 31                         ; eax <-- |x| (raw bits)
    cmp         eax, DWORD PTR __real_inf
    jge         Lexpf_x_is_inf_or_nan           ; signed compare is safe: bit 31 is clear

    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Lexpf_fma3

Lexpf_sse2:

    cvtss2sd    xmm0, xmm0

    ; x * (64/ln(2))
    movsd       xmm3, QWORD PTR __real_64_by_log2
    mulsd       xmm3, xmm0

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    comisd      xmm3, QWORD PTR __real_p8192
    jae         Lexpf_y_is_inf

    comisd      xmm3, QWORD PTR __real_m9600
    jb          Lexpf_y_is_zero

    ; n = int( x * (64/ln(2)) )
    cvtpd2dq    xmm4, xmm3
    lea         r10, __two_to_jby64_table
    cvtdq2pd    xmm1, xmm4

    ; r = x - n * ln(2)/64
    movsd       xmm2, QWORD PTR __real_log2_by_64
    mulsd       xmm2, xmm1
    movd        ecx, xmm4                       ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                        ; eax = j = n & 3fh
    subsd       xmm0, xmm2                      ; xmm0 = r
    movsd       xmm1, xmm0                      ; xmm1 = copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q = r + r^2*(1/2 + r/6)
    movsd       xmm3, QWORD PTR __real_1_by_6
    mulsd       xmm3, xmm0
    mulsd       xmm0, xmm0
    addsd       xmm3, QWORD PTR __real_1_by_2
    mulsd       xmm0, xmm3
    addsd       xmm0, xmm1

    ; Build the double 2^m from its biased exponent.
    add         rcx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    movsd       xmm2, QWORD PTR [r10+rax*8]
    mulsd       xmm0, xmm2
    addsd       xmm0, xmm2

    movd        xmm1, rcx
    mulsd       xmm0, xmm1
    cvtsd2ss    xmm0, xmm0

Lexpf_final_check:
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexpf_y_is_zero:

    movss       xmm1, DWORD PTR __real_zero
    movd        xmm0, edx                       ; restore the original float x
    mov         r8d, DWORD PTR __flag_y_zero

    call        fname_special
    jmp         Lexpf_finish

    ALIGN 16
Lexpf_y_is_inf:

    movss       xmm1, DWORD PTR __real_inf
    movd        xmm0, edx                       ; restore the original float x
    mov         r8d, DWORD PTR __flag_y_inf

    call        fname_special
    jmp         Lexpf_finish

    ALIGN 16
Lexpf_x_is_inf_or_nan:
    ; Reached before x is widened, so xmm0 still holds the float x and
    ; edx its raw bits including the sign.

    cmp         edx, DWORD PTR __real_inf
    je          Lexpf_finish                    ; expf(+Inf) = +Inf

    cmp         edx, DWORD PTR __real_ninf
    je          Lexpf_process_zero              ; expf(-Inf) = +0

    or          edx, DWORD PTR __real_qnanbit   ; quieten the NaN
    movd        xmm1, edx
    mov         r8d, DWORD PTR __flag_x_nan
    call        fname_special
    jmp         Lexpf_finish

    ALIGN 16
Lexpf_process_zero:
    movss       xmm0, DWORD PTR __real_zero
    jmp         Lexpf_final_check

    ALIGN 16
Lexpf_finish:
    StackDeallocate stack_size
    ret


    ALIGN 16
Lexpf_fma3:

    vcvtss2sd   xmm0, xmm0, xmm0

    ; x * (64/ln(2))
    vmulsd      xmm3, xmm0, QWORD PTR __real_64_by_log2

    ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128
    ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150)
    vcomisd     xmm3, QWORD PTR __real_p8192
    jae         Lexpf_fma3_y_is_inf

    vucomisd    xmm3, QWORD PTR __real_m9600
    jb          Lexpf_fma3_y_is_zero

    ; n = int( x * (64/ln(2)) )
    vcvtpd2dq   xmm4, xmm3
    lea         r10, __two_to_jby64_table
    vcvtdq2pd   xmm1, xmm4

    ; r = x - n * ln(2)/64
    vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64
    vmovd       ecx, xmm4                       ; ecx = n
    mov         rax, 3fh
    and         eax, ecx                        ; eax = j = n & 3fh
    vmovapd     xmm1, xmm0                      ; xmm1 <-- copy of r

    ; m = (n - j) / 64
    sub         ecx, eax
    sar         ecx, 6

    ; q
    vmovsd      xmm3, QWORD PTR __real_1_by_6
    vmulsd      xmm0, xmm0, xmm0                ; xmm0 <-- r^2
    vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_by_2 ; xmm3 <-- r/6 + 1/2
    vfmadd213sd xmm0, xmm3, xmm1                ; xmm0 <-- q = r^2*(r/6 + 1/2) + r

    ; Build the double 2^m from its biased exponent.
    add         rcx, 1023
    shl         rcx, 52

    ; (f)*(1+q)
    vmovsd      xmm2, QWORD PTR [r10+rax*8]
    vfmadd213sd xmm0, xmm2, xmm2

    vmovq       xmm2, rcx
    vmulsd      xmm0, xmm0, xmm2
    vcvtsd2ss   xmm0, xmm0, xmm0

Lexpf_fma3_final_check:
    StackDeallocate stack_size
    ret

    ALIGN 16
Lexpf_fma3_y_is_zero:

    vmovss      xmm1, DWORD PTR __real_zero
    vmovd       xmm0, edx                       ; restore the original float x
    mov         r8d, DWORD PTR __flag_y_zero

    call        fname_special
    jmp         Lexpf_fma3_finish

    ALIGN 16
Lexpf_fma3_y_is_inf:

    vmovss      xmm1, DWORD PTR __real_inf
    vmovd       xmm0, edx                       ; restore the original float x
    mov         r8d, DWORD PTR __flag_y_inf

    call        fname_special
    jmp         Lexpf_fma3_finish

    ALIGN 16
Lexpf_fma3_finish:
    StackDeallocate stack_size
    ret

fname endp
|
||||
|
||||
END
|
85
sdk/lib/crt/math/libm_sse2/floor.c
Normal file
85
sdk/lib/crt/math/libm_sse2/floor.c
Normal file
|
@ -0,0 +1,85 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#include "libm_errno.h"
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#pragma function(floor)
|
||||
|
||||
double FN_PROTOTYPE(floor)(double x)
|
||||
{
|
||||
double r;
|
||||
long rexp, xneg;
|
||||
|
||||
|
||||
unsigned long ux, ax, ur, mask;
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
ax = ux & (~SIGNBIT_DP64);
|
||||
xneg = (ux != ax);
|
||||
|
||||
if (ax >= 0x4340000000000000)
|
||||
{
|
||||
/* abs(x) is either NaN, infinity, or >= 2^53 */
|
||||
if (ax > 0x7ff0000000000000)
|
||||
/* x is NaN */
|
||||
return _handle_error("floor", OP_FLOOR, ux|0x0008000000000000, _DOMAIN,
|
||||
0, EDOM, x, 0.0, 1);
|
||||
else
|
||||
return x;
|
||||
}
|
||||
else if (ax < 0x3ff0000000000000) /* abs(x) < 1.0 */
|
||||
{
|
||||
if (ax == 0x0000000000000000)
|
||||
/* x is +zero or -zero; return the same zero */
|
||||
return x;
|
||||
else if (xneg) /* x < 0.0 */
|
||||
return -1.0;
|
||||
else
|
||||
return 0.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
r = x;
|
||||
rexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
|
||||
/* Mask out the bits of r that we don't want */
|
||||
mask = 1;
|
||||
mask = (mask << (EXPSHIFTBITS_DP64 - rexp)) - 1;
|
||||
ur = (ux & ~mask);
|
||||
PUT_BITS_DP64(ur, r);
|
||||
if (xneg && (ur != ux))
|
||||
/* We threw some bits away and x was negative */
|
||||
return r - 1.0;
|
||||
else
|
||||
return r;
|
||||
}
|
||||
|
||||
}
|
83
sdk/lib/crt/math/libm_sse2/floorf.c
Normal file
83
sdk/lib/crt/math/libm_sse2/floorf.c
Normal file
|
@ -0,0 +1,83 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#include "libm_errno.h"
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(floorf)
|
||||
|
||||
float FN_PROTOTYPE(floorf)(float x)
|
||||
{
|
||||
float r;
|
||||
int rexp, xneg;
|
||||
unsigned int ux, ax, ur, mask;
|
||||
|
||||
GET_BITS_SP32(x, ux);
|
||||
ax = ux & (~SIGNBIT_SP32);
|
||||
xneg = (ux != ax);
|
||||
|
||||
if (ax >= 0x4b800000)
|
||||
{
|
||||
/* abs(x) is either NaN, infinity, or >= 2^24 */
|
||||
if (ax > 0x7f800000)
|
||||
/* x is NaN */
|
||||
return _handle_errorf("floorf", OP_FLOOR, ux|0x00400000, _DOMAIN,
|
||||
0, EDOM, x, 0.0F, 1);
|
||||
else
|
||||
return x;
|
||||
}
|
||||
else if (ax < 0x3f800000) /* abs(x) < 1.0 */
|
||||
{
|
||||
if (ax == 0x00000000)
|
||||
/* x is +zero or -zero; return the same zero */
|
||||
return x;
|
||||
else if (xneg) /* x < 0.0 */
|
||||
return -1.0F;
|
||||
else
|
||||
return 0.0F;
|
||||
}
|
||||
else
|
||||
{
|
||||
rexp = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
|
||||
/* Mask out the bits of r that we don't want */
|
||||
mask = (1 << (EXPSHIFTBITS_SP32 - rexp)) - 1;
|
||||
ur = (ux & ~mask);
|
||||
PUT_BITS_SP32(ur, r);
|
||||
if (xneg && (ux != ur))
|
||||
/* We threw some bits away and x was negative */
|
||||
return r - 1.0F;
|
||||
else
|
||||
return r;
|
||||
}
|
||||
}
|
39
sdk/lib/crt/math/libm_sse2/fm.inc
Normal file
39
sdk/lib/crt/math/libm_sse2/fm.inc
Normal file
|
@ -0,0 +1,39 @@
|
|||
; StackAllocate nbytes
;   Prolog helper: lowers rsp by nbytes and emits the matching .ALLOCSTACK
;   unwind directive. Expands to nothing when nbytes is 0.
StackAllocate   MACRO   nbytes
    IF nbytes NE 0
        sub     rsp, nbytes
        .ALLOCSTACK nbytes
    ENDIF
ENDM
|
||||
|
||||
; StackDeallocate nbytes
;   Epilog helper: raises rsp by nbytes. Expands to nothing when nbytes is 0.
StackDeallocate MACRO   nbytes
    IF nbytes NE 0
        add     rsp, nbytes
    ENDIF
ENDM
|
||||
|
||||
; SaveReg gpr, slot
;   Prolog helper: stores the 64-bit register gpr at [rsp+slot] and emits
;   the matching .SAVEREG unwind directive.
SaveReg         MACRO   gpr, slot
        mov     QWORD PTR [rsp + slot], gpr
        .SAVEREG gpr, slot
ENDM
|
||||
|
||||
; RestoreReg gpr, slot
;   Reloads the 64-bit register gpr from [rsp+slot].
RestoreReg      MACRO   gpr, slot
        mov     gpr, QWORD PTR [rsp + slot]
ENDM
|
||||
|
||||
; SaveXmm xreg, slot
;   Prolog helper: stores all 128 bits of xreg at [rsp+slot] with movdqa
;   (an aligned store, so rsp+slot must be 16-byte aligned) and emits the
;   matching .SAVEXMM128 unwind directive.
SaveXmm         MACRO   xreg, slot
        movdqa  XMMWORD PTR [rsp + slot], xreg
        .SAVEXMM128 xreg, slot
ENDM
|
||||
|
||||
; RestoreXmm xreg, slot
;   Reloads all 128 bits of xreg from [rsp+slot] with movdqa (an aligned
;   load, so rsp+slot must be 16-byte aligned).
RestoreXmm      MACRO   xreg, slot
        movdqa  xreg, XMMWORD PTR [rsp + slot]
ENDM
|
||||
|
||||
; AVXSaveXmm xreg, slot
;   VEX-encoded counterpart of SaveXmm: stores xreg at [rsp+slot] with
;   vmovdqa (an aligned store, so rsp+slot must be 16-byte aligned) and
;   emits the matching .SAVEXMM128 unwind directive.
AVXSaveXmm      MACRO   xreg, slot
        vmovdqa XMMWORD PTR [rsp + slot], xreg
        .SAVEXMM128 xreg, slot
ENDM
|
||||
|
||||
; AVXRestoreXmm xreg, slot
;   VEX-encoded counterpart of RestoreXmm: reloads xreg from [rsp+slot]
;   with vmovdqa (an aligned load, so rsp+slot must be 16-byte aligned).
AVXRestoreXmm   MACRO   xreg, slot
        vmovdqa xreg, XMMWORD PTR [rsp + slot]
ENDM
|
66
sdk/lib/crt/math/libm_sse2/fma3_available.c
Normal file
66
sdk/lib/crt/math/libm_sse2/fma3_available.c
Normal file
|
@ -0,0 +1,66 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#ifdef TEST_STANDALONE
|
||||
#include <stdio.h>
|
||||
#pragma section (".CRT$XIC",long,read)
|
||||
/* Pointer to an initializer that returns int; matches __fma3_lib_init,
   which is the function stored through this type below. */
typedef int (__cdecl *_PIFV)(void);
|
||||
#else
|
||||
#include <sect_attribs.h>
|
||||
#include <windows.h>
|
||||
#include <cruntime.h>
|
||||
#include <internal.h>
|
||||
#endif
|
||||
|
||||
#define _CRTALLOC(x) __declspec(allocate(x))
|
||||
|
||||
int __fma3_is_available = 0;
|
||||
int __use_fma3_lib = 0;
|
||||
|
||||
|
||||
/*
 * Turns use of the FMA3 code paths on (flag != 0) or off (flag == 0).
 * The request is honoured only when __fma3_is_available is set; otherwise
 * the current setting is left untouched.
 *
 * Returns the value of __use_fma3_lib after the call.
 */
int __cdecl _set_FMA3_enable(int flag)
{
    if (!__fma3_is_available)
        return __use_fma3_lib;

    __use_fma3_lib = flag;
    return __use_fma3_lib;
}
|
||||
|
||||
int __fma3_lib_init(void);
|
||||
|
||||
_CRTALLOC(".CRT$XIC") static _PIFV init_fma3 = __fma3_lib_init;
|
||||
|
||||
int __fma3_lib_init(void)
|
||||
{
|
||||
int CPUID[4]; // CPUID[2] is ECX;
|
||||
|
||||
__fma3_is_available = 0;
|
||||
__cpuid(CPUID, 1);
|
||||
if (CPUID[2] & (1 << 12)) {
|
||||
__fma3_is_available = 1;
|
||||
}
|
||||
|
||||
__use_fma3_lib = __fma3_is_available;
|
||||
return 0;
|
||||
}
|
160
sdk/lib/crt/math/libm_sse2/fmod.asm
Normal file
160
sdk/lib/crt/math/libm_sse2/fmod.asm
Normal file
|
@ -0,0 +1,160 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; $Workfile: fmod.asm $
|
||||
; $Revision: 4 $
|
||||
; $Date: 9/15/04 16:43 $
|
||||
;
|
||||
;
|
||||
; This is an optimized version of fmod.
|
||||
;
|
||||
; Define _CRTBLD_C9X to make it compliant with C90 and on.
|
||||
;
|
||||
; If building the OS CRTL (_NTSUBSET_ defined), abort.
|
||||
|
||||
; .ERRDEF _NTSUBSET_, "x87 code cannot be used in kernel mode"
|
||||
|
||||
DOMAIN EQU 1 ; _DOMAIN
|
||||
EDOM EQU 33 ; EDOM
|
||||
FPCODEFMOD EQU 22 ; _FpCodeFmod
|
||||
INVALID EQU 8 ; AMD_F_INVALID
|
||||
|
||||
FPIND EQU 0fff8000000000000h ; indefinite
|
||||
FPSNAN EQU 07ff7ffffffffffffh ; SNAN
|
||||
FPQNAN EQU 07fffffffffffffffh ; QNAN
|
||||
|
||||
X87SW RECORD X87SW_B: 1,
|
||||
X87SW_C3: 1,
|
||||
X87SW_TOP: 3,
|
||||
X87SW_C: 3,
|
||||
X87SW_ES: 1,
|
||||
X87SW_SF: 1,
|
||||
X87SW_PE: 1,
|
||||
X87SW_E: 5
|
||||
|
||||
X87XAM EQU MASK X87SW_C3 OR MASK X87SW_C AND NOT (1 SHL (X87SW_C + 1))
|
||||
X87XAM_INF EQU 5 SHL X87SW_C
|
||||
X87XAM_NAN EQU 1 SHL X87SW_C
|
||||
X87XAM_BAD EQU MASK X87SW_E AND NOT 2
|
||||
|
||||
EXTRN _handle_error: PROC ; double _handle_error (char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs)
|
||||
|
||||
.const
|
||||
|
||||
@fmodz DB "fmod", 0
|
||||
|
||||
.CODE
|
||||
|
||||
; double fmod [double, double] ----------------------------------
|
||||
|
||||
;-----------------------------------------------------------------------
; double fmod (double x, double y)
;
; ABI:   Microsoft x64.  In: xmm0 = x, xmm1 = y.  Out: xmm0 = result.
; Uses:  rax, rcx, rdx, r8, r9, the x87 stack (left balanced) and the
;        x87 status word (exception flags are cleared before fprem).
;
; The remainder is computed with the x87 fprem instruction.  NaN operands,
; x87 exceptions raised by fprem and (unless _CRTBLD_C9X is defined) an
; infinite y are routed through _handle_error.
;
; Frame (72 bytes; rsp is 16-byte aligned after the allocation):
;   [rsp +  0]      scratch: fprem result
;   [rsp +  8]      scratch: status word after fxam (Y)
;   [rsp + 16]      scratch: status word after fxam (X)
;   [rsp + 32 +  0] _handle_error stack argument: flags
;   [rsp + 32 +  8] _handle_error stack argument: error
;   [rsp + 32 + 16] X  (also _handle_error stack argument arg1)
;   [rsp + 32 + 24] Y  (also _handle_error stack argument arg2)
;   [rsp + 32 + 32] _handle_error stack argument: nargs
;-----------------------------------------------------------------------
fmod        PROC    FRAME

            sub     rsp, 40 + 32
            .ALLOCSTACK 40 + 32
            .ENDPROLOG

            ; Spill the arguments so the x87 unit can load them; these
            ; slots double as arg1/arg2 of a later _handle_error call.
            movsd   QWORD PTR 24 [rsp + 32], xmm1       ; Y
            movsd   QWORD PTR 16 [rsp + 32], xmm0       ; X

            DB      0ddh, 44h, 24h, 38h                 ; fld QWORD PTR 24 [rsp + 32]   st(0) = Y
            DB      0ddh, 44h, 24h, 30h                 ; fld QWORD PTR 16 [rsp + 32]   st(0) = X, st(1) = Y

            ; Classify X; ecx keeps its fxam class bits for the NaN test.
            DB      0d9h, 0e5h                          ; fxam (X)
            DB      09bh, 0ddh, 07ch, 024h, 010h        ; fstsw 16 [rsp]

            movzx   ecx, WORD PTR 16 [rsp]
            and     ecx, X87XAM

            fnclex                                      ; clear exception flags
                                                        ; in preparation for fprem

@@:
            DB      0d9h, 0f8h                          ; fprem

            DB      09bh, 0dfh, 0e0h                    ; fstsw ax
            test    ax, 4 SHL X87SW_C                   ; C2 set = reduction incomplete
            jnz     @b                                  ; do it again in case of partial result

            ; ax still holds the status word of the final fprem; it is
            ; examined for exception flags further down.
            DB      0ddh, 01ch, 024h                    ; fstp QWORD PTR [rsp]
            movlpd  xmm0, QWORD PTR [rsp]               ; result

            ; Y is now st(0): classify it, then pop it to balance the stack.
            DB      0d9h, 0e5h                          ; fxam (Y)
            DB      09bh, 0ddh, 07ch, 024h, 008h        ; fstsw 8 [rsp]

            movzx   edx, WORD PTR 8 [rsp]
            and     edx, X87XAM

            DB      0ddh, 0d8h                          ; fstp st(0)

            cmp     edx, X87XAM_NAN                     ; fmod (x, NAN) = QNAN
            je      @error

            cmp     ecx, X87XAM_NAN                     ; fmod (NAN, y) = QNAN
            je      @error

            and     eax, X87XAM_BAD                     ; any exception flag selected
            jnz     @raise                              ; by X87XAM_BAD: handle error

IFNDEF _CRTBLD_C9X                                      ; Not C90
            cmp     edx, X87XAM_INF                     ; fmod (x, infinity) = ???
            je      @raise
ELSE                                                    ; C90
                                                        ; fmod (x, infinity) = x (as x87 already does)
ENDIF

@exit:
            add     rsp, 40 + 32
            ret

            ALIGN   16

@raise:
            mov     eax, INVALID                        ; raise exception
            mov     r8, FPIND                           ; value = indefinite
            jmp     @f

@error:
            xor     eax, eax                            ; no exception
            movd    r8, xmm0                            ; value = bits of the fprem result
                                                        ; falls through to the call sequence

@@:
            lea     rcx, [@fmodz]                       ; fname
            mov     edx, FPCODEFMOD                     ; opcode
                                                        ; r8 = value (set by @raise / @error)
            mov     r9d, DOMAIN                         ; type
            mov     DWORD PTR 0 [rsp + 32], eax         ; flags
            mov     DWORD PTR 8 [rsp + 32], EDOM        ; error
                                                        ; arg1 / arg2 were stored on entry
            mov     DWORD PTR 32 [rsp + 32], 2          ; nargs
            call    _handle_error                       ; (char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs)

            DB      09bh, 0dbh, 0e2h                    ; fclex
            jmp     @exit                               ; xmm0 = _handle_error's result

fmod        ENDP
|
||||
|
||||
; ---------------------------------------------------------------
|
||||
|
||||
END
|
160
sdk/lib/crt/math/libm_sse2/fmodf.asm
Normal file
160
sdk/lib/crt/math/libm_sse2/fmodf.asm
Normal file
|
@ -0,0 +1,160 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; $Workfile: fmodf.asm $
|
||||
; $Revision: 4 $
|
||||
; $Date: 9/15/04 16:43 $
|
||||
;
|
||||
;
|
||||
; This is an optimized version of fmodf.
|
||||
;
|
||||
; Define _CRTBLD_C9X to make it compliant with C90 and on.
|
||||
;
|
||||
; If building the OS CRTL (_NTSUBSET_ defined), abort.
|
||||
|
||||
.ERRDEF _NTSUBSET_, "x87 code cannot be used in kernel mode"
|
||||
|
||||
DOMAIN EQU 1 ; _DOMAIN
|
||||
EDOM EQU 33 ; EDOM
|
||||
FPCODEFMOD EQU 22 ; _FpCodeFmod
|
||||
INVALID EQU 8 ; AMD_F_INVALID
|
||||
|
||||
FPIND EQU 0ffc00000h ; indefinite
|
||||
FPSNAN EQU 07fbfffffh ; SNAN
|
||||
FPQNAN EQU 07fffffffh ; QNAN
|
||||
|
||||
X87SW RECORD X87SW_B: 1,
|
||||
X87SW_C3: 1,
|
||||
X87SW_TOP: 3,
|
||||
X87SW_C: 3,
|
||||
X87SW_ES: 1,
|
||||
X87SW_SF: 1,
|
||||
X87SW_PE: 1,
|
||||
X87SW_E: 5
|
||||
|
||||
X87XAM EQU MASK X87SW_C3 OR MASK X87SW_C AND NOT (1 SHL (X87SW_C + 1))
|
||||
X87XAM_INF EQU 5 SHL X87SW_C
|
||||
X87XAM_NAN EQU 1 SHL X87SW_C
|
||||
X87XAM_BAD EQU MASK X87SW_E AND NOT 2
|
||||
|
||||
EXTRN _handle_errorf: PROC ; float _handle_errorf (char *fname, int opcode, unsigned long value, int type, int flags, int error, float arg1, float arg2, int nargs)
|
||||
|
||||
.CONST
|
||||
|
||||
@fmodfz DB "fmodf", 0
|
||||
|
||||
.CODE
|
||||
|
||||
; float fmodf [float, float] ------------------------------------
|
||||
|
||||
;-----------------------------------------------------------------------
; float fmodf (float x, float y)
;
; ABI:   Microsoft x64.  In: xmm0 = x, xmm1 = y.  Out: xmm0 = result.
; Uses:  rax, rcx, rdx, r8, r9, the x87 stack (left balanced) and the
;        x87 status word (exception flags are cleared before fprem).
;
; The remainder is computed with the x87 fprem instruction.  NaN operands,
; x87 exceptions raised by fprem and (unless _CRTBLD_C9X is defined) an
; infinite y are routed through _handle_errorf.
;
; Frame (72 bytes; rsp is 16-byte aligned after the allocation):
;   [rsp +  0]      scratch: fprem result
;   [rsp +  8]      scratch: status word after fxam (Y)
;   [rsp + 16]      scratch: status word after fxam (X)
;   [rsp + 32 +  0] _handle_errorf stack argument: flags
;   [rsp + 32 +  8] _handle_errorf stack argument: error
;   [rsp + 32 + 16] X  (also _handle_errorf stack argument arg1)
;   [rsp + 32 + 24] Y  (also _handle_errorf stack argument arg2)
;   [rsp + 32 + 32] _handle_errorf stack argument: nargs
;-----------------------------------------------------------------------
fmodf       PROC    FRAME

            sub     rsp, 40 + 32
            .ALLOCSTACK 40 + 32
            .ENDPROLOG

            ; Spill the arguments so the x87 unit can load them; these
            ; slots double as arg1/arg2 of a later _handle_errorf call.
            movss   DWORD PTR 24 [rsp + 32], xmm1       ; Y
            movss   DWORD PTR 16 [rsp + 32], xmm0       ; X

            DB      0d9h, 44h, 24h, 38h                 ; fld DWORD PTR 24 [rsp + 32]   st(0) = Y
            DB      0d9h, 44h, 24h, 30h                 ; fld DWORD PTR 16 [rsp + 32]   st(0) = X, st(1) = Y

            ; Classify X; ecx keeps its fxam class bits for the NaN test.
            DB      0d9h, 0e5h                          ; fxam (X)
            DB      09bh, 0ddh, 07ch, 024h, 010h        ; fstsw 16 [rsp]

            movzx   ecx, WORD PTR 16 [rsp]
            and     ecx, X87XAM

            fnclex                                      ; clear exception flags
                                                        ; in preparation for fprem

@@:
            DB      0d9h, 0f8h                          ; fprem

            DB      9bh, 0dfh, 0e0h                     ; fstsw ax
            test    ax, 4 SHL X87SW_C                   ; C2 (00400h) set = reduction incomplete
            jnz     @b                                  ; do it again in case of partial result

            ; ax still holds the status word of the final fprem; it is
            ; examined for exception flags further down.
            DB      0d9h, 1ch, 24h                      ; fstp DWORD PTR [rsp]
            movss   xmm0, DWORD PTR [rsp]               ; result

            ; Y is now st(0): classify it, then pop it to balance the stack.
            DB      0d9h, 0e5h                          ; fxam (Y)
            DB      09bh, 0ddh, 07ch, 024h, 008h        ; fstsw 8 [rsp]

            movzx   edx, WORD PTR 8 [rsp]
            and     edx, X87XAM

            DB      0ddh, 0d8h                          ; fstp st(0)

            cmp     edx, X87XAM_NAN                     ; fmod (x, NAN) = QNAN
            je      @error

            cmp     ecx, X87XAM_NAN                     ; fmod (NAN, y) = QNAN
            je      @error

            and     eax, X87XAM_BAD                     ; any exception flag selected
            jnz     @raise                              ; by X87XAM_BAD: handle error

IFNDEF _CRTBLD_C9X                                      ; Not C90
            cmp     edx, X87XAM_INF                     ; fmod (x, infinity) = ???
            je      @raise
ELSE                                                    ; C90
                                                        ; fmod (x, infinity) = x (as x87 already does)
ENDIF

@exit:
            add     rsp, 40 + 32
            ret

            ALIGN   16

@raise:
            mov     eax, INVALID                        ; raise exception
            mov     r8d, FPIND                          ; value = indefinite
            jmp     @f

@error:
            xor     eax, eax                            ; no exception
            movd    r8d, xmm0                           ; value = bits of the fprem result
                                                        ; falls through to the call sequence

@@:
            lea     rcx, [@fmodfz]                      ; fname
            mov     edx, FPCODEFMOD                     ; opcode
                                                        ; r8d = value (set by @raise / @error)
            mov     r9d, DOMAIN                         ; type
            mov     DWORD PTR 0 [rsp + 32], eax         ; flags
            mov     DWORD PTR 8 [rsp + 32], EDOM        ; error
                                                        ; arg1 / arg2 were stored on entry
            mov     DWORD PTR 32 [rsp + 32], 2          ; nargs
            call    _handle_errorf                      ; (char *fname, int opcode, unsigned long value, int type, int flags, int error, float arg1, float arg2, int nargs)

            DB      9Bh, 0DBh, 0E2h                     ; fclex
            jmp     @exit                               ; xmm0 = _handle_errorf's result

fmodf       ENDP
|
||||
|
||||
; ---------------------------------------------------------------
|
||||
|
||||
END
|
198
sdk/lib/crt/math/libm_sse2/hypot.c
Normal file
198
sdk/lib/crt/math/libm_sse2/hypot.c
Normal file
|
@ -0,0 +1,198 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define FAST_BUT_GREATER_THAN_ONE_ULP /* Helps speed by trading off a little
|
||||
accuracy */
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_INFINITY_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_INFINITY_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
|
||||
double FN_PROTOTYPE(_hypot)(double x, double y)
|
||||
{
|
||||
/* Returns sqrt(x*x + y*y) with no overflow or underflow unless
|
||||
the result warrants it */
|
||||
|
||||
const double large = 1.79769313486231570815e+308; /* 0x7fefffffffffffff */
|
||||
|
||||
#ifdef FAST_BUT_GREATER_THAN_ONE_ULP
|
||||
double r, retval;
|
||||
unsigned long xexp, yexp, ux, uy;
|
||||
#else
|
||||
double u, r, retval, hx, tx, x2, hy, ty, y2, hs, ts;
|
||||
unsigned long xexp, yexp, ux, uy, ut;
|
||||
#endif
|
||||
int dexp, expadjust;
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
ux &= ~SIGNBIT_DP64;
|
||||
GET_BITS_DP64(y, uy);
|
||||
uy &= ~SIGNBIT_DP64;
|
||||
xexp = (ux >> EXPSHIFTBITS_DP64);
|
||||
yexp = (uy >> EXPSHIFTBITS_DP64);
|
||||
|
||||
if (xexp == BIASEDEMAX_DP64 + 1 || yexp == BIASEDEMAX_DP64 + 1)
|
||||
{
|
||||
/* One or both of the arguments are NaN or infinity. The
|
||||
result will also be NaN or infinity. */
|
||||
retval = x*x + y*y;
|
||||
if (((xexp == BIASEDEMAX_DP64 + 1) && !(ux & MANTBITS_DP64)) ||
|
||||
((yexp == BIASEDEMAX_DP64 + 1) && !(uy & MANTBITS_DP64)))
|
||||
/* x or y is infinity. ISO C99 defines that we must
|
||||
return +infinity, even if the other argument is NaN.
|
||||
Note that the computation of x*x + y*y above will already
|
||||
have raised invalid if either x or y is a signalling NaN. */
|
||||
return infinity_with_flags(0);
|
||||
else
|
||||
/* One or both of x or y is NaN, and neither is infinity.
|
||||
Raise invalid if it's a signalling NaN */
|
||||
return retval;
|
||||
}
|
||||
|
||||
/* Set x = abs(x) and y = abs(y) */
|
||||
PUT_BITS_DP64(ux, x);
|
||||
PUT_BITS_DP64(uy, y);
|
||||
|
||||
/* The difference in exponents between x and y */
|
||||
dexp = (int)(xexp - yexp);
|
||||
expadjust = 0;
|
||||
|
||||
if (ux == 0)
|
||||
/* x is zero */
|
||||
return y;
|
||||
else if (uy == 0)
|
||||
/* y is zero */
|
||||
return x;
|
||||
else if (dexp > MANTLENGTH_DP64 + 1 || dexp < -MANTLENGTH_DP64 - 1)
|
||||
/* One of x and y is insignificant compared to the other */
|
||||
return x + y; /* Raise inexact */
|
||||
else if (xexp > EXPBIAS_DP64 + 500 || yexp > EXPBIAS_DP64 + 500)
|
||||
{
|
||||
/* Danger of overflow; scale down by 2**600. */
|
||||
expadjust = 600;
|
||||
ux -= 0x2580000000000000;
|
||||
PUT_BITS_DP64(ux, x);
|
||||
uy -= 0x2580000000000000;
|
||||
PUT_BITS_DP64(uy, y);
|
||||
}
|
||||
else if (xexp < EXPBIAS_DP64 - 500 || yexp < EXPBIAS_DP64 - 500)
|
||||
{
|
||||
/* Danger of underflow; scale up by 2**600. */
|
||||
expadjust = -600;
|
||||
if (xexp == 0)
|
||||
{
|
||||
/* x is denormal - handle by adding 601 to the exponent
|
||||
and then subtracting a correction for the implicit bit */
|
||||
PUT_BITS_DP64(ux + 0x2590000000000000, x);
|
||||
x -= 9.23297861778573578076e-128; /* 0x2590000000000000 */
|
||||
GET_BITS_DP64(x, ux);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is normal - just increase the exponent by 600 */
|
||||
ux += 0x2580000000000000;
|
||||
PUT_BITS_DP64(ux, x);
|
||||
}
|
||||
if (yexp == 0)
|
||||
{
|
||||
PUT_BITS_DP64(uy + 0x2590000000000000, y);
|
||||
y -= 9.23297861778573578076e-128; /* 0x2590000000000000 */
|
||||
GET_BITS_DP64(y, uy);
|
||||
}
|
||||
else
|
||||
{
|
||||
uy += 0x2580000000000000;
|
||||
PUT_BITS_DP64(uy, y);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef FAST_BUT_GREATER_THAN_ONE_ULP
|
||||
/* Not awful, but results in accuracy loss larger than 1 ulp */
|
||||
r = x*x + y*y;
|
||||
#else
|
||||
/* Slower but more accurate */
|
||||
|
||||
/* Sort so that x is greater than y */
|
||||
if (x < y)
|
||||
{
|
||||
u = y;
|
||||
y = x;
|
||||
x = u;
|
||||
ut = ux;
|
||||
ux = uy;
|
||||
uy = ut;
|
||||
}
|
||||
|
||||
/* Split x into hx and tx, head and tail */
|
||||
PUT_BITS_DP64(ux & 0xfffffffff8000000, hx);
|
||||
tx = x - hx;
|
||||
|
||||
PUT_BITS_DP64(uy & 0xfffffffff8000000, hy);
|
||||
ty = y - hy;
|
||||
|
||||
/* Compute r = x*x + y*y with extra precision */
|
||||
x2 = x*x;
|
||||
y2 = y*y;
|
||||
hs = x2 + y2;
|
||||
|
||||
if (dexp == 0)
|
||||
/* We take most care when x and y have equal exponents,
|
||||
i.e. are almost the same size */
|
||||
ts = (((x2 - hs) + y2) +
|
||||
((hx * hx - x2) + 2 * hx * tx) + tx * tx) +
|
||||
((hy * hy - y2) + 2 * hy * ty) + ty * ty;
|
||||
else
|
||||
ts = (((x2 - hs) + y2) +
|
||||
((hx * hx - x2) + 2 * hx * tx) + tx * tx);
|
||||
|
||||
r = hs + ts;
|
||||
#endif
|
||||
|
||||
/* The sqrt can introduce another half ulp error. */
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_sd(&retval, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));
|
||||
|
||||
/* If necessary scale the result back. This may lead to
|
||||
overflow but if so that's the correct result. */
|
||||
retval = scaleDouble_1(retval, expadjust);
|
||||
|
||||
if (retval > large)
|
||||
/* The result overflowed. Deal with errno. */
|
||||
return _handle_error("_hypot", OP_HYPOT, PINFBITPATT_DP64, _OVERFLOW,
|
||||
AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, y, 2);
|
||||
|
||||
return retval;
|
||||
}
|
99
sdk/lib/crt/math/libm_sse2/hypotf.c
Normal file
99
sdk/lib/crt/math/libm_sse2/hypotf.c
Normal file
|
@ -0,0 +1,99 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#ifdef USE_SOFTWARE_SQRT
|
||||
#define USE_SQRTF_AMD_INLINE
|
||||
#endif
|
||||
#define USE_INFINITYF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#ifdef USE_SOFTWARE_SQRT
|
||||
#undef USE_SQRTF_AMD_INLINE
|
||||
#endif
|
||||
#undef USE_INFINITYF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
|
||||
float FN_PROTOTYPE(_hypotf)(float x, float y)
|
||||
{
|
||||
/* Returns sqrt(x*x + y*y) with no overflow or underflow unless
|
||||
the result warrants it */
|
||||
|
||||
/* Do intermediate computations in double precision
|
||||
and use sqrt instruction from chip if available. */
|
||||
double dx = x, dy = y, dr, retval;
|
||||
|
||||
/* The largest finite float, stored as a double */
|
||||
const double large = 3.40282346638528859812e+38; /* 0x47efffffe0000000 */
|
||||
|
||||
|
||||
unsigned long ux, uy, avx, avy;
|
||||
|
||||
GET_BITS_DP64(x, avx);
|
||||
avx &= ~SIGNBIT_DP64;
|
||||
GET_BITS_DP64(y, avy);
|
||||
avy &= ~SIGNBIT_DP64;
|
||||
ux = (avx >> EXPSHIFTBITS_DP64);
|
||||
uy = (avy >> EXPSHIFTBITS_DP64);
|
||||
|
||||
if (ux == BIASEDEMAX_DP64 + 1 || uy == BIASEDEMAX_DP64 + 1)
|
||||
{
|
||||
retval = x*x + y*y;
|
||||
/* One or both of the arguments are NaN or infinity. The
|
||||
result will also be NaN or infinity. */
|
||||
if (((ux == BIASEDEMAX_DP64 + 1) && !(avx & MANTBITS_DP64)) ||
|
||||
((uy == BIASEDEMAX_DP64 + 1) && !(avy & MANTBITS_DP64)))
|
||||
/* x or y is infinity. ISO C99 defines that we must
|
||||
return +infinity, even if the other argument is NaN.
|
||||
Note that the computation of x*x + y*y above will already
|
||||
have raised invalid if either x or y is a signalling NaN. */
|
||||
return infinityf_with_flags(0);
|
||||
else
|
||||
/* One or both of x or y is NaN, and neither is infinity.
|
||||
Raise invalid if it's a signalling NaN */
|
||||
return (float)retval;
|
||||
}
|
||||
|
||||
dr = (dx*dx + dy*dy);
|
||||
|
||||
#if USE_SOFTWARE_SQRT
|
||||
retval = sqrtf_amd_inline(r);
|
||||
#else
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_sd(&retval, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&dr)));
|
||||
#endif
|
||||
|
||||
if (retval > large)
|
||||
return _handle_errorf("_hypotf", OP_HYPOT, PINFBITPATT_SP32, _OVERFLOW,
|
||||
AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, y, 2);
|
||||
else
|
||||
return (float)retval;
|
||||
}
|
49
sdk/lib/crt/math/libm_sse2/libm.h
Normal file
49
sdk/lib/crt/math/libm_sse2/libm.h
Normal file
|
@ -0,0 +1,49 @@
|
|||
/***********************************************************************************/
|
||||
/** MIT License **/
|
||||
/** ----------- **/
|
||||
/** **/
|
||||
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
|
||||
/** **/
|
||||
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
|
||||
/** of this Software and associated documentation files (the "Software"), to deal **/
|
||||
/** in the Software without restriction, including without limitation the rights **/
|
||||
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
|
||||
/** copies of the Software, and to permit persons to whom the Software is **/
|
||||
/** furnished to do so, subject to the following conditions: **/
|
||||
/** **/
|
||||
/** The above copyright notice and this permission notice shall be included in **/
|
||||
/** all copies or substantial portions of the Software. **/
|
||||
/** **/
|
||||
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
|
||||
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
|
||||
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
|
||||
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
|
||||
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
|
||||
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
|
||||
/** THE SOFTWARE. **/
|
||||
/***********************************************************************************/
|
||||
|
||||
#ifndef LIBM_AMD_H_INCLUDED
|
||||
#define LIBM_AMD_H_INCLUDED 1
|
||||
|
||||
#define FN_PROTOTYPE(fname) fname
|
||||
|
||||
#include <math.h>
|
||||
#include <fpieee.h>
|
||||
|
||||
#ifndef IS_64BIT
|
||||
#define IS_64BIT
|
||||
#endif
|
||||
|
||||
#ifndef _COMPLEX_DEFINED
|
||||
struct _complex
|
||||
{
|
||||
double x, y; /* real and imaginary parts */
|
||||
};
|
||||
#define _COMPLEX_DEFINED
|
||||
#endif
|
||||
#define COMPLEX struct _complex
|
||||
|
||||
extern void __remainder_piby2(double x, double *r, double *rr, int *region);
|
||||
|
||||
#endif /* LIBM_AMD_H_INCLUDED */
|
35
sdk/lib/crt/math/libm_sse2/libm_errno.h
Normal file
35
sdk/lib/crt/math/libm_sse2/libm_errno.h
Normal file
|
@ -0,0 +1,35 @@
|
|||
/***********************************************************************************/
|
||||
/** MIT License **/
|
||||
/** ----------- **/
|
||||
/** **/
|
||||
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
|
||||
/** **/
|
||||
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
|
||||
/** of this Software and associated documentation files (the "Software"), to deal **/
|
||||
/** in the Software without restriction, including without limitation the rights **/
|
||||
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
|
||||
/** copies of the Software, and to permit persons to whom the Software is **/
|
||||
/** furnished to do so, subject to the following conditions: **/
|
||||
/** **/
|
||||
/** The above copyright notice and this permission notice shall be included in **/
|
||||
/** all copies or substantial portions of the Software. **/
|
||||
/** **/
|
||||
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
|
||||
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
|
||||
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
|
||||
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
|
||||
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
|
||||
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
|
||||
/** THE SOFTWARE. **/
|
||||
/***********************************************************************************/
|
||||
|
||||
#ifndef LIBM_ERRNO_AMD_H_INCLUDED
|
||||
#define LIBM_ERRNO_AMD_H_INCLUDED 1
|
||||
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#ifndef __set_errno
|
||||
/* Store x in errno.  The expansion is fully parenthesised so that the macro
   behaves as a single expression wherever it appears (for example as an
   operand of || or of a comparison), not only as a stand-alone statement. */
#define __set_errno(x) (errno = (x))
|
||||
#endif
|
||||
|
||||
#endif /* LIBM_ERRNO_AMD_H_INCLUDED */
|
2101
sdk/lib/crt/math/libm_sse2/libm_inlines.h
Normal file
2101
sdk/lib/crt/math/libm_sse2/libm_inlines.h
Normal file
File diff suppressed because it is too large
Load diff
122
sdk/lib/crt/math/libm_sse2/libm_new.h
Normal file
122
sdk/lib/crt/math/libm_sse2/libm_new.h
Normal file
|
@ -0,0 +1,122 @@
|
|||
|
||||
/***********************************************************************************/
|
||||
/** MIT License **/
|
||||
/** ----------- **/
|
||||
/** **/
|
||||
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
|
||||
/** **/
|
||||
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
|
||||
/** of this Software and associated documentation files (the "Software"), to deal **/
|
||||
/** in the Software without restriction, including without limitation the rights **/
|
||||
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
|
||||
/** copies of the Software, and to permit persons to whom the Software is **/
|
||||
/** furnished to do so, subject to the following conditions: **/
|
||||
/** **/
|
||||
/** The above copyright notice and this permission notice shall be included in **/
|
||||
/** all copies or substantial portions of the Software. **/
|
||||
/** **/
|
||||
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
|
||||
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
|
||||
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
|
||||
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
|
||||
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
|
||||
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
|
||||
/** THE SOFTWARE. **/
|
||||
/***********************************************************************************/
|
||||
|
||||
#ifndef __LIBM_NEW_H__
|
||||
#define __LIBM_NEW_H__
|
||||
|
||||
// Defines, protos, etc for *new* math funcs updated by AMD 11/2008
|
||||
// Old files will continue to include libm_util.h, libm.h, libm_inlines.h
|
||||
// until such time as these have all been refreshed w/ new versions.
|
||||
|
||||
typedef float F32;
|
||||
typedef unsigned int U32;
|
||||
|
||||
typedef double F64;
|
||||
typedef unsigned long long U64;
|
||||
|
||||
/* Overlay of an F32 (float) and a U32 (unsigned int) on the same storage, so
   the bit pattern of a single-precision value can be read or written. */
union UT32_
|
||||
{
|
||||
F32 f32; /* the value viewed as a float */
|
||||
U32 u32; /* the same storage viewed as an unsigned int */
|
||||
};
|
||||
|
||||
/* Overlay of an F64 (double) and a U64 (unsigned long long) on the same
   storage; the array members expose that storage as two F32 / two U32
   elements. */
union UT64_
|
||||
{
|
||||
F64 f64; /* the value viewed as a double */
|
||||
U64 u64; /* the same storage viewed as one unsigned long long */
|
||||
|
||||
F32 f32[2]; /* the same storage viewed as two floats */
|
||||
U32 u32[2]; /* the same storage viewed as two unsigned ints */
|
||||
};
|
||||
|
||||
typedef union UT32_ UT32;
|
||||
typedef union UT64_ UT64;
|
||||
|
||||
#define SIGN_MASK_32 0x80000000
|
||||
#define MANTISSA_MASK_32 0x007fffff
|
||||
#define EXPONENT_MASK_32 0x7f800000
|
||||
#define QNAN_MASK_32 0x00400000
|
||||
|
||||
#define INF_POS_32 0x7f800000
|
||||
#define INF_NEG_32 0xff800000
|
||||
#define QNAN_POS_32 0x7fc00000
|
||||
#define QNAN_NEG_32 0xffc00000
|
||||
#define IND_32 0xffc00000
|
||||
|
||||
#define EXPONENT_FULL_32 0x7f800000
|
||||
#define SIGN_SET_32 0x80000000
|
||||
#define QNAN_SET_32 0x00400000
|
||||
|
||||
#define INF_POS_64 0x7ff0000000000000
|
||||
#define INF_NEG_64 0xfff0000000000000
|
||||
|
||||
#define MANTISSA_MASK_64 0x000fffffffffffff
|
||||
#define SIGN_MASK_64 0x8000000000000000
|
||||
#define IND_64 0xfff8000000000000
|
||||
#define QNAN_MASK_64 0x0008000000000000
|
||||
|
||||
// constants for 'flags' argument of _handle_error and _handle_errorf
|
||||
#define AMD_F_INEXACT 0x00000010
|
||||
#define AMD_F_OVERFLOW 0x00000001
|
||||
#define AMD_F_UNDERFLOW 0x00000002
|
||||
#define AMD_F_DIVBYZERO 0x00000004
|
||||
#define AMD_F_INVALID 0x00000008
|
||||
|
||||
// define the Microsoft specific error handling routine
|
||||
|
||||
// Note to maintainers:
|
||||
// These prototypes may appear, at first glance, to differ from the versions
|
||||
// declared in libm_inlines.h and defined in libm_error.c. The third
|
||||
// parameter appears to have changed type from unsigned long to unsigned long
|
||||
// long. In fact they are the same because in both of the aforementioned
|
||||
// files, long has been #defined to __int64 in a most cowardly fashion. This
|
||||
// disgusts me. The buck stops here. - MAS
|
||||
|
||||
// Error-handling entry point for the double-precision functions (declared
// here, defined elsewhere).
// NOTE(review): the parameter roles below are inferred from the parameter
// names and from how the float variant is called by _hypotf; confirm against
// the definition.
//   fname  - name of the reporting function
//   opcode - operation code identifying the function
//   value  - bit pattern of the result to return
//   type   - error type
//   flags  - AMD_F_* floating-point status flags
//   error  - errno value
//   arg1, arg2 - the original argument(s)
//   nargs  - number of arguments that are valid
double _handle_error(
|
||||
char *fname,
|
||||
int opcode,
|
||||
unsigned long long value,
|
||||
int type,
|
||||
int flags,
|
||||
int error,
|
||||
double arg1,
|
||||
double arg2,
|
||||
int nargs
|
||||
);
|
||||
// Error-handling entry point for the single-precision functions (declared
// here, defined elsewhere).  Example call from _hypotf:
//   _handle_errorf("_hypotf", OP_HYPOT, PINFBITPATT_SP32, _OVERFLOW,
//                  AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, y, 2)
// NOTE(review): going by that call, 'value' is the bit pattern of the result,
// 'type' the error type, 'flags' a set of AMD_F_* status flags, 'error' an
// errno value and 'nargs' the number of arguments passed in arg1/arg2;
// confirm against the definition.
float _handle_errorf(
|
||||
char *fname,
|
||||
int opcode,
|
||||
unsigned long long value,
|
||||
int type,
|
||||
int flags,
|
||||
int error,
|
||||
float arg1,
|
||||
float arg2,
|
||||
int nargs
|
||||
);
|
||||
|
||||
#endif // __LIBM_NEW_H
|
||||
|
150
sdk/lib/crt/math/libm_sse2/libm_util.h
Normal file
150
sdk/lib/crt/math/libm_sse2/libm_util.h
Normal file
|
@ -0,0 +1,150 @@
|
|||
/***********************************************************************************/
|
||||
/** MIT License **/
|
||||
/** ----------- **/
|
||||
/** **/
|
||||
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
|
||||
/** **/
|
||||
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
|
||||
/** of this Software and associated documentation files (the "Software"), to deal **/
|
||||
/** in the Software without restriction, including without limitation the rights **/
|
||||
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
|
||||
/** copies of the Software, and to permit persons to whom the Software is **/
|
||||
/** furnished to do so, subject to the following conditions: **/
|
||||
/** **/
|
||||
/** The above copyright notice and this permission notice shall be included in **/
|
||||
/** all copies or substantial portions of the Software. **/
|
||||
/** **/
|
||||
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
|
||||
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
|
||||
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
|
||||
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
|
||||
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
|
||||
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
|
||||
/** THE SOFTWARE. **/
|
||||
/***********************************************************************************/
|
||||
|
||||
#ifndef LIBM_UTIL_AMD_H_INCLUDED
|
||||
#define LIBM_UTIL_AMD_H_INCLUDED 1
|
||||
|
||||
#define inline __inline
|
||||
#undef long
|
||||
#define long __int64
|
||||
|
||||
#include "emmintrin.h"
|
||||
#include "float.h"
|
||||
|
||||
|
||||
|
||||
/* Compile-time verification that type long is the same size
|
||||
as type double (i.e. we are really on a 64-bit machine) */
|
||||
void check_long_against_double_size(int machine_is_64_bit[(sizeof(long) == sizeof(double))?1:-1]);
|
||||
|
||||
|
||||
/* Definitions for double functions on 64 bit machines */
|
||||
#define SIGNBIT_DP64 0x8000000000000000
|
||||
#define EXPBITS_DP64 0x7ff0000000000000
|
||||
#define MANTBITS_DP64 0x000fffffffffffff
|
||||
#define ONEEXPBITS_DP64 0x3ff0000000000000
|
||||
#define TWOEXPBITS_DP64 0x4000000000000000
|
||||
#define HALFEXPBITS_DP64 0x3fe0000000000000
|
||||
#define IMPBIT_DP64 0x0010000000000000
|
||||
#define QNANBITPATT_DP64 0x7ff8000000000000
|
||||
#define INDEFBITPATT_DP64 0xfff8000000000000
|
||||
#define PINFBITPATT_DP64 0x7ff0000000000000
|
||||
#define NINFBITPATT_DP64 0xfff0000000000000
|
||||
#define EXPBIAS_DP64 1023
|
||||
#define EXPSHIFTBITS_DP64 52
|
||||
#define BIASEDEMIN_DP64 1
|
||||
#define EMIN_DP64 -1022
|
||||
#define BIASEDEMAX_DP64 2046
|
||||
#define EMAX_DP64 1023
|
||||
#define LAMBDA_DP64 1.0e300
|
||||
#define MANTLENGTH_DP64 53
|
||||
#define BASEDIGITS_DP64 15
|
||||
|
||||
|
||||
/* These definitions, used by float functions,
|
||||
are for both 32 and 64 bit machines */
|
||||
#define SIGNBIT_SP32 0x80000000
|
||||
#define EXPBITS_SP32 0x7f800000
|
||||
#define MANTBITS_SP32 0x007fffff
|
||||
#define ONEEXPBITS_SP32 0x3f800000
|
||||
#define TWOEXPBITS_SP32 0x40000000
|
||||
#define HALFEXPBITS_SP32 0x3f000000
|
||||
#define IMPBIT_SP32 0x00800000
|
||||
#define QNANBITPATT_SP32 0x7fc00000
|
||||
#define INDEFBITPATT_SP32 0xffc00000
|
||||
#define PINFBITPATT_SP32 0x7f800000
|
||||
#define NINFBITPATT_SP32 0xff800000
|
||||
#define EXPBIAS_SP32 127
|
||||
#define EXPSHIFTBITS_SP32 23
|
||||
#define BIASEDEMIN_SP32 1
|
||||
#define EMIN_SP32 -126
|
||||
#define BIASEDEMAX_SP32 254
|
||||
#define EMAX_SP32 127
|
||||
#define LAMBDA_SP32 1.0e30
|
||||
#define MANTLENGTH_SP32 24
|
||||
#define BASEDIGITS_SP32 7
|
||||
|
||||
#define CLASS_SIGNALLING_NAN 1
|
||||
#define CLASS_QUIET_NAN 2
|
||||
#define CLASS_NEGATIVE_INFINITY 3
|
||||
#define CLASS_NEGATIVE_NORMAL_NONZERO 4
|
||||
#define CLASS_NEGATIVE_DENORMAL 5
|
||||
#define CLASS_NEGATIVE_ZERO 6
|
||||
#define CLASS_POSITIVE_ZERO 7
|
||||
#define CLASS_POSITIVE_DENORMAL 8
|
||||
#define CLASS_POSITIVE_NORMAL_NONZERO 9
|
||||
#define CLASS_POSITIVE_INFINITY 10
|
||||
|
||||
#define OLD_BITS_SP32(x) (*((unsigned int *)&x))
|
||||
#define OLD_BITS_DP64(x) (*((unsigned long *)&x))
|
||||
|
||||
/* Alternatives to the above functions which don't have
|
||||
problems when using high optimization levels on gcc */
|
||||
/* GET_BITS_SP32(x, ux): assign x to the float member of a local union and
   copy the unsigned int member into ux, i.e. ux receives the bit pattern of
   x taken as a float.  Expands to a brace-enclosed block, not an expression,
   so it can only be used as a statement. */
#define GET_BITS_SP32(x, ux) \
|
||||
{ \
|
||||
volatile union {float f; unsigned int i;} _bitsy; \
|
||||
_bitsy.f = (x); \
|
||||
ux = _bitsy.i; \
|
||||
}
|
||||
/* PUT_BITS_SP32(ux, x): inverse of GET_BITS_SP32.  Stores the integer ux in
   the unsigned int member of a local union and assigns the float member to
   x, i.e. x becomes the float whose bit pattern is ux.  Expands to a
   brace-enclosed block, not an expression. */
#define PUT_BITS_SP32(ux, x) \
|
||||
{ \
|
||||
volatile union {float f; unsigned int i;} _bitsy; \
|
||||
_bitsy.i = (ux); \
|
||||
x = _bitsy.f; \
|
||||
}
|
||||
|
||||
/* GET_BITS_DP64(x, ux): assign x to the double member of a local union and
   copy the integer member into ux, i.e. ux receives the bit pattern of x
   taken as a double.  'long' is #defined to __int64 earlier in this header,
   so the integer member is 64 bits wide.  Expands to a brace-enclosed block,
   not an expression. */
#define GET_BITS_DP64(x, ux) \
|
||||
{ \
|
||||
volatile union {double d; unsigned long i;} _bitsy; \
|
||||
_bitsy.d = (x); \
|
||||
ux = _bitsy.i; \
|
||||
}
|
||||
/* PUT_BITS_DP64(ux, x): inverse of GET_BITS_DP64.  Stores the integer ux in
   the integer member of a local union and assigns the double member to x,
   i.e. x becomes the double whose bit pattern is ux.  'long' is #defined to
   __int64 earlier in this header.  Expands to a brace-enclosed block, not an
   expression. */
#define PUT_BITS_DP64(ux, x) \
|
||||
{ \
|
||||
volatile union {double d; unsigned long i;} _bitsy; \
|
||||
_bitsy.i = (ux); \
|
||||
x = _bitsy.d; \
|
||||
}
|
||||
|
||||
|
||||
/* Processor-dependent floating-point status flags */
|
||||
#define AMD_F_OVERFLOW 0x00000001
|
||||
#define AMD_F_UNDERFLOW 0x00000002
|
||||
#define AMD_F_DIVBYZERO 0x00000004
|
||||
#define AMD_F_INVALID 0x00000008
|
||||
#define AMD_F_INEXACT 0x00000010
|
||||
|
||||
/* Processor-dependent floating-point precision-control flags */
|
||||
#define AMD_F_EXTENDED 0x00000300
|
||||
#define AMD_F_DOUBLE 0x00000200
|
||||
#define AMD_F_SINGLE 0x00000000
|
||||
|
||||
/* Processor-dependent floating-point rounding-control flags */
|
||||
#define AMD_F_RC_NEAREST 0x00000000
|
||||
#define AMD_F_RC_DOWN 0x00002000
|
||||
#define AMD_F_RC_UP 0x00004000
|
||||
#define AMD_F_RC_ZERO 0x00006000
|
||||
|
||||
#endif /* LIBM_UTIL_AMD_H_INCLUDED */
|
557
sdk/lib/crt/math/libm_sse2/log.asm
Normal file
557
sdk/lib/crt/math/libm_sse2/log.asm
Normal file
|
@ -0,0 +1,557 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; log.asm
|
||||
;
|
||||
; An implementation of the log libm function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; double log(double x);
|
||||
;
|
||||
|
||||
;
|
||||
; Algorithm:
|
||||
;
|
||||
; Based on:
|
||||
; Ping-Tak Peter Tang
|
||||
; "Table-driven implementation of the logarithm function in IEEE
|
||||
; floating-point arithmetic"
|
||||
; ACM Transactions on Mathematical Software (TOMS)
|
||||
; Volume 16, Issue 4 (December 1990)
|
||||
;
|
||||
;
|
||||
; x very close to 1.0 is handled differently, for x everywhere else
|
||||
; a brief explanation is given below
|
||||
;
|
||||
; x = (2^m)*A
|
||||
; x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
|
||||
; x = (2^m)*2*(G/2+g/2)
|
||||
; x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
|
||||
;
|
||||
; Y = (2^(-1))*(2^(-m))*(2^m)*A
|
||||
; Now, range of Y is: 0.5 <= Y < 1
|
||||
;
|
||||
; F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
|
||||
; Now, range of F is: 256 <= F <= 512
|
||||
; F = F / 512
|
||||
; Now, range of F is: 0.5 <= F <= 1
|
||||
;
|
||||
; f = -(Y-F), with (f <= 2^(-10))
|
||||
;
|
||||
; log(x) = m*log(2) + log(2) + log(F-f)
|
||||
; log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
|
||||
; log(x) = m*log(2) + log(2*F) + log(1-r)
|
||||
;
|
||||
; r = (f/F), with (r <= 2^(-9))
|
||||
; r = f*(1/F) with (1/F) precomputed to avoid division
|
||||
;
|
||||
; log(x) = m*log(2) + log(G) - poly
|
||||
;
|
||||
; log(G) is precomputed
|
||||
; poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5 + (r^6)/6
|
||||
;
|
||||
; log(2) and log(G) need to be maintained in extra precision
|
||||
; to avoid losing precision in the calculations
|
||||
;
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
|
||||
__real_ninf DQ 0fff0000000000000h ; -inf
|
||||
DQ 0000000000000000h
|
||||
__real_inf DQ 7ff0000000000000h ; +inf
|
||||
DQ 0000000000000000h
|
||||
__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN
|
||||
DQ 0000000000000000h
|
||||
__real_qnanbit DQ 0008000000000000h
|
||||
DQ 0000000000000000h
|
||||
__real_min_norm DQ 0010000000000000h
|
||||
DQ 0000000000000000h
|
||||
__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits
|
||||
DQ 0000000000000000h
|
||||
__mask_1023 DQ 00000000000003ffh
|
||||
DQ 0000000000000000h
|
||||
__mask_001 DQ 0000000000000001h
|
||||
DQ 0000000000000000h
|
||||
|
||||
__mask_mant_all8 DQ 000ff00000000000h
|
||||
DQ 0000000000000000h
|
||||
__mask_mant9 DQ 0000080000000000h
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_two DQ 4000000000000000h ; 2
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_one DQ 3ff0000000000000h ; 1
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_near_one_lt DQ 3fee000000000000h ; .9375
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_near_one_gt DQ 3ff1000000000000h ; 1.0625
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_half DQ 3fe0000000000000h ; 1/2
|
||||
DQ 0000000000000000h
|
||||
|
||||
__mask_100 DQ 0000000000000100h
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_1_over_512 DQ 3f60000000000000h
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_1_over_2 DQ 3fe0000000000000h
|
||||
DQ 0000000000000000h
|
||||
__real_1_over_3 DQ 3fd5555555555555h
|
||||
DQ 0000000000000000h
|
||||
__real_1_over_4 DQ 3fd0000000000000h
|
||||
DQ 0000000000000000h
|
||||
__real_1_over_5 DQ 3fc999999999999ah
|
||||
DQ 0000000000000000h
|
||||
__real_1_over_6 DQ 3fc5555555555555h
|
||||
DQ 0000000000000000h
|
||||
|
||||
__mask_1023_f DQ 0c08ff80000000000h
|
||||
DQ 0000000000000000h
|
||||
|
||||
__mask_2045 DQ 00000000000007fdh
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_threshold DQ 3fb0000000000000h ; .0625
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit
|
||||
DQ 0000000000000000h
|
||||
|
||||
__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
|
||||
DQ 0000000000000000h
|
||||
__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
|
||||
DQ 0000000000000000h
|
||||
__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
|
||||
DQ 0000000000000000h
|
||||
__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
|
||||
DQ 0000000000000000h
|
||||
__real_log2_lead DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01
|
||||
DQ 00000000000000000h
|
||||
__real_log2_tail DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08
|
||||
DQ 00000000000000000h
|
||||
|
||||
; these codes and the ones in the corresponding .c file have to match
|
||||
__flag_x_zero DD 00000001
|
||||
__flag_x_neg DD 00000002
|
||||
__flag_x_nan DD 00000003
|
||||
|
||||
|
||||
EXTRN __log_256_lead:QWORD
|
||||
EXTRN __log_256_tail:QWORD
|
||||
EXTRN __log_F_inv_qword:QWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
|
||||
fname TEXTEQU <log>
|
||||
fname_special TEXTEQU <_log_special>
|
||||
|
||||
; define local variable storage offsets
|
||||
|
||||
save_xmm6 EQU 20h
|
||||
dummy_space EQU 40h
|
||||
|
||||
stack_size EQU 58h
|
||||
|
||||
include fm.inc
|
||||
|
||||
; external function
|
||||
EXTERN fname_special:PROC
|
||||
|
||||
.code
|
||||
ALIGN 16
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
SaveXmm xmm6, save_xmm6
|
||||
.ENDPROLOG
|
||||
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Llog_fma3
|
||||
|
||||
Llog_sse2:
|
||||
|
||||
; compute exponent part
|
||||
movdqa xmm3, xmm0
|
||||
movapd xmm4, xmm0
|
||||
psrlq xmm3, 52
|
||||
movd rax, xmm0
|
||||
psubq xmm3, XMMWORD PTR __mask_1023
|
||||
|
||||
; NaN or inf
|
||||
mov rcx, rax
|
||||
btr rcx, 63
|
||||
cmp rcx, QWORD PTR __real_inf
|
||||
jae __x_is_inf_or_nan
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
cvtdq2pd xmm6, xmm3 ; xexp
|
||||
|
||||
|
||||
pand xmm2, XMMWORD PTR __real_mant
|
||||
subsd xmm4, QWORD PTR __real_one
|
||||
|
||||
comisd xmm6, QWORD PTR __mask_1023_f
|
||||
je __denormal_adjust
|
||||
|
||||
; Main SSE2 path.  Normal inputs fall through to here; inputs with a zero
; exponent field re-enter here from __denormal_adjust.
; Register state on entry:
;   xmm0 = x
;   xmm2 = mantissa bits
;   xmm4 = x - 1.0
;   xmm6 = exponent as a double (xexp)
;   rax  = bit pattern whose mantissa field selects the table entry
__continue_common:
|
||||
|
||||
andpd xmm4, XMMWORD PTR __real_notsign ; xmm4 <-- |x - 1.0|
|
||||
; compute index into the log tables
|
||||
mov r9, rax
|
||||
and rax, QWORD PTR __mask_mant_all8 ; rax <-- top 8 mantissa bits
|
||||
and r9, QWORD PTR __mask_mant9 ; r9 <-- 9th mantissa bit
|
||||
shl r9, 1 ; move it up to the position of the 8th bit
|
||||
add rax, r9 ; round the 8-bit mantissa using the 9th bit
|
||||
movd xmm1, rax ; xmm1 <-- rounded mantissa bits
|
||||
|
||||
; near one codepath
|
||||
comisd xmm4, QWORD PTR __real_threshold
|
||||
jb __near_one ; taken when |x - 1.0| < .0625
|
||||
|
||||
; F, Y
|
||||
shr rax, 44 ; rax <-- table index
|
||||
por xmm2, XMMWORD PTR __real_half ; xmm2 <-- Y (mantissa with the exponent of 0.5)
|
||||
por xmm1, XMMWORD PTR __real_half ; xmm1 <-- F (rounded mantissa with the exponent of 0.5)
|
||||
lea r9, __log_F_inv_qword
|
||||
|
||||
; check for negative numbers or zero
|
||||
xorpd xmm5, xmm5
|
||||
comisd xmm0, xmm5 ; compare x with 0.0
|
||||
jbe __x_is_zero_or_neg ; the target re-tests these flags to tell zero from negative
|
||||
|
||||
; f = F - Y, r = f * inv
|
||||
subsd xmm1, xmm2 ; xmm1 <-- f = F - Y
|
||||
mulsd xmm1, QWORD PTR [r9+rax*8] ; xmm1 <-- r = f * inv
|
||||
|
||||
movapd xmm2, xmm1 ; xmm2 <-- copy of r
|
||||
movapd xmm0, xmm1 ; xmm0 <-- copy of r
|
||||
lea r9, QWORD PTR __log_256_lead
|
||||
|
||||
; poly
|
||||
movsd xmm3, QWORD PTR __real_1_over_6
|
||||
movsd xmm1, QWORD PTR __real_1_over_3
|
||||
mulsd xmm3, xmm2 ; xmm3 <-- r/6
|
||||
mulsd xmm1, xmm2 ; xmm1 <-- r/3
|
||||
mulsd xmm0, xmm2 ; xmm0 <-- r*r
|
||||
movapd xmm4, xmm0 ; xmm4 <-- copy of r*r
|
||||
addsd xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5
|
||||
addsd xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2
|
||||
mulsd xmm4, xmm0 ; xmm4 <-- r^4
|
||||
mulsd xmm3, xmm2 ; xmm3 <-- (r/6 + 1/5)*r
|
||||
mulsd xmm1, xmm0 ; xmm1 <-- (r/3 + 1/2)*r^2
|
||||
addsd xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4
|
||||
addsd xmm1, xmm2 ; xmm1 <-- (r/3 + 1/2)*r^2 + r
|
||||
mulsd xmm3, xmm4 ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
|
||||
addsd xmm1, xmm3 ; xmm1 <-- poly
|
||||
|
||||
; m*log(2)_tail + log(G)_tail - poly
|
||||
movsd xmm5, QWORD PTR __real_log2_tail
|
||||
mulsd xmm5, xmm6 ; xmm5 <-- m*log2_tail
|
||||
subsd xmm5, xmm1 ; xmm5 <-- m*log2_tail - poly
|
||||
|
||||
movsd xmm0, QWORD PTR [r9+rax*8] ; xmm0 <-- log(G)_lead
|
||||
lea rdx, QWORD PTR __log_256_tail
|
||||
movsd xmm2, QWORD PTR [rdx+rax*8] ; xmm2 <-- log(G)_tail
|
||||
addsd xmm2, xmm5 ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail
|
||||
|
||||
movsd xmm4, QWORD PTR __real_log2_lead
|
||||
mulsd xmm4, xmm6 ; xmm4 <-- m*log2_lead
|
||||
addsd xmm0, xmm4 ; xmm0 <-- m*log2_lead + log(G)_lead
|
||||
|
||||
addsd xmm0, xmm2 ; xmm0 <-- (m*log2_lead + log(G)_lead) + (m*log2_tail + log(G)_tail - poly), the result
|
||||
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
__near_one:
|
||||
|
||||
; r = x - 1.0
|
||||
movsd xmm2, QWORD PTR __real_two
|
||||
subsd xmm0, QWORD PTR __real_one ; r
|
||||
|
||||
addsd xmm2, xmm0
|
||||
movsd xmm1, xmm0
|
||||
divsd xmm1, xmm2 ; r/(2+r) = u/2
|
||||
|
||||
movsd xmm4, QWORD PTR __real_ca2
|
||||
movsd xmm5, QWORD PTR __real_ca4
|
||||
|
||||
movsd xmm6, xmm0
|
||||
mulsd xmm6, xmm1 ; correction
|
||||
|
||||
addsd xmm1, xmm1 ; u
|
||||
movsd xmm2, xmm1
|
||||
|
||||
mulsd xmm2, xmm1 ; u^2
|
||||
|
||||
mulsd xmm4, xmm2
|
||||
mulsd xmm5, xmm2
|
||||
|
||||
addsd xmm4, __real_ca1
|
||||
addsd xmm5, __real_ca3
|
||||
|
||||
mulsd xmm2, xmm1 ; u^3
|
||||
mulsd xmm4, xmm2
|
||||
|
||||
mulsd xmm2, xmm2
|
||||
mulsd xmm2, xmm1 ; u^7
|
||||
mulsd xmm5, xmm2
|
||||
|
||||
addsd xmm4, xmm5
|
||||
subsd xmm4, xmm6
|
||||
addsd xmm0, xmm4
|
||||
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
; Reached from the SSE2 path when xexp (xmm6) compares equal to -1023.0,
; i.e. the exponent field of x is zero.  On entry xmm2 holds the mantissa
; bits of x.  The mantissa is renormalised and rax / xmm2 / xmm6 are
; recomputed before rejoining the common path; xmm0 is left untouched.
__denormal_adjust:
|
||||
por xmm2, XMMWORD PTR __real_one ; xmm2 <-- mantissa bits with the exponent of 1.0
|
||||
subsd xmm2, QWORD PTR __real_one ; xmm2 <-- that value minus 1.0 (the FPU renormalises it)
|
||||
movsd xmm5, xmm2 ; xmm5 <-- copy of the renormalised value
|
||||
pand xmm2, XMMWORD PTR __real_mant ; xmm2 <-- its mantissa bits
|
||||
movd rax, xmm2 ; rax <-- the same bits, for the table index
|
||||
psrlq xmm5, 52 ; xmm5 <-- its biased exponent
|
||||
psubd xmm5, XMMWORD PTR __mask_2045 ; xmm5 <-- biased exponent - 2045
|
||||
cvtdq2pd xmm6, xmm5 ; xmm6 <-- adjusted exponent as a double (new xexp)
|
||||
jmp __continue_common
|
||||
|
||||
ALIGN 16
|
||||
; Reached via "jbe" straight after "comisd xmm0, xmm5" with xmm5 = 0.0, so
; the flags of that comparison are still valid on entry.
__x_is_zero_or_neg:
|
||||
jne __x_is_neg ; ZF clear --> x compared below 0.0
|
||||
|
||||
; x compared equal to 0.0: load -inf into xmm1 and the "x is zero" code into
; r8d, then call the special-case handler (xmm0 still holds x).
; NOTE(review): xmm1 is presumably the value to be returned -- confirm
; against the handler in the corresponding .c file.
movsd xmm1, QWORD PTR __real_ninf
|
||||
mov r8d, DWORD PTR __flag_x_zero
|
||||
call fname_special
|
||||
jmp __finish
|
||||
|
||||
ALIGN 16
|
||||
; x is negative.  Also reached from __x_is_inf_or_nan when x is -inf.
; Loads the negative quiet NaN into xmm1 and the "x is negative" code into
; r8d, then calls the special-case handler.
; NOTE(review): xmm1 is presumably the value to be returned -- confirm
; against the handler in the corresponding .c file.
__x_is_neg:
|
||||
|
||||
movsd xmm1, QWORD PTR __real_neg_qnan
|
||||
mov r8d, DWORD PTR __flag_x_neg
|
||||
call fname_special
|
||||
jmp __finish
|
||||
|
||||
ALIGN 16
|
||||
; Reached from the SSE2 path when |x|, compared as an integer, is >= the bit
; pattern of +inf, i.e. x is an infinity or a NaN.
; On entry rax = raw bits of x and xmm0 = x.
__x_is_inf_or_nan:
|
||||
|
||||
cmp rax, QWORD PTR __real_inf
|
||||
je __finish ; x is +inf: return x unchanged (already in xmm0)
|
||||
|
||||
cmp rax, QWORD PTR __real_ninf
|
||||
je __x_is_neg ; x is -inf: handled like any other negative input
|
||||
|
||||
; x is a NaN: set the quiet bit in a copy of x, pass it in xmm1 together
; with the "x is NaN" code in r8d, and call the special-case handler
or rax, QWORD PTR __real_qnanbit
|
||||
movd xmm1, rax
|
||||
mov r8d, DWORD PTR __flag_x_nan
|
||||
call fname_special
|
||||
jmp __finish
|
||||
|
||||
ALIGN 16
|
||||
__finish:
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Llog_fma3:
|
||||
; compute exponent part
|
||||
xor rax,rax
|
||||
vpsrlq xmm3,xmm0,52
|
||||
vmovq rax,xmm0
|
||||
vpsubq xmm3,xmm3,QWORD PTR __mask_1023
|
||||
vcvtdq2pd xmm6,xmm3 ; xexp
|
||||
|
||||
; NaN or inf
|
||||
vpand xmm5,xmm0,QWORD PTR __real_inf
|
||||
vcomisd xmm5,QWORD PTR __real_inf
|
||||
je Llog_fma3_x_is_inf_or_nan
|
||||
|
||||
; check for negative numbers or zero
|
||||
vpxor xmm5,xmm5,xmm5
|
||||
vcomisd xmm0,xmm5
|
||||
jbe Llog_fma3_x_is_zero_or_neg
|
||||
|
||||
vpand xmm2,xmm0,QWORD PTR __real_mant
|
||||
vsubsd xmm4,xmm0,QWORD PTR __real_one
|
||||
|
||||
vcomisd xmm6,QWORD PTR __mask_1023_f
|
||||
je Llog_fma3_denormal_adjust
|
||||
|
||||
Llog_fma3_continue_common:
|
||||
; compute index into the log tables
|
||||
vpand xmm1,xmm0,QWORD PTR __mask_mant_all8
|
||||
vpand xmm3,xmm0,QWORD PTR __mask_mant9
|
||||
vpsllq xmm3,xmm3,1
|
||||
vpaddq xmm1,xmm3,xmm1
|
||||
vmovq rax,xmm1
|
||||
|
||||
; near one codepath
|
||||
vpand xmm4,xmm4,QWORD PTR __real_notsign
|
||||
vcomisd xmm4,QWORD PTR __real_threshold
|
||||
jb Llog_fma3_near_one
|
||||
|
||||
; F,Y
|
||||
shr rax,44
|
||||
vpor xmm2,xmm2,QWORD PTR __real_half
|
||||
vpor xmm1,xmm1,QWORD PTR __real_half
|
||||
lea r9,QWORD PTR __log_F_inv_qword
|
||||
|
||||
; f = F - Y,r = f * inv
|
||||
vsubsd xmm1,xmm1,xmm2
|
||||
vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8]
|
||||
|
||||
lea r9,QWORD PTR __log_256_lead
|
||||
|
||||
; poly
|
||||
vmulsd xmm0,xmm1,xmm1 ; r*r
|
||||
vmovsd xmm3,QWORD PTR __real_1_over_6
|
||||
vmovsd xmm5,QWORD PTR __real_1_over_3
|
||||
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
|
||||
vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3
|
||||
vmovsd xmm4,xmm0,xmm0
|
||||
vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6)
|
||||
|
||||
vmulsd xmm4,xmm0,xmm0 ; r*r*r*r
|
||||
vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r
|
||||
vfmadd231sd xmm1,xmm3,xmm4
|
||||
|
||||
; m*log(2) + log(G) - poly
|
||||
vmovsd xmm5,QWORD PTR __real_log2_tail
|
||||
vfmsub213sd xmm5,xmm6,xmm1
|
||||
|
||||
vmovsd xmm0,QWORD PTR[r9 + rax * 8]
|
||||
lea rdx,QWORD PTR __log_256_tail
|
||||
vmovsd xmm1,QWORD PTR[rdx + rax * 8]
|
||||
vaddsd xmm1,xmm1,xmm5
|
||||
|
||||
vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead
|
||||
|
||||
vaddsd xmm0,xmm0,xmm1
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Llog_fma3_near_one:
|
||||
|
||||
; r = x - 1.0
|
||||
vmovsd xmm3,QWORD PTR __real_two
|
||||
vsubsd xmm0,xmm0,QWORD PTR __real_one ; r
|
||||
|
||||
vaddsd xmm3,xmm3,xmm0
|
||||
vdivsd xmm1,xmm0,xmm3 ; r/(2+r) = u/2
|
||||
|
||||
vmovsd xmm4,QWORD PTR __real_ca2
|
||||
vmovsd xmm5,QWORD PTR __real_ca4
|
||||
|
||||
vmulsd xmm3,xmm0,xmm1 ; correction
|
||||
vaddsd xmm1,xmm1,xmm1 ; u
|
||||
|
||||
vmulsd xmm2,xmm1,xmm1 ; u^2
|
||||
vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1
|
||||
vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3
|
||||
|
||||
vmulsd xmm2,xmm2,xmm1 ; u^3
|
||||
vmulsd xmm4,xmm4,xmm2
|
||||
|
||||
vmulsd xmm2,xmm2,xmm2
|
||||
vmulsd xmm2,xmm2,xmm1 ; u^7
|
||||
|
||||
vfmadd231sd xmm4,xmm5,xmm2
|
||||
vsubsd xmm4,xmm4,xmm3
|
||||
vaddsd xmm0,xmm0,xmm4
|
||||
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
; FMA3 counterpart of __denormal_adjust: reached when xexp (xmm6) compares
; equal to -1023.0, i.e. the exponent field of x is zero.  On entry xmm2
; holds the mantissa bits of x.  Unlike the SSE2 version, xmm0 is overwritten
; with the renormalised mantissa bits, because the common FMA3 path forms the
; table index from xmm0.
Llog_fma3_denormal_adjust:
|
||||
vpor xmm2,xmm2,QWORD PTR __real_one ; xmm2 <-- mantissa bits with the exponent of 1.0
|
||||
vsubsd xmm2,xmm2,QWORD PTR __real_one ; xmm2 <-- that value minus 1.0 (the FPU renormalises it)
|
||||
vpsrlq xmm5,xmm2,52 ; xmm5 <-- its biased exponent
|
||||
vpand xmm2,xmm2,QWORD PTR __real_mant ; xmm2 <-- its mantissa bits
|
||||
vmovapd xmm0,xmm2 ; xmm0 <-- the same bits, used for the table index
|
||||
vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045 ; xmm5 <-- biased exponent - 2045
|
||||
vcvtdq2pd xmm6,xmm5 ; xmm6 <-- adjusted exponent as a double (new xexp)
|
||||
jmp Llog_fma3_continue_common
|
||||
|
||||
ALIGN 16
|
||||
Llog_fma3_x_is_zero_or_neg:
|
||||
jne Llog_fma3_x_is_neg
|
||||
vmovsd xmm1,QWORD PTR __real_ninf
|
||||
mov r8d,DWORD PTR __flag_x_zero
|
||||
call fname_special
|
||||
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Llog_fma3_x_is_neg:
|
||||
|
||||
vmovsd xmm1,QWORD PTR __real_neg_qnan
|
||||
mov r8d,DWORD PTR __flag_x_neg
|
||||
call fname_special
|
||||
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
; Reached from the FMA3 path when the exponent field of x is all ones, i.e.
; x is an infinity or a NaN.  On entry rax = raw bits of x and xmm0 = x.
Llog_fma3_x_is_inf_or_nan:
|
||||
|
||||
cmp rax,QWORD PTR __real_inf
|
||||
je Llog_fma3_finish ; x is +inf: return x unchanged (already in xmm0)
|
||||
|
||||
cmp rax,QWORD PTR __real_ninf
|
||||
je Llog_fma3_x_is_neg ; x is -inf: handled like any other negative input
|
||||
|
||||
; x is a NaN: set the quiet bit in a copy of x, pass it in xmm1 together
; with the "x is NaN" code in r8d, and call the special-case handler
or rax,QWORD PTR __real_qnanbit
|
||||
vmovq xmm1,rax
|
||||
mov r8d,DWORD PTR __flag_x_nan
|
||||
call fname_special
|
||||
|
||||
; no jump here: execution falls through into the epilogue below
ALIGN 16
|
||||
Llog_fma3_finish:
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
fname endp
|
||||
|
||||
END
|
||||
|
565
sdk/lib/crt/math/libm_sse2/log10.asm
Normal file
565
sdk/lib/crt/math/libm_sse2/log10.asm
Normal file
|
@ -0,0 +1,565 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; log10.asm
|
||||
;
|
||||
; An implementation of the log10 libm function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; double log10(double x);
|
||||
;
|
||||
|
||||
;
|
||||
; Algorithm:
|
||||
; Similar to the one presented in log.asm
|
||||
;
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
|
||||
; ---------------------------------------------------------------------------
; Read-only constants for log10.
; Every 64-bit constant is followed by a zero qword, so each label names a
; 16-byte slot that can also be used as a 128-bit (XMMWORD) memory operand.
; ---------------------------------------------------------------------------

; --- special values and bit masks ---
__real_ninf         DQ 0fff0000000000000h, 0000000000000000h    ; -inf
__real_inf          DQ 7ff0000000000000h, 0000000000000000h     ; +inf (also the exponent-field mask)
__real_neg_qnan     DQ 0fff8000000000000h, 0000000000000000h    ; neg qNaN
__real_qnanbit      DQ 0008000000000000h, 0000000000000000h     ; quiet bit, OR-ed into NaN inputs
__int_1023          DQ 00000000000003ffh, 0000000000000000h     ; exponent bias
__mask_001          DQ 0000000000000001h, 0000000000000000h     ; not referenced by the code in this file

__mask_mant         DQ 000FFFFFFFFFFFFFh, 0000000000000000h     ; mask for mantissa bits
__mask_mant_top8    DQ 000ff00000000000h, 0000000000000000h     ; mask for top 8 mantissa bits
__mask_mant9        DQ 0000080000000000h, 0000000000000000h     ; mask for 9th mantissa bit (rounds the table index)

; --- log10(e) and log10(2), whole and split into lead/tail parts ---
__real_log10_e      DQ 3fdbcb7b1526e50eh, 0000000000000000h
__real_log10_e_lead DQ 3fdbcb7800000000h, 0000000000000000h     ; log10e_lead 4.34293746948242187500e-01
__real_log10_e_tail DQ 3ea8a93728719535h, 0000000000000000h     ; log10e_tail 7.3495500964015109100644e-7
__real_log10_2_lead DQ 3fd3441350000000h, 0000000000000000h
__real_log10_2_tail DQ 3e03ef3fde623e25h, 0000000000000000h

; --- small real constants ---
__real_two          DQ 4000000000000000h, 0000000000000000h     ; 2
__real_one          DQ 3ff0000000000000h, 0000000000000000h     ; 1
__real_half         DQ 3fe0000000000000h, 0000000000000000h     ; 1/2
__mask_100          DQ 0000000000000100h, 0000000000000000h     ; not referenced by the code in this file
__real_1_over_512   DQ 3f60000000000000h, 0000000000000000h     ; not referenced by the code in this file

; --- coefficients of the table-path polynomial r + r^2/2 + ... + r^6/6 ---
__real_1_over_2     DQ 3fe0000000000000h, 0000000000000000h
__real_1_over_3     DQ 3fd5555555555555h, 0000000000000000h
__real_1_over_4     DQ 3fd0000000000000h, 0000000000000000h
__real_1_over_5     DQ 3fc999999999999ah, 0000000000000000h
__real_1_over_6     DQ 3fc5555555555555h, 0000000000000000h

; --- subnormal handling ---
__real_neg_1023     DQ 0c08ff80000000000h, 0000000000000000h    ; -1023.0: unbiased exponent of a zero exponent field
__mask_2045         DQ 00000000000007fdh, 0000000000000000h     ; 2045 = 1023 + 1022, exponent rebias for subnormals

; --- near-one path ---
__real_threshold    DQ 3fb0000000000000h, 0000000000000000h     ; .0625
__real_near_one_lt  DQ 3fee000000000000h, 0000000000000000h     ; .9375  (not referenced by the code in this file)
__real_near_one_gt  DQ 3ff1000000000000h, 0000000000000000h     ; 1.0625 (not referenced by the code in this file)
__real_min_norm     DQ 0010000000000000h, 0000000000000000h     ; not referenced by the code in this file
__real_notsign      DQ 7ffFFFFFFFFFFFFFh, 0000000000000000h     ; ^sign bit

; near-one polynomial: u^3*(ca1 + ca2*u^2) + u^7*(ca3 + ca4*u^2)
__real_ca1          DQ 3fb55555555554e6h, 0000000000000000h     ; 8.33333333333317923934e-02
__real_ca2          DQ 3f89999999bac6d4h, 0000000000000000h     ; 1.25000000037717509602e-02
__real_ca3          DQ 3f62492307f1519fh, 0000000000000000h     ; 2.23213998791944806202e-03
__real_ca4          DQ 3f3c8034c85dfff0h, 0000000000000000h     ; 4.34887777707614552256e-04

; keeps the upper 32 bits of a double (clears the lower 32 mantissa bits)
__mask_lower        DQ 0ffffffff00000000h, 0000000000000000h

; these codes and the ones in the corresponding .c file have to match
__flag_x_zero       DD 00000001
__flag_x_neg        DD 00000002
__flag_x_nan        DD 00000003
|
||||
|
||||
|
||||
; Tables defined in other modules; the code below indexes each of them with
; [base + rax*8].
EXTRN __log10_256_lead:QWORD
|
||||
EXTRN __log10_256_tail:QWORD
|
||||
; multiplied with f = F - Y to form r ("r = f * inv" in the code below)
EXTRN __log_F_inv_qword:QWORD
|
||||
; non-zero selects the FMA3 code path at function entry
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
|
||||
; local variable storage offsets
|
||||
; offset handed to the SaveXmm / RestoreXmm / AVXRestoreXmm macros for xmm6
save_xmm6 EQU 20h
|
||||
; not referenced by the code in this file
dummy_space EQU 30h
|
||||
; size handed to the StackAllocate / StackDeallocate macros
stack_size EQU 058h
|
||||
|
||||
; NOTE(review): presumably supplies the StackAllocate/SaveXmm/RestoreXmm
; macros used below -- confirm against fm.inc
include fm.inc
|
||||
|
||||
; text macros so the body can refer to the function and its special-case
; handler by generic names
fname TEXTEQU <log10>
|
||||
fname_special TEXTEQU <_log10_special>
|
||||
|
||||
; called on zero, negative and NaN inputs
EXTERN fname_special:PROC
|
||||
|
||||
.code
|
||||
ALIGN 16
|
||||
PUBLIC fname
|
||||
; double log10(double x)
; In:  xmm0 = x
; Out: xmm0 = result (every return path below leaves its value in xmm0)
; xmm6 is saved in the prologue because both code paths overwrite it.
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
SaveXmm xmm6, save_xmm6
|
||||
.ENDPROLOG
|
||||
|
||||
; dispatch: a non-zero __use_fma3_lib selects the FMA3 implementation,
; otherwise execution falls through into the SSE2 implementation
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Llog10_fma3
|
||||
|
||||
Llog10_sse2:

        ; ---- raw bits and unbiased exponent of x (xmm0) -------------------
        movd        rax, xmm0                               ; rax  <-- bit pattern of x
        movapd      xmm3, xmm0
        psrlq       xmm3, 52                                ; sign and exponent field
        psubq       xmm3, XMMWORD PTR __int_1023            ; xmm3 <-- unbiased exponent
        movapd      xmm4, xmm0                              ; copy of x for (x - 1.0) below

        ; ---- exponent field all ones: x is Inf or NaN ---------------------
        movapd      xmm5, xmm0
        andpd       xmm5, XMMWORD PTR __real_inf
        comisd      xmm5, QWORD PTR __real_inf
        je          Llog10_sse2_x_is_inf_or_nan

        subsd       xmm4, QWORD PTR __real_one              ; xmm4 <-- x - 1.0
        movapd      xmm2, xmm0
        pand        xmm2, XMMWORD PTR __mask_mant           ; xmm2 <-- mantissa bits of x
        cvtdq2pd    xmm6, xmm3                              ; xmm6 <-- unbiased exp as double

        ; ---- exponent field zero: x is subnormal (or zero) ----------------
        comisd      xmm6, QWORD PTR __real_neg_1023
        je          Llog10_sse2_denormal_adjust

Llog10_sse2_continue_common:

        ; ---- table index: top 8 mantissa bits, rounded using the 9th ------
        mov         r9, rax
        and         r9, QWORD PTR __mask_mant9
        shl         r9, 1
        and         rax, QWORD PTR __mask_mant_top8
        add         rax, r9
        movd        xmm1, rax                               ; xmm1 <-- rounded mantissa bits (F)

        ; ---- |x - 1.0| below the threshold: use the near-one code path ----
        andpd       xmm4, XMMWORD PTR __real_notsign
        comisd      xmm4, QWORD PTR __real_threshold
        jb          Llog10_sse2_near_one

        ; ---- F, Y: give both mantissas the exponent of 0.5 ----------------
        por         xmm2, XMMWORD PTR __real_half           ; Y
        por         xmm1, XMMWORD PTR __real_half           ; F
        shr         rax, 44                                 ; rax <-- table index
        lea         r9, QWORD PTR __log_F_inv_qword

        ; ---- x <= 0 goes to the special-case code; that code re-tests the
        ;      flags of this comisd to tell zero from negative --------------
        xorpd       xmm5, xmm5
        comisd      xmm0, xmm5
        jbe         Llog10_sse2_x_is_zero_or_neg

        ; ---- f = F - Y, r = f * inv ---------------------------------------
        subsd       xmm1, xmm2
        mulsd       xmm1, QWORD PTR [r9+rax*8]

        ; ---- poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6 -------------
        movapd      xmm2, xmm1                              ; xmm2 <-- r
        movapd      xmm0, xmm1
        mulsd       xmm0, xmm2                              ; xmm0 <-- r^2
        movapd      xmm4, xmm0
        mulsd       xmm4, xmm0                              ; xmm4 <-- r^4

        movsd       xmm3, QWORD PTR __real_1_over_6
        mulsd       xmm3, xmm2
        addsd       xmm3, QWORD PTR __real_1_over_5         ; 1/5 + r/6
        mulsd       xmm3, xmm2
        addsd       xmm3, QWORD PTR __real_1_over_4         ; 1/4 + r/5 + r^2/6
        mulsd       xmm3, xmm4                              ; r^4 * (1/4 + r/5 + r^2/6)

        movsd       xmm1, QWORD PTR __real_1_over_3
        mulsd       xmm1, xmm2
        addsd       xmm1, QWORD PTR __real_1_over_2         ; 1/2 + r/3
        mulsd       xmm1, xmm0                              ; r^2 * (1/2 + r/3)
        addsd       xmm1, xmm2                              ; ... + r
        addsd       xmm1, xmm3                              ; xmm1 <-- poly

        ; ---- result = m*log10(2) + log10(F) - poly*log10(e), with
        ;      log10(2) and log10(F) each split into lead and tail parts ----
        movsd       xmm5, QWORD PTR __real_log10_2_tail
        mulsd       xmm5, xmm6                              ; m * log10_2_tail
        mulsd       xmm1, QWORD PTR __real_log10_e          ; poly * log10(e)
        subsd       xmm5, xmm1

        movsd       xmm4, QWORD PTR __real_log10_2_lead
        mulsd       xmm4, xmm6                              ; m * log10_2_lead

        lea         r9, QWORD PTR __log10_256_lead
        movsd       xmm0, QWORD PTR [r9+rax*8]              ; lead table entry
        addsd       xmm0, xmm4

        lea         rdx, QWORD PTR __log10_256_tail
        movsd       xmm2, QWORD PTR [rdx+rax*8]             ; tail table entry
        addsd       xmm2, xmm5

        addsd       xmm0, xmm2                              ; lead sum + tail sum

        RestoreXmm  xmm6, save_xmm6
        StackDeallocate stack_size
        ret
|
||||
|
||||
ALIGN 16
|
||||
Llog10_sse2_near_one:

        ; Reached when |x - 1.0| is below __real_threshold.
        ; With r = x - 1 and u = 2r/(2+r):
        ;   t      = u^3*(ca1 + ca2*u^2) + u^7*(ca3 + ca4*u^2) - r*u/2 + r_lo
        ;   result = (r_hi + t) * (log10_e_lead + log10_e_tail)
        ; where r_hi is r with its lower 32 bits cleared and r_lo = r - r_hi.

        subsd       xmm0, QWORD PTR __real_one              ; xmm0 <-- r
        movsd       xmm2, QWORD PTR __real_two
        addsd       xmm2, xmm0                              ; 2 + r
        movapd      xmm1, xmm0
        divsd       xmm1, xmm2                              ; r/(2+r) = u/2

        movapd      xmm6, xmm0
        mulsd       xmm6, xmm1                              ; xmm6 <-- correction = r*u/2

        addsd       xmm1, xmm1                              ; xmm1 <-- u
        movapd      xmm2, xmm1
        mulsd       xmm2, xmm1                              ; xmm2 <-- u^2

        movsd       xmm4, QWORD PTR __real_ca2
        mulsd       xmm4, xmm2
        addsd       xmm4, QWORD PTR __real_ca1              ; ca1 + ca2*u^2

        movsd       xmm5, QWORD PTR __real_ca4
        mulsd       xmm5, xmm2
        addsd       xmm5, QWORD PTR __real_ca3              ; ca3 + ca4*u^2

        mulsd       xmm2, xmm1                              ; u^3
        mulsd       xmm4, xmm2                              ; u^3*(ca1 + ca2*u^2)

        mulsd       xmm2, xmm2                              ; u^6
        mulsd       xmm2, xmm1                              ; u^7
        mulsd       xmm5, xmm2                              ; u^7*(ca3 + ca4*u^2)

        addsd       xmm4, xmm5
        subsd       xmm4, xmm6                              ; minus correction

        ; split r into r_hi (xmm3) and r_lo, and fold r_lo into t
        movapd      xmm3, xmm0
        pand        xmm3, XMMWORD PTR __mask_lower          ; xmm3 <-- r_hi
        subsd       xmm0, xmm3                              ; r_lo
        addsd       xmm4, xmm0                              ; xmm4 <-- t

        ; multiply (r_hi + t) by log10(e) = lead + tail
        movsd       xmm2, QWORD PTR __real_log10_e_tail
        movsd       xmm6, QWORD PTR __real_log10_e_lead
        movapd      xmm0, xmm3
        movapd      xmm1, xmm4

        mulsd       xmm4, xmm2                              ; t    * tail
        mulsd       xmm0, xmm2                              ; r_hi * tail
        mulsd       xmm1, xmm6                              ; t    * lead
        mulsd       xmm3, xmm6                              ; r_hi * lead

        ; sum the four products, r_hi*lead last
        addsd       xmm0, xmm4
        addsd       xmm0, xmm1
        addsd       xmm0, xmm3

        RestoreXmm  xmm6, save_xmm6
        StackDeallocate stack_size
        ret
|
||||
|
||||
Llog10_sse2_denormal_adjust:

        ; Entered with xmm2 = mantissa bits of a subnormal x.
        ; OR-ing in the bit pattern of 1.0 and subtracting 1.0 turns those
        ; bits into a normalized double; its exponent field minus 2045
        ; (= 1023 + 1022) is the unbiased exponent of the original x.
        por         xmm2, XMMWORD PTR __real_one
        subsd       xmm2, QWORD PTR __real_one

        movsd       xmm5, xmm2
        psrlq       xmm5, 52                                ; exponent field
        psubd       xmm5, XMMWORD PTR __mask_2045
        cvtdq2pd    xmm6, xmm5                              ; xmm6 <-- corrected exponent as double

        pand        xmm2, XMMWORD PTR __mask_mant           ; xmm2 <-- normalized mantissa bits
        movd        rax, xmm2                               ; rax  <-- same bits, for the table index

        jmp         Llog10_sse2_continue_common
|
||||
|
||||
ALIGN 16
|
||||
Llog10_sse2_x_is_zero_or_neg:

        ; The flags are still those of "comisd xmm0, 0.0" at the branch site:
        ; not-equal here means x < 0.
        jne         Llog10_sse2_x_is_neg

        ; x is zero: call the special-case handler with -inf in xmm1 and the
        ; "x is zero" code in r8d (xmm0 still holds x)
        mov         r8d, DWORD PTR __flag_x_zero
        movsd       xmm1, QWORD PTR __real_ninf
        call        fname_special
        jmp         Llog10_sse2_finish
|
||||
|
||||
ALIGN 16
|
||||
Llog10_sse2_x_is_neg:

        ; x is negative (including -inf): call the special-case handler with
        ; a negative quiet NaN in xmm1 and the "x is negative" code in r8d
        mov         r8d, DWORD PTR __flag_x_neg
        movsd       xmm1, QWORD PTR __real_neg_qnan
        call        fname_special
        jmp         Llog10_sse2_finish
|
||||
|
||||
ALIGN 16
|
||||
Llog10_sse2_x_is_inf_or_nan:

        ; rax holds the bit pattern of x, whose exponent field is all ones.

        ; -inf is handled like any other negative input
        cmp         rax, QWORD PTR __real_ninf
        je          Llog10_sse2_x_is_neg

        ; +inf is returned unchanged (xmm0 still holds x)
        cmp         rax, QWORD PTR __real_inf
        je          Llog10_sse2_finish

        ; anything else is a NaN: set its quiet bit and pass it to the
        ; special-case handler in xmm1, with the "x is NaN" code in r8d
        or          rax, QWORD PTR __real_qnanbit
        mov         r8d, DWORD PTR __flag_x_nan
        movd        xmm1, rax
        call        fname_special
        jmp         Llog10_sse2_finish
|
||||
|
||||
ALIGN 16
|
||||
; Common exit of the SSE2 special-case paths: undo the prologue and return
; whatever is in xmm0.
Llog10_sse2_finish:
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Llog10_fma3:
        ; FMA3 implementation of log10. This path is VEX-encoded throughout:
        ; no legacy-SSE encodings, so it cannot trigger SSE/AVX transition
        ; penalties when the caller left upper YMM state dirty.
        ; 128-bit memory operands carry XMMWORD PTR, as in the SSE2 path.
        ;
        ; Register roles on the main path:
        ;   rax  = bit pattern of x, later the table index
        ;   xmm6 = unbiased exponent m as a double
        ;   xmm2 = mantissa bits (Y), xmm1 = rounded mantissa bits (F), then r

        ; compute exponent part
        vmovq       rax, xmm0                               ; rax  <-- bit pattern of x
        vpsrlq      xmm3, xmm0, 52
        vpsubq      xmm3, xmm3, XMMWORD PTR __int_1023
        vcvtdq2pd   xmm6, xmm3                              ; xmm6 <-- (double)xexp

        ; NaN or Inf?
        vpand       xmm5, xmm0, XMMWORD PTR __real_inf
        vcomisd     xmm5, QWORD PTR __real_inf
        je          Llog10_fma3_x_is_inf_or_nan

        ; negative number or zero?
        ; (the target re-tests these flags to tell zero from negative)
        vpxor       xmm5, xmm5, xmm5
        vcomisd     xmm0, xmm5
        jbe         Llog10_fma3_x_is_zero_or_neg

        vpand       xmm2, xmm0, XMMWORD PTR __mask_mant     ; xmm2 <-- mantissa bits
        vsubsd      xmm4, xmm0, QWORD PTR __real_one        ; xmm4 <-- x - 1.0

        ; Subnormal?
        vcomisd     xmm6, QWORD PTR __real_neg_1023
        je          Llog10_fma3_denormal_adjust

Llog10_fma3_continue_common:
        ; compute index into the log tables:
        ; top 8 mantissa bits, rounded using the 9th
        vpand       xmm1, xmm0, XMMWORD PTR __mask_mant_top8
        vpand       xmm3, xmm0, XMMWORD PTR __mask_mant9
        vpsllq      xmm3, xmm3, 1
        vpaddq      xmm1, xmm3, xmm1
        vmovq       rax, xmm1

        ; near one codepath
        vpand       xmm4, xmm4, XMMWORD PTR __real_notsign  ; |x - 1.0|
        vcomisd     xmm4, QWORD PTR __real_threshold
        jb          Llog10_fma3_near_one

        ; F,Y
        shr         rax, 44                                 ; rax <-- table index
        vpor        xmm2, xmm2, XMMWORD PTR __real_half
        vpor        xmm1, xmm1, XMMWORD PTR __real_half
        lea         r9, QWORD PTR __log_F_inv_qword

        ; f = F - Y, r = f * inv
        vsubsd      xmm1, xmm1, xmm2
        vmulsd      xmm1, xmm1, QWORD PTR [r9 + rax * 8]

        lea         r9, QWORD PTR __log10_256_lead

        ; poly = r + r^2*(1/2 + r/3) + r^4*(1/4 + r/5 + r^2/6)
        vmulsd      xmm0, xmm1, xmm1                        ; r*r
        vmovsd      xmm3, QWORD PTR __real_1_over_6
        vmovsd      xmm5, QWORD PTR __real_1_over_3
        vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_over_5   ; r*1/6 + 1/5
        vfmadd213sd xmm5, xmm1, QWORD PTR __real_half       ; 1/2 + r*1/3
        vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_over_4   ; 1/4 + (1/5*r + r*r*1/6)

        vmulsd      xmm4, xmm0, xmm0                        ; r*r*r*r
        vfmadd231sd xmm1, xmm5, xmm0                        ; r*r*(1/2 + r*1/3) + r
        vfmadd231sd xmm1, xmm3, xmm4                        ; xmm1 <-- poly

        vmulsd      xmm1, xmm1, QWORD PTR __real_log10_e    ; poly*log10(e)

        ; m*log10(2) + log10(F) - poly*log10(e), with log10(2) and log10(F)
        ; each split into lead and tail parts
        vmovsd      xmm5, QWORD PTR __real_log10_2_tail
        vfmsub213sd xmm5, xmm6, xmm1                        ; m*log10_2_tail - poly*log10(e)

        vmovsd      xmm0, QWORD PTR [r9 + rax * 8]          ; lead table entry
        lea         rdx, QWORD PTR __log10_256_tail
        vmovsd      xmm2, QWORD PTR [rdx + rax * 8]         ; tail table entry
        vaddsd      xmm2, xmm2, xmm5

        vfmadd231sd xmm0, xmm6, QWORD PTR __real_log10_2_lead ; += m*log10_2_lead

        vaddsd      xmm0, xmm0, xmm2                        ; lead sum + tail sum
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Llog10_fma3_near_one:
        ; Reached when |x - 1.0| is below __real_threshold.
        ; With r = x - 1 and u = 2r/(2+r):
        ;   t      = u^3*(ca1 + ca2*u^2) + u^7*(ca3 + ca4*u^2) - r*u/2 + r_lo
        ;   result = (r_hi + t) * (log10_e_lead + log10_e_tail)
        ; where r_hi is r with its lower 32 bits cleared and r_lo = r - r_hi.

        vsubsd      xmm0, xmm0, QWORD PTR __real_one        ; xmm0 <-- r
        vmovsd      xmm2, QWORD PTR __real_two
        vaddsd      xmm2, xmm2, xmm0                        ; 2 + r
        vdivsd      xmm1, xmm0, xmm2                        ; r/(2+r) = u/2

        vmulsd      xmm6, xmm0, xmm1                        ; xmm6 <-- correction = r*u/2
        vaddsd      xmm1, xmm1, xmm1                        ; xmm1 <-- u
        vmulsd      xmm2, xmm1, xmm1                        ; xmm2 <-- u^2

        vmovsd      xmm4, QWORD PTR __real_ca2
        vfmadd213sd xmm4, xmm2, QWORD PTR __real_ca1        ; ca1 + ca2*u^2
        vmovsd      xmm5, QWORD PTR __real_ca4
        vfmadd213sd xmm5, xmm2, QWORD PTR __real_ca3        ; ca3 + ca4*u^2

        vmulsd      xmm2, xmm2, xmm1                        ; u^3
        vmulsd      xmm4, xmm4, xmm2                        ; u^3*(ca1 + ca2*u^2)

        vmulsd      xmm2, xmm2, xmm2                        ; u^6
        vmulsd      xmm2, xmm2, xmm1                        ; u^7
        vmulsd      xmm5, xmm5, xmm2                        ; u^7*(ca3 + ca4*u^2)

        vaddsd      xmm4, xmm4, xmm5
        vsubsd      xmm4, xmm4, xmm6                        ; minus correction

        ; split r into r_hi (xmm3) and r_lo, and fold r_lo into t
        vpand       xmm3, xmm0, QWORD PTR __mask_lower      ; xmm3 <-- r_hi
        vsubsd      xmm0, xmm0, xmm3                        ; r_lo
        vaddsd      xmm4, xmm4, xmm0                        ; xmm4 <-- t

        ; multiply (r_hi + t) by log10(e) = lead + tail
        vmulsd      xmm0, xmm3, QWORD PTR __real_log10_e_tail   ; r_hi * tail
        vmulsd      xmm1, xmm4, QWORD PTR __real_log10_e_lead   ; t    * lead
        vmulsd      xmm4, xmm4, QWORD PTR __real_log10_e_tail   ; t    * tail
        vmulsd      xmm3, xmm3, QWORD PTR __real_log10_e_lead   ; r_hi * lead

        ; sum the four products, r_hi*lead last
        vaddsd      xmm0, xmm0, xmm4
        vaddsd      xmm0, xmm0, xmm1
        vaddsd      xmm0, xmm0, xmm3

        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret
|
||||
|
||||
|
||||
Llog10_fma3_denormal_adjust:
        ; Entered with xmm2 = mantissa bits of a subnormal x.
        ; OR-ing in the bit pattern of 1.0 and subtracting 1.0 turns those
        ; bits into a normalized double; its exponent field minus 2045
        ; (= 1023 + 1022) is the unbiased exponent of the original x.
        vpor        xmm2, xmm2, QWORD PTR __real_one
        vsubsd      xmm2, xmm2, QWORD PTR __real_one

        vpsrlq      xmm5, xmm2, 52                          ; exponent field
        vpsubd      xmm5, xmm5, DWORD PTR __mask_2045
        vcvtdq2pd   xmm6, xmm5                              ; xmm6 <-- corrected exponent as double

        vpand       xmm2, xmm2, QWORD PTR __mask_mant       ; xmm2 <-- normalized mantissa bits
        vmovapd     xmm0, xmm2                              ; the common code takes the index from xmm0

        jmp         Llog10_fma3_continue_common
|
||||
|
||||
ALIGN 16
|
||||
Llog10_fma3_x_is_zero_or_neg:
        ; The flags are still those of "vcomisd xmm0, 0.0" at the branch
        ; site: not-equal here means x < 0.
        jne         Llog10_fma3_x_is_neg

        ; x is zero: call the special-case handler with -inf in xmm1 and the
        ; "x is zero" code in r8d (xmm0 still holds x)
        mov         r8d, DWORD PTR __flag_x_zero
        vmovsd      xmm1, QWORD PTR __real_ninf
        call        fname_special

        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Llog10_fma3_x_is_neg:
        ; x is negative (including -inf): call the special-case handler with
        ; a negative quiet NaN in xmm1 and the "x is negative" code in r8d
        mov         r8d, DWORD PTR __flag_x_neg
        vmovsd      xmm1, QWORD PTR __real_neg_qnan
        call        fname_special

        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Llog10_fma3_x_is_inf_or_nan:
        ; rax holds the bit pattern of x, whose exponent field is all ones.

        ; +inf is returned unchanged (xmm0 still holds x)
        cmp         rax, QWORD PTR __real_inf
        je          Llog10_fma3_finish

        ; -inf is handled like any other negative input
        cmp         rax, QWORD PTR __real_ninf
        je          Llog10_fma3_x_is_neg

        ; anything else is a NaN: set its quiet bit and pass it to the
        ; special-case handler in xmm1, with the "x is NaN" code in r8d.
        ; vmovq keeps this path VEX-encoded, matching the same block in
        ; log.asm (the original used the legacy-SSE movd here).
        or          rax, QWORD PTR __real_qnanbit
        vmovq       xmm1, rax
        mov         r8d, DWORD PTR __flag_x_nan
        call        fname_special
        jmp         Llog10_fma3_finish
|
||||
|
||||
ALIGN 16
|
||||
; Common exit of the FMA3 Inf/NaN paths: undo the prologue and return
; whatever is in xmm0.
Llog10_fma3_finish:
|
||||
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
fname endp
|
||||
|
||||
END
|
||||
|
297
sdk/lib/crt/math/libm_sse2/log10_128_lead_tail_table.asm
Normal file
297
sdk/lib/crt/math/libm_sse2/log10_128_lead_tail_table.asm
Normal file
|
@ -0,0 +1,297 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;;
|
||||
;; Defines __log10_128_lead and __log10_128_tail tables
|
||||
;; Used by log and pow
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __log10_128_lead
|
||||
; 129 DD (32-bit) entries. The first is 0 and the last, 3e9a0000h, is
; 0.30078125 as a single-precision value.
; NOTE(review): presumably the leading part of log10(1 + j/128), j = 0..128,
; stored as single-precision bit patterns -- confirm against the consumer.
__log10_128_lead:
|
||||
DD 00000000h
|
||||
DD 3b5d4000h
|
||||
DD 3bdc8000h
|
||||
DD 3c24c000h
|
||||
DD 3c5ac000h
|
||||
DD 3c884000h
|
||||
DD 3ca2c000h
|
||||
DD 3cbd4000h
|
||||
DD 3cd78000h
|
||||
DD 3cf1c000h
|
||||
DD 3d05c000h
|
||||
DD 3d128000h
|
||||
DD 3d1f4000h
|
||||
DD 3d2c0000h
|
||||
DD 3d388000h
|
||||
DD 3d450000h
|
||||
DD 3d518000h
|
||||
DD 3d5dc000h
|
||||
DD 3d6a0000h
|
||||
DD 3d760000h
|
||||
DD 3d810000h
|
||||
DD 3d870000h
|
||||
DD 3d8d0000h
|
||||
DD 3d92c000h
|
||||
DD 3d98c000h
|
||||
DD 3d9e8000h
|
||||
DD 3da44000h
|
||||
DD 3daa0000h
|
||||
DD 3dafc000h
|
||||
DD 3db58000h
|
||||
DD 3dbb4000h
|
||||
DD 3dc0c000h
|
||||
DD 3dc64000h
|
||||
DD 3dcc0000h
|
||||
DD 3dd18000h
|
||||
DD 3dd6c000h
|
||||
DD 3ddc4000h
|
||||
DD 3de1c000h
|
||||
DD 3de70000h
|
||||
DD 3dec8000h
|
||||
DD 3df1c000h
|
||||
DD 3df70000h
|
||||
DD 3dfc4000h
|
||||
DD 3e00c000h
|
||||
DD 3e034000h
|
||||
DD 3e05c000h
|
||||
DD 3e088000h
|
||||
DD 3e0b0000h
|
||||
DD 3e0d8000h
|
||||
DD 3e100000h
|
||||
DD 3e128000h
|
||||
DD 3e150000h
|
||||
DD 3e178000h
|
||||
DD 3e1a0000h
|
||||
DD 3e1c8000h
|
||||
DD 3e1ec000h
|
||||
DD 3e214000h
|
||||
DD 3e23c000h
|
||||
DD 3e260000h
|
||||
DD 3e288000h
|
||||
DD 3e2ac000h
|
||||
DD 3e2d4000h
|
||||
DD 3e2f8000h
|
||||
DD 3e31c000h
|
||||
DD 3e344000h
|
||||
DD 3e368000h
|
||||
DD 3e38c000h
|
||||
DD 3e3b0000h
|
||||
DD 3e3d4000h
|
||||
DD 3e3fc000h
|
||||
DD 3e420000h
|
||||
DD 3e440000h
|
||||
DD 3e464000h
|
||||
DD 3e488000h
|
||||
DD 3e4ac000h
|
||||
DD 3e4d0000h
|
||||
DD 3e4f4000h
|
||||
DD 3e514000h
|
||||
DD 3e538000h
|
||||
DD 3e55c000h
|
||||
DD 3e57c000h
|
||||
DD 3e5a0000h
|
||||
DD 3e5c0000h
|
||||
DD 3e5e4000h
|
||||
DD 3e604000h
|
||||
DD 3e624000h
|
||||
DD 3e648000h
|
||||
DD 3e668000h
|
||||
DD 3e688000h
|
||||
DD 3e6ac000h
|
||||
DD 3e6cc000h
|
||||
DD 3e6ec000h
|
||||
DD 3e70c000h
|
||||
DD 3e72c000h
|
||||
DD 3e74c000h
|
||||
DD 3e76c000h
|
||||
DD 3e78c000h
|
||||
DD 3e7ac000h
|
||||
DD 3e7cc000h
|
||||
DD 3e7ec000h
|
||||
DD 3e804000h
|
||||
DD 3e814000h
|
||||
DD 3e824000h
|
||||
DD 3e834000h
|
||||
DD 3e840000h
|
||||
DD 3e850000h
|
||||
DD 3e860000h
|
||||
DD 3e870000h
|
||||
DD 3e880000h
|
||||
DD 3e88c000h
|
||||
DD 3e89c000h
|
||||
DD 3e8ac000h
|
||||
DD 3e8bc000h
|
||||
DD 3e8c8000h
|
||||
DD 3e8d8000h
|
||||
DD 3e8e8000h
|
||||
DD 3e8f4000h
|
||||
DD 3e904000h
|
||||
DD 3e914000h
|
||||
DD 3e920000h
|
||||
DD 3e930000h
|
||||
DD 3e93c000h
|
||||
DD 3e94c000h
|
||||
DD 3e958000h
|
||||
DD 3e968000h
|
||||
DD 3e978000h
|
||||
DD 3e984000h
|
||||
DD 3e994000h
|
||||
DD 3e9a0000h
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __log10_128_tail
|
||||
; 129 DD (32-bit) entries, one per entry of __log10_128_lead.
; NOTE(review): presumably the low-order correction to add to the matching
; lead entry -- confirm against the consumer.
__log10_128_tail:
|
||||
DD 00000000h
|
||||
DD 367a8e44h
|
||||
DD 368ed49fh
|
||||
DD 36c21451h
|
||||
DD 375211d6h
|
||||
DD 3720ea11h
|
||||
DD 37e9eb59h
|
||||
DD 37b87be7h
|
||||
DD 37bf2560h
|
||||
DD 33d597a0h
|
||||
DD 37806a05h
|
||||
DD 3820581fh
|
||||
DD 38223334h
|
||||
DD 378e3bach
|
||||
DD 3810684fh
|
||||
DD 37feb7aeh
|
||||
DD 36a9d609h
|
||||
DD 37a68163h
|
||||
DD 376a8b27h
|
||||
DD 384c8fd6h
|
||||
DD 3885183eh
|
||||
DD 3874a760h
|
||||
DD 380d1154h
|
||||
DD 38ea42bdh
|
||||
DD 384c1571h
|
||||
DD 38ba66b8h
|
||||
DD 38e7da3bh
|
||||
DD 38eee632h
|
||||
DD 38d00911h
|
||||
DD 388bbedeh
|
||||
DD 378a0512h
|
||||
DD 3894c7a0h
|
||||
DD 38e30710h
|
||||
DD 36db2829h
|
||||
DD 3729d609h
|
||||
DD 38fa0e82h
|
||||
DD 38bc9a75h
|
||||
DD 383a9297h
|
||||
DD 38dc83c8h
|
||||
DD 37eac335h
|
||||
DD 38706ac3h
|
||||
DD 389574c2h
|
||||
DD 3892d068h
|
||||
DD 38615032h
|
||||
DD 3917acf4h
|
||||
DD 3967a126h
|
||||
DD 38217840h
|
||||
DD 38b420abh
|
||||
DD 38f9c7b2h
|
||||
DD 391103bdh
|
||||
DD 39169a6bh
|
||||
DD 390dd194h
|
||||
DD 38eda471h
|
||||
DD 38a38950h
|
||||
DD 37f6844ah
|
||||
DD 395e1cdbh
|
||||
DD 390fcffch
|
||||
DD 38503e9dh
|
||||
DD 394b00fdh
|
||||
DD 38a9910ah
|
||||
DD 39518a31h
|
||||
DD 3882d2c2h
|
||||
DD 392488e4h
|
||||
DD 397b0affh
|
||||
DD 388a22d8h
|
||||
DD 3902bd5eh
|
||||
DD 39342f85h
|
||||
DD 39598811h
|
||||
DD 3972e6b1h
|
||||
DD 34d53654h
|
||||
DD 360ca25eh
|
||||
DD 39785cc0h
|
||||
DD 39630710h
|
||||
DD 39424ed7h
|
||||
DD 39165101h
|
||||
DD 38be5421h
|
||||
DD 37e7b0c0h
|
||||
DD 394fd0c3h
|
||||
DD 38efaaaah
|
||||
DD 37a8f566h
|
||||
DD 3927c744h
|
||||
DD 383fa4d5h
|
||||
DD 392d9e39h
|
||||
DD 3803feaeh
|
||||
DD 390a268ch
|
||||
DD 39692b80h
|
||||
DD 38789b4fh
|
||||
DD 3909307dh
|
||||
DD 394a601ch
|
||||
DD 35e67edch
|
||||
DD 383e386dh
|
||||
DD 38a7743dh
|
||||
DD 38dccec3h
|
||||
DD 38ff57e0h
|
||||
DD 39079d8bh
|
||||
DD 390651a6h
|
||||
DD 38f7bad9h
|
||||
DD 38d0ab82h
|
||||
DD 38979e7dh
|
||||
DD 381978eeh
|
||||
DD 397816c8h
|
||||
DD 39410cb2h
|
||||
DD 39015384h
|
||||
DD 3863fa28h
|
||||
DD 39f41065h
|
||||
DD 39c7668ah
|
||||
DD 39968afah
|
||||
DD 39430db9h
|
||||
DD 38a18cf3h
|
||||
DD 39eb2907h
|
||||
DD 39a9e10ch
|
||||
DD 39492800h
|
||||
DD 385a53d1h
|
||||
DD 39ce0cf7h
|
||||
DD 3979c7b2h
|
||||
DD 389f5d99h
|
||||
DD 39ceefcbh
|
||||
DD 39646a39h
|
||||
DD 380d7a9bh
|
||||
DD 39ad6650h
|
||||
DD 390ac3b8h
|
||||
DD 39d9a9a8h
|
||||
DD 39548a99h
|
||||
DD 39f73c4bh
|
||||
DD 3980960eh
|
||||
DD 374b3d5ah
|
||||
DD 39888f1eh
|
||||
DD 37679a07h
|
||||
DD 39826a13h
|
||||
END
|
552
sdk/lib/crt/math/libm_sse2/log10_256_lead_tail_table.asm
Normal file
552
sdk/lib/crt/math/libm_sse2/log10_256_lead_tail_table.asm
Normal file
|
@ -0,0 +1,552 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;;
|
||||
;; Defines __log10_256_lead and __log10_256_tail tables
|
||||
;; Used by log10 (log10.asm imports both tables via EXTRN)
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __log10_256_lead
|
||||
; 257 DQ (64-bit) entries, read by log10.asm as [table + index*8].
; The first entry is 0 and the last, 3fd3441350000000h, is the same bit
; pattern as __real_log10_2_lead in log10.asm.
; NOTE(review): presumably the leading part of log10(1 + j/256), j = 0..256
; -- confirm against the table generator.
__log10_256_lead:
|
||||
DQ 0000000000000000h
|
||||
DQ 3f5bbd9e90000000h
|
||||
DQ 3f6bafd470000000h
|
||||
DQ 3f74b99560000000h
|
||||
DQ 3f7b9476a0000000h
|
||||
DQ 3f81344da0000000h
|
||||
DQ 3f849b0850000000h
|
||||
DQ 3f87fe71c0000000h
|
||||
DQ 3f8b5e9080000000h
|
||||
DQ 3f8ebb6af0000000h
|
||||
DQ 3f910a83a0000000h
|
||||
DQ 3f92b5b5e0000000h
|
||||
DQ 3f945f4f50000000h
|
||||
DQ 3f96075300000000h
|
||||
DQ 3f97adc3d0000000h
|
||||
DQ 3f9952a4f0000000h
|
||||
DQ 3f9af5f920000000h
|
||||
DQ 3f9c97c370000000h
|
||||
DQ 3f9e3806a0000000h
|
||||
DQ 3f9fd6c5b0000000h
|
||||
DQ 3fa0ba01a0000000h
|
||||
DQ 3fa187e120000000h
|
||||
DQ 3fa25502c0000000h
|
||||
DQ 3fa32167c0000000h
|
||||
DQ 3fa3ed1190000000h
|
||||
DQ 3fa4b80180000000h
|
||||
DQ 3fa58238e0000000h
|
||||
DQ 3fa64bb910000000h
|
||||
DQ 3fa7148340000000h
|
||||
DQ 3fa7dc98c0000000h
|
||||
DQ 3fa8a3fad0000000h
|
||||
DQ 3fa96aaac0000000h
|
||||
DQ 3faa30a9d0000000h
|
||||
DQ 3faaf5f920000000h
|
||||
DQ 3fabba9a00000000h
|
||||
DQ 3fac7e8d90000000h
|
||||
DQ 3fad41d510000000h
|
||||
DQ 3fae0471a0000000h
|
||||
DQ 3faec66470000000h
|
||||
DQ 3faf87aeb0000000h
|
||||
DQ 3fb02428c0000000h
|
||||
DQ 3fb08426f0000000h
|
||||
DQ 3fb0e3d290000000h
|
||||
DQ 3fb1432c30000000h
|
||||
DQ 3fb1a23440000000h
|
||||
DQ 3fb200eb60000000h
|
||||
DQ 3fb25f5210000000h
|
||||
DQ 3fb2bd68e0000000h
|
||||
DQ 3fb31b3050000000h
|
||||
DQ 3fb378a8e0000000h
|
||||
DQ 3fb3d5d330000000h
|
||||
DQ 3fb432afa0000000h
|
||||
DQ 3fb48f3ed0000000h
|
||||
DQ 3fb4eb8120000000h
|
||||
DQ 3fb5477730000000h
|
||||
DQ 3fb5a32160000000h
|
||||
DQ 3fb5fe8040000000h
|
||||
DQ 3fb6599440000000h
|
||||
DQ 3fb6b45df0000000h
|
||||
DQ 3fb70eddb0000000h
|
||||
DQ 3fb7691400000000h
|
||||
DQ 3fb7c30160000000h
|
||||
DQ 3fb81ca630000000h
|
||||
DQ 3fb8760300000000h
|
||||
DQ 3fb8cf1830000000h
|
||||
DQ 3fb927e640000000h
|
||||
DQ 3fb9806d90000000h
|
||||
DQ 3fb9d8aea0000000h
|
||||
DQ 3fba30a9d0000000h
|
||||
DQ 3fba885fa0000000h
|
||||
DQ 3fbadfd070000000h
|
||||
DQ 3fbb36fcb0000000h
|
||||
DQ 3fbb8de4d0000000h
|
||||
DQ 3fbbe48930000000h
|
||||
DQ 3fbc3aea40000000h
|
||||
DQ 3fbc910870000000h
|
||||
DQ 3fbce6e410000000h
|
||||
DQ 3fbd3c7da0000000h
|
||||
DQ 3fbd91d580000000h
|
||||
DQ 3fbde6ec00000000h
|
||||
DQ 3fbe3bc1a0000000h
|
||||
DQ 3fbe9056b0000000h
|
||||
DQ 3fbee4aba0000000h
|
||||
DQ 3fbf38c0c0000000h
|
||||
DQ 3fbf8c9680000000h
|
||||
DQ 3fbfe02d30000000h
|
||||
DQ 3fc019c2a0000000h
|
||||
DQ 3fc0434f70000000h
|
||||
DQ 3fc06cbd60000000h
|
||||
DQ 3fc0960c80000000h
|
||||
DQ 3fc0bf3d00000000h
|
||||
DQ 3fc0e84f10000000h
|
||||
DQ 3fc11142f0000000h
|
||||
DQ 3fc13a18a0000000h
|
||||
DQ 3fc162d080000000h
|
||||
DQ 3fc18b6a90000000h
|
||||
DQ 3fc1b3e710000000h
|
||||
DQ 3fc1dc4630000000h
|
||||
DQ 3fc2048810000000h
|
||||
DQ 3fc22cace0000000h
|
||||
DQ 3fc254b4d0000000h
|
||||
DQ 3fc27c9ff0000000h
|
||||
DQ 3fc2a46e80000000h
|
||||
DQ 3fc2cc20b0000000h
|
||||
DQ 3fc2f3b690000000h
|
||||
DQ 3fc31b3050000000h
|
||||
DQ 3fc3428e20000000h
|
||||
DQ 3fc369d020000000h
|
||||
DQ 3fc390f680000000h
|
||||
DQ 3fc3b80160000000h
|
||||
DQ 3fc3def0e0000000h
|
||||
DQ 3fc405c530000000h
|
||||
DQ 3fc42c7e70000000h
|
||||
DQ 3fc4531cd0000000h
|
||||
DQ 3fc479a070000000h
|
||||
DQ 3fc4a00970000000h
|
||||
DQ 3fc4c65800000000h
|
||||
DQ 3fc4ec8c30000000h
|
||||
DQ 3fc512a640000000h
|
||||
DQ 3fc538a630000000h
|
||||
DQ 3fc55e8c50000000h
|
||||
DQ 3fc5845890000000h
|
||||
DQ 3fc5aa0b40000000h
|
||||
DQ 3fc5cfa470000000h
|
||||
DQ 3fc5f52440000000h
|
||||
DQ 3fc61a8ad0000000h
|
||||
DQ 3fc63fd850000000h
|
||||
DQ 3fc6650cd0000000h
|
||||
DQ 3fc68a2880000000h
|
||||
DQ 3fc6af2b80000000h
|
||||
DQ 3fc6d415e0000000h
|
||||
DQ 3fc6f8e7d0000000h
|
||||
DQ 3fc71da170000000h
|
||||
DQ 3fc74242e0000000h
|
||||
DQ 3fc766cc40000000h
|
||||
DQ 3fc78b3da0000000h
|
||||
DQ 3fc7af9730000000h
|
||||
DQ 3fc7d3d910000000h
|
||||
DQ 3fc7f80350000000h
|
||||
DQ 3fc81c1620000000h
|
||||
DQ 3fc8401190000000h
|
||||
DQ 3fc863f5c0000000h
|
||||
DQ 3fc887c2e0000000h
|
||||
DQ 3fc8ab7900000000h
|
||||
DQ 3fc8cf1830000000h
|
||||
DQ 3fc8f2a0a0000000h
|
||||
DQ 3fc9161270000000h
|
||||
DQ 3fc9396db0000000h
|
||||
DQ 3fc95cb280000000h
|
||||
DQ 3fc97fe100000000h
|
||||
DQ 3fc9a2f950000000h
|
||||
DQ 3fc9c5fb70000000h
|
||||
DQ 3fc9e8e7b0000000h
|
||||
DQ 3fca0bbdf0000000h
|
||||
DQ 3fca2e7e80000000h
|
||||
DQ 3fca512960000000h
|
||||
DQ 3fca73bea0000000h
|
||||
DQ 3fca963e70000000h
|
||||
DQ 3fcab8a8f0000000h
|
||||
DQ 3fcadafe20000000h
|
||||
DQ 3fcafd3e30000000h
|
||||
DQ 3fcb1f6930000000h
|
||||
DQ 3fcb417f40000000h
|
||||
DQ 3fcb638070000000h
|
||||
DQ 3fcb856cf0000000h
|
||||
DQ 3fcba744b0000000h
|
||||
DQ 3fcbc907f0000000h
|
||||
DQ 3fcbeab6c0000000h
|
||||
DQ 3fcc0c5130000000h
|
||||
DQ 3fcc2dd750000000h
|
||||
DQ 3fcc4f4950000000h
|
||||
DQ 3fcc70a740000000h
|
||||
DQ 3fcc91f130000000h
|
||||
DQ 3fccb32740000000h
|
||||
DQ 3fccd44980000000h
|
||||
DQ 3fccf55810000000h
|
||||
DQ 3fcd165300000000h
|
||||
DQ 3fcd373a60000000h
|
||||
DQ 3fcd580e60000000h
|
||||
DQ 3fcd78cf00000000h
|
||||
DQ 3fcd997c70000000h
|
||||
DQ 3fcdba16a0000000h
|
||||
DQ 3fcdda9dd0000000h
|
||||
DQ 3fcdfb11f0000000h
|
||||
DQ 3fce1b7330000000h
|
||||
DQ 3fce3bc1a0000000h
|
||||
DQ 3fce5bfd50000000h
|
||||
DQ 3fce7c2660000000h
|
||||
DQ 3fce9c3ce0000000h
|
||||
DQ 3fcebc40e0000000h
|
||||
DQ 3fcedc3280000000h
|
||||
DQ 3fcefc11d0000000h
|
||||
DQ 3fcf1bdee0000000h
|
||||
DQ 3fcf3b99d0000000h
|
||||
DQ 3fcf5b42a0000000h
|
||||
DQ 3fcf7ad980000000h
|
||||
DQ 3fcf9a5e70000000h
|
||||
DQ 3fcfb9d190000000h
|
||||
DQ 3fcfd932f0000000h
|
||||
DQ 3fcff882a0000000h
|
||||
DQ 3fd00be050000000h
|
||||
DQ 3fd01b76a0000000h
|
||||
DQ 3fd02b0430000000h
|
||||
DQ 3fd03a8910000000h
|
||||
DQ 3fd04a0540000000h
|
||||
DQ 3fd05978e0000000h
|
||||
DQ 3fd068e3f0000000h
|
||||
DQ 3fd0784670000000h
|
||||
DQ 3fd087a080000000h
|
||||
DQ 3fd096f210000000h
|
||||
DQ 3fd0a63b30000000h
|
||||
DQ 3fd0b57bf0000000h
|
||||
DQ 3fd0c4b450000000h
|
||||
DQ 3fd0d3e460000000h
|
||||
DQ 3fd0e30c30000000h
|
||||
DQ 3fd0f22bc0000000h
|
||||
DQ 3fd1014310000000h
|
||||
DQ 3fd1105240000000h
|
||||
DQ 3fd11f5940000000h
|
||||
DQ 3fd12e5830000000h
|
||||
DQ 3fd13d4f00000000h
|
||||
DQ 3fd14c3dd0000000h
|
||||
DQ 3fd15b24a0000000h
|
||||
DQ 3fd16a0370000000h
|
||||
DQ 3fd178da50000000h
|
||||
DQ 3fd187a940000000h
|
||||
DQ 3fd1967060000000h
|
||||
DQ 3fd1a52fa0000000h
|
||||
DQ 3fd1b3e710000000h
|
||||
DQ 3fd1c296c0000000h
|
||||
DQ 3fd1d13eb0000000h
|
||||
DQ 3fd1dfdef0000000h
|
||||
DQ 3fd1ee7770000000h
|
||||
DQ 3fd1fd0860000000h
|
||||
DQ 3fd20b91a0000000h
|
||||
DQ 3fd21a1350000000h
|
||||
DQ 3fd2288d70000000h
|
||||
DQ 3fd2370010000000h
|
||||
DQ 3fd2456b30000000h
|
||||
DQ 3fd253ced0000000h
|
||||
DQ 3fd2622b00000000h
|
||||
DQ 3fd2707fd0000000h
|
||||
DQ 3fd27ecd40000000h
|
||||
DQ 3fd28d1360000000h
|
||||
DQ 3fd29b5220000000h
|
||||
DQ 3fd2a989a0000000h
|
||||
DQ 3fd2b7b9e0000000h
|
||||
DQ 3fd2c5e2e0000000h
|
||||
DQ 3fd2d404b0000000h
|
||||
DQ 3fd2e21f50000000h
|
||||
DQ 3fd2f032c0000000h
|
||||
DQ 3fd2fe3f20000000h
|
||||
DQ 3fd30c4470000000h
|
||||
DQ 3fd31a42b0000000h
|
||||
DQ 3fd32839e0000000h
|
||||
DQ 3fd3362a10000000h
|
||||
DQ 3fd3441350000000h
|
||||
ALIGN 16
|
||||
PUBLIC __log10_256_tail
|
||||
__log10_256_tail:
|
||||
DQ 0000000000000000h
|
||||
DQ 3db20abc22b2208fh
|
||||
DQ 3db10f69332e0dd4h
|
||||
DQ 3dce950de87ed257h
|
||||
DQ 3dd3f3443b626d69h
|
||||
DQ 3df45aeaa5363e57h
|
||||
DQ 3dc443683ce1bf0bh
|
||||
DQ 3df989cd60c6a511h
|
||||
DQ 3dfd626f201f2e9fh
|
||||
DQ 3de94f8bb8dabdcdh
|
||||
DQ 3e0088d8ef423015h
|
||||
DQ 3e080413a62b79adh
|
||||
DQ 3e059717c0eed3c4h
|
||||
DQ 3dad4a77add44902h
|
||||
DQ 3e0e763ff037300eh
|
||||
DQ 3de162d74706f6c3h
|
||||
DQ 3e0601cc1f4dbc14h
|
||||
DQ 3deaf3e051f6e5bfh
|
||||
DQ 3e097a0b1e1af3ebh
|
||||
DQ 3dc0a38970c002c7h
|
||||
DQ 3e102e000057c751h
|
||||
DQ 3e155b00eecd6e0eh
|
||||
DQ 3ddf86297003b5afh
|
||||
DQ 3e1057b9b336a36dh
|
||||
DQ 3e134bc84a06ea4fh
|
||||
DQ 3e1643da9ea1bcadh
|
||||
DQ 3e1d66a7b4f7ea2ah
|
||||
DQ 3df6b2e038f7fcefh
|
||||
DQ 3df3e954c670f088h
|
||||
DQ 3e047209093acab3h
|
||||
DQ 3e1d708fe7275da7h
|
||||
DQ 3e1fdf9e7771b9e7h
|
||||
DQ 3e0827bfa70a0660h
|
||||
DQ 3e1601cc1f4dbc14h
|
||||
DQ 3e0637f6106a5e5bh
|
||||
DQ 3e126a13f17c624bh
|
||||
DQ 3e093eb2ce80623ah
|
||||
DQ 3e1430d1e91594deh
|
||||
DQ 3e1d6b10108fa031h
|
||||
DQ 3e16879c0bbaf241h
|
||||
DQ 3dff08015ea6bc2bh
|
||||
DQ 3e29b63dcdc6676ch
|
||||
DQ 3e2b022cbcc4ab2ch
|
||||
DQ 3df917d07ddd6544h
|
||||
DQ 3e1540605703379eh
|
||||
DQ 3e0cd18b947a1b60h
|
||||
DQ 3e17ad65277ca97eh
|
||||
DQ 3e11884dc59f5fa9h
|
||||
DQ 3e1711c46006d082h
|
||||
DQ 3e2f092e3c3108f8h
|
||||
DQ 3e1714c5e32be13ah
|
||||
DQ 3e26bba7fd734f9ah
|
||||
DQ 3dfdf48fb5e08483h
|
||||
DQ 3e232f9bc74d0b95h
|
||||
DQ 3df973e848790c13h
|
||||
DQ 3e1eccbc08c6586eh
|
||||
DQ 3e2115e9f9524a98h
|
||||
DQ 3e2f1740593131b8h
|
||||
DQ 3e1bcf8b25643835h
|
||||
DQ 3e1f5fa81d8bed80h
|
||||
DQ 3e244a4df929d9e4h
|
||||
DQ 3e129820d8220c94h
|
||||
DQ 3e2a0b489304e309h
|
||||
DQ 3e1f4d56aba665feh
|
||||
DQ 3e210c9019365163h
|
||||
DQ 3df80f78fe592736h
|
||||
DQ 3e10528825c81ccah
|
||||
DQ 3de095537d6d746ah
|
||||
DQ 3e1827bfa70a0660h
|
||||
DQ 3e06b0a8ec45933ch
|
||||
DQ 3e105af81bf5dba9h
|
||||
DQ 3e17e2fa2655d515h
|
||||
DQ 3e0d59ecbfaee4bfh
|
||||
DQ 3e1d8b2fda683fa3h
|
||||
DQ 3e24b8ddfd3a3737h
|
||||
DQ 3e13827e61ae1204h
|
||||
DQ 3e2c8c7b49e90f9fh
|
||||
DQ 3e29eaf01597591dh
|
||||
DQ 3e19aaa66e317b36h
|
||||
DQ 3e2e725609720655h
|
||||
DQ 3e261c33fc7aac54h
|
||||
DQ 3e29662bcf61a252h
|
||||
DQ 3e1843c811c42730h
|
||||
DQ 3e2064bb0b5acb36h
|
||||
DQ 3e0a340c842701a4h
|
||||
DQ 3e1a8e55b58f79d6h
|
||||
DQ 3de92d219c5e9d9ah
|
||||
DQ 3e3f63e60d7ffd6ah
|
||||
DQ 3e2e9b0ed9516314h
|
||||
DQ 3e2923901962350ch
|
||||
DQ 3e326f8838785e81h
|
||||
DQ 3e3b5b6a4caba6afh
|
||||
DQ 3df0226adc8e761ch
|
||||
DQ 3e3c4ad7313a1aedh
|
||||
DQ 3e1564e87c738d17h
|
||||
DQ 3e338fecf18a6618h
|
||||
DQ 3e3d929ef5777666h
|
||||
DQ 3e39483bf08da0b8h
|
||||
DQ 3e3bdd0eeeaa5826h
|
||||
DQ 3e39c4dd590237bah
|
||||
DQ 3e1af3e9e0ebcac7h
|
||||
DQ 3e35ce5382270dach
|
||||
DQ 3e394f74532ab9bah
|
||||
DQ 3e07342795888654h
|
||||
DQ 3e0c5a000be34bf0h
|
||||
DQ 3e2711c46006d082h
|
||||
DQ 3e250025b4ed8cf8h
|
||||
DQ 3e2ed18bcef2d2a0h
|
||||
DQ 3e21282e0c0a7554h
|
||||
DQ 3e0d70f33359a7cah
|
||||
DQ 3e2b7f7e13a84025h
|
||||
DQ 3e33306ec321891eh
|
||||
DQ 3e3fc7f8038b7550h
|
||||
DQ 3e3eb0358cd71d64h
|
||||
DQ 3e3a76c822859474h
|
||||
DQ 3e3d0ec652de86e3h
|
||||
DQ 3e2fa4cce08658afh
|
||||
DQ 3e3b84a2d2c00a9eh
|
||||
DQ 3e20a5b0f2c25bd1h
|
||||
DQ 3e3dd660225bf699h
|
||||
DQ 3e08b10f859bf037h
|
||||
DQ 3e3e8823b590cbe1h
|
||||
DQ 3e361311f31e96f6h
|
||||
DQ 3e2e1f875ca20f9ah
|
||||
DQ 3e2c95724939b9a5h
|
||||
DQ 3e3805957a3e58e2h
|
||||
DQ 3e2ff126ea9f0334h
|
||||
DQ 3e3953f5598e5609h
|
||||
DQ 3e36c16ff856c448h
|
||||
DQ 3e24cb220ff261f4h
|
||||
DQ 3e35e120d53d53a2h
|
||||
DQ 3e3a527f6189f256h
|
||||
DQ 3e3856fcffd49c0fh
|
||||
DQ 3e300c2e8228d7dah
|
||||
DQ 3df113d09444dfe0h
|
||||
DQ 3e2510630eea59a6h
|
||||
DQ 3e262e780f32d711h
|
||||
DQ 3ded3ed91a10f8cfh
|
||||
DQ 3e23654a7e4bcd85h
|
||||
DQ 3e055b784980ad21h
|
||||
DQ 3e212f2dd4b16e64h
|
||||
DQ 3e37c4add939f50ch
|
||||
DQ 3e281784627180fch
|
||||
DQ 3dea5162c7e14961h
|
||||
DQ 3e310c9019365163h
|
||||
DQ 3e373c4d2ba17688h
|
||||
DQ 3e2ae8a5e0e93d81h
|
||||
DQ 3e2ab0c6f01621afh
|
||||
DQ 3e301e8b74dd5b66h
|
||||
DQ 3e2d206fecbb5494h
|
||||
DQ 3df0b48b724fcc00h
|
||||
DQ 3e3f831f0b61e229h
|
||||
DQ 3df81a97c407bcafh
|
||||
DQ 3e3e286c1ccbb7aah
|
||||
DQ 3e28630b49220a93h
|
||||
DQ 3dff0b15c1a22c5ch
|
||||
DQ 3e355445e71c0946h
|
||||
DQ 3e3be630f8066d85h
|
||||
DQ 3e2599dff0d96c39h
|
||||
DQ 3e36cc85b18fb081h
|
||||
DQ 3e34476d001ea8c8h
|
||||
DQ 3e373f889e16d31fh
|
||||
DQ 3e3357100d792a87h
|
||||
DQ 3e3bd179ae6101f6h
|
||||
DQ 3e0ca31056c3f6e2h
|
||||
DQ 3e3d2870629c08fbh
|
||||
DQ 3e3aba3880d2673fh
|
||||
DQ 3e2c3633cb297da6h
|
||||
DQ 3e21843899efea02h
|
||||
DQ 3e3bccc99d2008e6h
|
||||
DQ 3e38000544bdd350h
|
||||
DQ 3e2b91c226606ae1h
|
||||
DQ 3e2a7adf26b62bdfh
|
||||
DQ 3e18764fc8826ec9h
|
||||
DQ 3e1f4f3de50f68f0h
|
||||
DQ 3df760ca757995e3h
|
||||
DQ 3dfc667ed3805147h
|
||||
DQ 3e3733f6196adf6fh
|
||||
DQ 3e2fb710f33e836bh
|
||||
DQ 3e39886eba641013h
|
||||
DQ 3dfb5368d0af8c1ah
|
||||
DQ 3e358c691b8d2971h
|
||||
DQ 3dfe9465226d08fbh
|
||||
DQ 3e33587e063f0097h
|
||||
DQ 3e3618e702129f18h
|
||||
DQ 3e361c33fc7aac54h
|
||||
DQ 3e3f07a68408604ah
|
||||
DQ 3e3c34bfe4945421h
|
||||
DQ 3e38b1f00e41300bh
|
||||
DQ 3e3f434284d61b63h
|
||||
DQ 3e3a63095e397436h
|
||||
DQ 3e34428656b919deh
|
||||
DQ 3e36ca9201b2d9a6h
|
||||
DQ 3e2738823a2a931ch
|
||||
DQ 3e3c11880e179230h
|
||||
DQ 3e313ddc8d6d52feh
|
||||
DQ 3e33eed58922e917h
|
||||
DQ 3e295992846bdd50h
|
||||
DQ 3e0ddb4d5f2e278bh
|
||||
DQ 3df1a5f12a0635c4h
|
||||
DQ 3e4642f0882c3c34h
|
||||
DQ 3e2aee9ba7f6475eh
|
||||
DQ 3e264b7f834a60e4h
|
||||
DQ 3e290d42e243792eh
|
||||
DQ 3e4c272008134f01h
|
||||
DQ 3e4a782e16d6cf5bh
|
||||
DQ 3e44505c79da6648h
|
||||
DQ 3e4ca9d4ea4dcd21h
|
||||
DQ 3e297d3d627cd5bch
|
||||
DQ 3e20b15cf9bcaa13h
|
||||
DQ 3e315b2063cf76ddh
|
||||
DQ 3e2983e6f3aa2748h
|
||||
DQ 3e3f4c64f4ffe994h
|
||||
DQ 3e46beba7ce85a0fh
|
||||
DQ 3e3b9c69fd4ea6b8h
|
||||
DQ 3e2b6aa5835fa4abh
|
||||
DQ 3e43ccc3790fedd1h
|
||||
DQ 3e29c04cc4404fe0h
|
||||
DQ 3e40734b7a75d89dh
|
||||
DQ 3e1b4404c4e01612h
|
||||
DQ 3e40c565c2ce4894h
|
||||
DQ 3e33c71441d935cdh
|
||||
DQ 3d72a492556b3b4eh
|
||||
DQ 3e20fa090341dc43h
|
||||
DQ 3e2e8f7009e3d9f4h
|
||||
DQ 3e4b1bf68b048a45h
|
||||
DQ 3e3eee52dffaa956h
|
||||
DQ 3e456b0900e465bdh
|
||||
DQ 3e4d929ef5777666h
|
||||
DQ 3e486ea28637e260h
|
||||
DQ 3e4665aff10ca2f0h
|
||||
DQ 3e2f11fdaf48ec74h
|
||||
DQ 3e4cbe1b86a4d1c7h
|
||||
DQ 3e25b05bfea87665h
|
||||
DQ 3e41cec20a1a4a1dh
|
||||
DQ 3e41cd5f0a409b9fh
|
||||
DQ 3e453656c8265070h
|
||||
DQ 3e377ed835282260h
|
||||
DQ 3e2417bc3040b9d2h
|
||||
DQ 3e408eef7b79eff2h
|
||||
DQ 3e4dc76f39dc57e9h
|
||||
DQ 3e4c0493a70cf457h
|
||||
DQ 3e4a83d6cea5a60ch
|
||||
DQ 3e30d6700dc557bah
|
||||
DQ 3e44c96c12e8bd0ah
|
||||
DQ 3e3d2c1993e32315h
|
||||
DQ 3e22c721135f8242h
|
||||
DQ 3e279a3e4dda747dh
|
||||
DQ 3dfcf89f6941a72bh
|
||||
DQ 3e2149a702f10831h
|
||||
DQ 3e4ead4b7c8175dbh
|
||||
DQ 3e4e6930fe63e70ah
|
||||
DQ 3e41e106bed9ee2fh
|
||||
DQ 3e2d682b82f11c92h
|
||||
DQ 3e3a07f188dba47ch
|
||||
DQ 3e40f9342dc172f6h
|
||||
DQ 3e03ef3fde623e25h
|
||||
END
|
294
sdk/lib/crt/math/libm_sse2/log_128_lead_tail_table.asm
Normal file
294
sdk/lib/crt/math/libm_sse2/log_128_lead_tail_table.asm
Normal file
|
@ -0,0 +1,294 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines __log_128_lead and __log_128_tail tables
|
||||
;; Used by log and pow
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __log_128_lead
|
||||
__log_128_lead DD 000000000h
|
||||
DD 03bff0000h
|
||||
DD 03c7e0000h
|
||||
DD 03cbdc000h
|
||||
DD 03cfc1000h
|
||||
DD 03d1cf000h
|
||||
DD 03d3ba000h
|
||||
DD 03d5a1000h
|
||||
DD 03d785000h
|
||||
DD 03d8b2000h
|
||||
DD 03d9a0000h
|
||||
DD 03da8d000h
|
||||
DD 03db78000h
|
||||
DD 03dc61000h
|
||||
DD 03dd49000h
|
||||
DD 03de2f000h
|
||||
DD 03df13000h
|
||||
DD 03dff6000h
|
||||
DD 03e06b000h
|
||||
DD 03e0db000h
|
||||
DD 03e14a000h
|
||||
DD 03e1b8000h
|
||||
DD 03e226000h
|
||||
DD 03e293000h
|
||||
DD 03e2ff000h
|
||||
DD 03e36b000h
|
||||
DD 03e3d5000h
|
||||
DD 03e43f000h
|
||||
DD 03e4a9000h
|
||||
DD 03e511000h
|
||||
DD 03e579000h
|
||||
DD 03e5e1000h
|
||||
DD 03e647000h
|
||||
DD 03e6ae000h
|
||||
DD 03e713000h
|
||||
DD 03e778000h
|
||||
DD 03e7dc000h
|
||||
DD 03e820000h
|
||||
DD 03e851000h
|
||||
DD 03e882000h
|
||||
DD 03e8b3000h
|
||||
DD 03e8e4000h
|
||||
DD 03e914000h
|
||||
DD 03e944000h
|
||||
DD 03e974000h
|
||||
DD 03e9a3000h
|
||||
DD 03e9d3000h
|
||||
DD 03ea02000h
|
||||
DD 03ea30000h
|
||||
DD 03ea5f000h
|
||||
DD 03ea8d000h
|
||||
DD 03eabb000h
|
||||
DD 03eae8000h
|
||||
DD 03eb16000h
|
||||
DD 03eb43000h
|
||||
DD 03eb70000h
|
||||
DD 03eb9c000h
|
||||
DD 03ebc9000h
|
||||
DD 03ebf5000h
|
||||
DD 03ec21000h
|
||||
DD 03ec4d000h
|
||||
DD 03ec78000h
|
||||
DD 03eca3000h
|
||||
DD 03ecce000h
|
||||
DD 03ecf9000h
|
||||
DD 03ed24000h
|
||||
DD 03ed4e000h
|
||||
DD 03ed78000h
|
||||
DD 03eda2000h
|
||||
DD 03edcc000h
|
||||
DD 03edf5000h
|
||||
DD 03ee1e000h
|
||||
DD 03ee47000h
|
||||
DD 03ee70000h
|
||||
DD 03ee99000h
|
||||
DD 03eec1000h
|
||||
DD 03eeea000h
|
||||
DD 03ef12000h
|
||||
DD 03ef3a000h
|
||||
DD 03ef61000h
|
||||
DD 03ef89000h
|
||||
DD 03efb0000h
|
||||
DD 03efd7000h
|
||||
DD 03effe000h
|
||||
DD 03f012000h
|
||||
DD 03f025000h
|
||||
DD 03f039000h
|
||||
DD 03f04c000h
|
||||
DD 03f05f000h
|
||||
DD 03f072000h
|
||||
DD 03f084000h
|
||||
DD 03f097000h
|
||||
DD 03f0aa000h
|
||||
DD 03f0bc000h
|
||||
DD 03f0cf000h
|
||||
DD 03f0e1000h
|
||||
DD 03f0f4000h
|
||||
DD 03f106000h
|
||||
DD 03f118000h
|
||||
DD 03f12a000h
|
||||
DD 03f13c000h
|
||||
DD 03f14e000h
|
||||
DD 03f160000h
|
||||
DD 03f172000h
|
||||
DD 03f183000h
|
||||
DD 03f195000h
|
||||
DD 03f1a7000h
|
||||
DD 03f1b8000h
|
||||
DD 03f1c9000h
|
||||
DD 03f1db000h
|
||||
DD 03f1ec000h
|
||||
DD 03f1fd000h
|
||||
DD 03f20e000h
|
||||
DD 03f21f000h
|
||||
DD 03f230000h
|
||||
DD 03f241000h
|
||||
DD 03f252000h
|
||||
DD 03f263000h
|
||||
DD 03f273000h
|
||||
DD 03f284000h
|
||||
DD 03f295000h
|
||||
DD 03f2a5000h
|
||||
DD 03f2b5000h
|
||||
DD 03f2c6000h
|
||||
DD 03f2d6000h
|
||||
DD 03f2e6000h
|
||||
DD 03f2f7000h
|
||||
DD 03f307000h
|
||||
DD 03f317000h
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __log_128_tail
|
||||
__log_128_tail DD 000000000h
|
||||
DD 03429ac41h
|
||||
DD 035a8b0fch
|
||||
DD 0368d83eah
|
||||
DD 0361b0e78h
|
||||
DD 03687b9feh
|
||||
DD 03631ec65h
|
||||
DD 036dd7119h
|
||||
DD 035c30045h
|
||||
DD 0379b7751h
|
||||
DD 037ebcb0dh
|
||||
DD 037839f83h
|
||||
DD 037528ae5h
|
||||
DD 037a2eb18h
|
||||
DD 036da7495h
|
||||
DD 036a91eb7h
|
||||
DD 03783b715h
|
||||
DD 0371131dbh
|
||||
DD 0383f3e68h
|
||||
DD 038156a97h
|
||||
DD 038297c0fh
|
||||
DD 0387e100fh
|
||||
DD 03815b665h
|
||||
DD 037e5e3a1h
|
||||
DD 038183853h
|
||||
DD 035fe719dh
|
||||
DD 038448108h
|
||||
DD 038503290h
|
||||
DD 0373539e8h
|
||||
DD 0385e0ff1h
|
||||
DD 03864a740h
|
||||
DD 03786742dh
|
||||
DD 0387be3cdh
|
||||
DD 03685ad3eh
|
||||
DD 03803b715h
|
||||
DD 037adcbdch
|
||||
DD 0380c36afh
|
||||
DD 0371652d3h
|
||||
DD 038927139h
|
||||
DD 038c5fcd7h
|
||||
DD 038ae55d5h
|
||||
DD 03818c169h
|
||||
DD 038a0fde7h
|
||||
DD 038ad09efh
|
||||
DD 03862bae1h
|
||||
DD 038eecd4ch
|
||||
DD 03798aad2h
|
||||
DD 037421a1ah
|
||||
DD 038c5e10eh
|
||||
DD 037bf2aeeh
|
||||
DD 0382d872dh
|
||||
DD 037ee2e8ah
|
||||
DD 038dedfach
|
||||
DD 03802f2b9h
|
||||
DD 038481e9bh
|
||||
DD 0380eaa2bh
|
||||
DD 038ebfb5dh
|
||||
DD 038255fddh
|
||||
DD 038783b82h
|
||||
DD 03851da1eh
|
||||
DD 0374e1b05h
|
||||
DD 0388f439bh
|
||||
DD 038ca0e10h
|
||||
DD 038cac08bh
|
||||
DD 03891f65fh
|
||||
DD 0378121cbh
|
||||
DD 0386c9a9ah
|
||||
DD 038949923h
|
||||
DD 038777bcch
|
||||
DD 037b12d26h
|
||||
DD 038a6ced3h
|
||||
DD 038ebd3e6h
|
||||
DD 038fbe3cdh
|
||||
DD 038d785c2h
|
||||
DD 0387e7e00h
|
||||
DD 038f392c5h
|
||||
DD 037d40983h
|
||||
DD 038081a7ch
|
||||
DD 03784c3adh
|
||||
DD 038cce923h
|
||||
DD 0380f5fafh
|
||||
DD 03891fd38h
|
||||
DD 038ac47bch
|
||||
DD 03897042bh
|
||||
DD 0392952d2h
|
||||
DD 0396fced4h
|
||||
DD 037f97073h
|
||||
DD 0385e9eaeh
|
||||
DD 03865c84ah
|
||||
DD 038130ba3h
|
||||
DD 03979cf16h
|
||||
DD 03938cac9h
|
||||
DD 038c3d2f4h
|
||||
DD 039755dech
|
||||
DD 038e6b467h
|
||||
DD 0395c0fb8h
|
||||
DD 0383ebce0h
|
||||
DD 038dcd192h
|
||||
DD 039186bdfh
|
||||
DD 0392de74ch
|
||||
DD 0392f0944h
|
||||
DD 0391bff61h
|
||||
DD 038e9ed44h
|
||||
DD 038686dc8h
|
||||
DD 0396b99a7h
|
||||
DD 039099c89h
|
||||
DD 037a27673h
|
||||
DD 0390bdaa3h
|
||||
DD 0397069abh
|
||||
DD 0388449ffh
|
||||
DD 039013538h
|
||||
DD 0392dc268h
|
||||
DD 03947f423h
|
||||
DD 0394ff17ch
|
||||
DD 03945e10eh
|
||||
DD 03929e8f5h
|
||||
DD 038f85db0h
|
||||
DD 038735f99h
|
||||
DD 0396c08dbh
|
||||
DD 03909e600h
|
||||
DD 037b4996fh
|
||||
DD 0391233cch
|
||||
DD 0397cead9h
|
||||
DD 038adb5cdh
|
||||
DD 03920261ah
|
||||
DD 03958ee36h
|
||||
DD 035aa4905h
|
||||
DD 037cbd11eh
|
||||
DD 03805fdf4h
|
||||
END
|
554
sdk/lib/crt/math/libm_sse2/log_256_lead_tail_table.asm
Normal file
554
sdk/lib/crt/math/libm_sse2/log_256_lead_tail_table.asm
Normal file
|
@ -0,0 +1,554 @@
|
|||
;;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines __log_256_lead and __log_256_tail tables
|
||||
;; Used by log and pow
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __log_256_lead
|
||||
__log_256_lead DQ 0000000000000000h
|
||||
DQ 3f6ff00aa0000000h
|
||||
DQ 3f7fe02a60000000h
|
||||
DQ 3f87dc4750000000h
|
||||
DQ 3f8fc0a8b0000000h
|
||||
DQ 3f93cea440000000h
|
||||
DQ 3f97b91b00000000h
|
||||
DQ 3f9b9fc020000000h
|
||||
DQ 3f9f829b00000000h
|
||||
DQ 3fa1b0d980000000h
|
||||
DQ 3fa39e87b0000000h
|
||||
DQ 3fa58a5ba0000000h
|
||||
DQ 3fa77458f0000000h
|
||||
DQ 3fa95c8300000000h
|
||||
DQ 3fab42dd70000000h
|
||||
DQ 3fad276b80000000h
|
||||
DQ 3faf0a30c0000000h
|
||||
DQ 3fb0759830000000h
|
||||
DQ 3fb16536e0000000h
|
||||
DQ 3fb253f620000000h
|
||||
DQ 3fb341d790000000h
|
||||
DQ 3fb42edcb0000000h
|
||||
DQ 3fb51b0730000000h
|
||||
DQ 3fb60658a0000000h
|
||||
DQ 3fb6f0d280000000h
|
||||
DQ 3fb7da7660000000h
|
||||
DQ 3fb8c345d0000000h
|
||||
DQ 3fb9ab4240000000h
|
||||
DQ 3fba926d30000000h
|
||||
DQ 3fbb78c820000000h
|
||||
DQ 3fbc5e5480000000h
|
||||
DQ 3fbd4313d0000000h
|
||||
DQ 3fbe270760000000h
|
||||
DQ 3fbf0a30c0000000h
|
||||
DQ 3fbfec9130000000h
|
||||
DQ 3fc0671510000000h
|
||||
DQ 3fc0d77e70000000h
|
||||
DQ 3fc1478580000000h
|
||||
DQ 3fc1b72ad0000000h
|
||||
DQ 3fc2266f10000000h
|
||||
DQ 3fc29552f0000000h
|
||||
DQ 3fc303d710000000h
|
||||
DQ 3fc371fc20000000h
|
||||
DQ 3fc3dfc2b0000000h
|
||||
DQ 3fc44d2b60000000h
|
||||
DQ 3fc4ba36f0000000h
|
||||
DQ 3fc526e5e0000000h
|
||||
DQ 3fc59338d0000000h
|
||||
DQ 3fc5ff3070000000h
|
||||
DQ 3fc66acd40000000h
|
||||
DQ 3fc6d60fe0000000h
|
||||
DQ 3fc740f8f0000000h
|
||||
DQ 3fc7ab8900000000h
|
||||
DQ 3fc815c0a0000000h
|
||||
DQ 3fc87fa060000000h
|
||||
DQ 3fc8e928d0000000h
|
||||
DQ 3fc9525a90000000h
|
||||
DQ 3fc9bb3620000000h
|
||||
DQ 3fca23bc10000000h
|
||||
DQ 3fca8becf0000000h
|
||||
DQ 3fcaf3c940000000h
|
||||
DQ 3fcb5b5190000000h
|
||||
DQ 3fcbc28670000000h
|
||||
DQ 3fcc296850000000h
|
||||
DQ 3fcc8ff7c0000000h
|
||||
DQ 3fccf63540000000h
|
||||
DQ 3fcd5c2160000000h
|
||||
DQ 3fcdc1bca0000000h
|
||||
DQ 3fce270760000000h
|
||||
DQ 3fce8c0250000000h
|
||||
DQ 3fcef0adc0000000h
|
||||
DQ 3fcf550a50000000h
|
||||
DQ 3fcfb91860000000h
|
||||
DQ 3fd00e6c40000000h
|
||||
DQ 3fd0402590000000h
|
||||
DQ 3fd071b850000000h
|
||||
DQ 3fd0a324e0000000h
|
||||
DQ 3fd0d46b50000000h
|
||||
DQ 3fd1058bf0000000h
|
||||
DQ 3fd1368700000000h
|
||||
DQ 3fd1675ca0000000h
|
||||
DQ 3fd1980d20000000h
|
||||
DQ 3fd1c898c0000000h
|
||||
DQ 3fd1f8ff90000000h
|
||||
DQ 3fd22941f0000000h
|
||||
DQ 3fd2596010000000h
|
||||
DQ 3fd2895a10000000h
|
||||
DQ 3fd2b93030000000h
|
||||
DQ 3fd2e8e2b0000000h
|
||||
DQ 3fd31871c0000000h
|
||||
DQ 3fd347dd90000000h
|
||||
DQ 3fd3772660000000h
|
||||
DQ 3fd3a64c50000000h
|
||||
DQ 3fd3d54fa0000000h
|
||||
DQ 3fd4043080000000h
|
||||
DQ 3fd432ef20000000h
|
||||
DQ 3fd4618bc0000000h
|
||||
DQ 3fd4900680000000h
|
||||
DQ 3fd4be5f90000000h
|
||||
DQ 3fd4ec9730000000h
|
||||
DQ 3fd51aad80000000h
|
||||
DQ 3fd548a2c0000000h
|
||||
DQ 3fd5767710000000h
|
||||
DQ 3fd5a42ab0000000h
|
||||
DQ 3fd5d1bdb0000000h
|
||||
DQ 3fd5ff3070000000h
|
||||
DQ 3fd62c82f0000000h
|
||||
DQ 3fd659b570000000h
|
||||
DQ 3fd686c810000000h
|
||||
DQ 3fd6b3bb20000000h
|
||||
DQ 3fd6e08ea0000000h
|
||||
DQ 3fd70d42e0000000h
|
||||
DQ 3fd739d7f0000000h
|
||||
DQ 3fd7664e10000000h
|
||||
DQ 3fd792a550000000h
|
||||
DQ 3fd7bede00000000h
|
||||
DQ 3fd7eaf830000000h
|
||||
DQ 3fd816f410000000h
|
||||
DQ 3fd842d1d0000000h
|
||||
DQ 3fd86e9190000000h
|
||||
DQ 3fd89a3380000000h
|
||||
DQ 3fd8c5b7c0000000h
|
||||
DQ 3fd8f11e80000000h
|
||||
DQ 3fd91c67e0000000h
|
||||
DQ 3fd9479410000000h
|
||||
DQ 3fd972a340000000h
|
||||
DQ 3fd99d9580000000h
|
||||
DQ 3fd9c86b00000000h
|
||||
DQ 3fd9f323e0000000h
|
||||
DQ 3fda1dc060000000h
|
||||
DQ 3fda484090000000h
|
||||
DQ 3fda72a490000000h
|
||||
DQ 3fda9cec90000000h
|
||||
DQ 3fdac718c0000000h
|
||||
DQ 3fdaf12930000000h
|
||||
DQ 3fdb1b1e00000000h
|
||||
DQ 3fdb44f770000000h
|
||||
DQ 3fdb6eb590000000h
|
||||
DQ 3fdb985890000000h
|
||||
DQ 3fdbc1e080000000h
|
||||
DQ 3fdbeb4d90000000h
|
||||
DQ 3fdc149ff0000000h
|
||||
DQ 3fdc3dd7a0000000h
|
||||
DQ 3fdc66f4e0000000h
|
||||
DQ 3fdc8ff7c0000000h
|
||||
DQ 3fdcb8e070000000h
|
||||
DQ 3fdce1af00000000h
|
||||
DQ 3fdd0a63a0000000h
|
||||
DQ 3fdd32fe70000000h
|
||||
DQ 3fdd5b7f90000000h
|
||||
DQ 3fdd83e720000000h
|
||||
DQ 3fddac3530000000h
|
||||
DQ 3fddd46a00000000h
|
||||
DQ 3fddfc8590000000h
|
||||
DQ 3fde248810000000h
|
||||
DQ 3fde4c71a0000000h
|
||||
DQ 3fde744260000000h
|
||||
DQ 3fde9bfa60000000h
|
||||
DQ 3fdec399d0000000h
|
||||
DQ 3fdeeb20c0000000h
|
||||
DQ 3fdf128f50000000h
|
||||
DQ 3fdf39e5b0000000h
|
||||
DQ 3fdf6123f0000000h
|
||||
DQ 3fdf884a30000000h
|
||||
DQ 3fdfaf5880000000h
|
||||
DQ 3fdfd64f20000000h
|
||||
DQ 3fdffd2e00000000h
|
||||
DQ 3fe011fab0000000h
|
||||
DQ 3fe02552a0000000h
|
||||
DQ 3fe0389ee0000000h
|
||||
DQ 3fe04bdf90000000h
|
||||
DQ 3fe05f14b0000000h
|
||||
DQ 3fe0723e50000000h
|
||||
DQ 3fe0855c80000000h
|
||||
DQ 3fe0986f40000000h
|
||||
DQ 3fe0ab76b0000000h
|
||||
DQ 3fe0be72e0000000h
|
||||
DQ 3fe0d163c0000000h
|
||||
DQ 3fe0e44980000000h
|
||||
DQ 3fe0f72410000000h
|
||||
DQ 3fe109f390000000h
|
||||
DQ 3fe11cb810000000h
|
||||
DQ 3fe12f7190000000h
|
||||
DQ 3fe1422020000000h
|
||||
DQ 3fe154c3d0000000h
|
||||
DQ 3fe1675ca0000000h
|
||||
DQ 3fe179eab0000000h
|
||||
DQ 3fe18c6e00000000h
|
||||
DQ 3fe19ee6b0000000h
|
||||
DQ 3fe1b154b0000000h
|
||||
DQ 3fe1c3b810000000h
|
||||
DQ 3fe1d610f0000000h
|
||||
DQ 3fe1e85f50000000h
|
||||
DQ 3fe1faa340000000h
|
||||
DQ 3fe20cdcd0000000h
|
||||
DQ 3fe21f0bf0000000h
|
||||
DQ 3fe23130d0000000h
|
||||
DQ 3fe2434b60000000h
|
||||
DQ 3fe2555bc0000000h
|
||||
DQ 3fe2676200000000h
|
||||
DQ 3fe2795e10000000h
|
||||
DQ 3fe28b5000000000h
|
||||
DQ 3fe29d37f0000000h
|
||||
DQ 3fe2af15f0000000h
|
||||
DQ 3fe2c0e9e0000000h
|
||||
DQ 3fe2d2b400000000h
|
||||
DQ 3fe2e47430000000h
|
||||
DQ 3fe2f62a90000000h
|
||||
DQ 3fe307d730000000h
|
||||
DQ 3fe3197a00000000h
|
||||
DQ 3fe32b1330000000h
|
||||
DQ 3fe33ca2b0000000h
|
||||
DQ 3fe34e2890000000h
|
||||
DQ 3fe35fa4e0000000h
|
||||
DQ 3fe37117b0000000h
|
||||
DQ 3fe38280f0000000h
|
||||
DQ 3fe393e0d0000000h
|
||||
DQ 3fe3a53730000000h
|
||||
DQ 3fe3b68440000000h
|
||||
DQ 3fe3c7c7f0000000h
|
||||
DQ 3fe3d90260000000h
|
||||
DQ 3fe3ea3390000000h
|
||||
DQ 3fe3fb5b80000000h
|
||||
DQ 3fe40c7a40000000h
|
||||
DQ 3fe41d8fe0000000h
|
||||
DQ 3fe42e9c60000000h
|
||||
DQ 3fe43f9fe0000000h
|
||||
DQ 3fe4509a50000000h
|
||||
DQ 3fe4618bc0000000h
|
||||
DQ 3fe4727430000000h
|
||||
DQ 3fe48353d0000000h
|
||||
DQ 3fe4942a80000000h
|
||||
DQ 3fe4a4f850000000h
|
||||
DQ 3fe4b5bd60000000h
|
||||
DQ 3fe4c679a0000000h
|
||||
DQ 3fe4d72d30000000h
|
||||
DQ 3fe4e7d810000000h
|
||||
DQ 3fe4f87a30000000h
|
||||
DQ 3fe50913c0000000h
|
||||
DQ 3fe519a4c0000000h
|
||||
DQ 3fe52a2d20000000h
|
||||
DQ 3fe53aad00000000h
|
||||
DQ 3fe54b2460000000h
|
||||
DQ 3fe55b9350000000h
|
||||
DQ 3fe56bf9d0000000h
|
||||
DQ 3fe57c57f0000000h
|
||||
DQ 3fe58cadb0000000h
|
||||
DQ 3fe59cfb20000000h
|
||||
DQ 3fe5ad4040000000h
|
||||
DQ 3fe5bd7d30000000h
|
||||
DQ 3fe5cdb1d0000000h
|
||||
DQ 3fe5ddde50000000h
|
||||
DQ 3fe5ee02a0000000h
|
||||
DQ 3fe5fe1ed0000000h
|
||||
DQ 3fe60e32f0000000h
|
||||
DQ 3fe61e3ef0000000h
|
||||
DQ 3fe62e42e0000000h
|
||||
DQ 0000000000000000h
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __log_256_tail
|
||||
__log_256_tail DQ 0000000000000000h
|
||||
DQ 3db5885e0250435ah
|
||||
DQ 3de620cf11f86ed2h
|
||||
DQ 3dff0214edba4a25h
|
||||
DQ 3dbf807c79f3db4eh
|
||||
DQ 3dea352ba779a52bh
|
||||
DQ 3dff56c46aa49fd5h
|
||||
DQ 3dfebe465fef5196h
|
||||
DQ 3e0cf0660099f1f8h
|
||||
DQ 3e1247b2ff85945dh
|
||||
DQ 3e13fd7abf5202b6h
|
||||
DQ 3e1f91c9a918d51eh
|
||||
DQ 3e08cb73f118d3cah
|
||||
DQ 3e1d91c7d6fad074h
|
||||
DQ 3de1971bec28d14ch
|
||||
DQ 3e15b616a423c78ah
|
||||
DQ 3da162a6617cc971h
|
||||
DQ 3e166391c4c06d29h
|
||||
DQ 3e2d46f5c1d0c4b8h
|
||||
DQ 3e2e14282df1f6d3h
|
||||
DQ 3e186f47424a660dh
|
||||
DQ 3e2d4c8de077753eh
|
||||
DQ 3e2e0c307ed24f1ch
|
||||
DQ 3e226ea18763bdd3h
|
||||
DQ 3e25cad69737c933h
|
||||
DQ 3e2af62599088901h
|
||||
DQ 3e18c66c83d6b2d0h
|
||||
DQ 3e1880ceb36fb30fh
|
||||
DQ 3e2495aac6ca17a4h
|
||||
DQ 3e2761db4210878ch
|
||||
DQ 3e2eb78e862bac2fh
|
||||
DQ 3e19b2cd75790dd9h
|
||||
DQ 3e2c55e5cbd3d50fh
|
||||
DQ 3db162a6617cc971h
|
||||
DQ 3dfdbeabaaa2e519h
|
||||
DQ 3e1652cb7150c647h
|
||||
DQ 3e39a11cb2cd2ee2h
|
||||
DQ 3e219d0ab1a28813h
|
||||
DQ 3e24bd9e80a41811h
|
||||
DQ 3e3214b596faa3dfh
|
||||
DQ 3e303fea46980bb8h
|
||||
DQ 3e31c8ffa5fd28c7h
|
||||
DQ 3dce8f743bcd96c5h
|
||||
DQ 3dfd98c5395315c6h
|
||||
DQ 3e3996fa3ccfa7b2h
|
||||
DQ 3e1cd2af2ad13037h
|
||||
DQ 3e1d0da1bd17200eh
|
||||
DQ 3e3330410ba68b75h
|
||||
DQ 3df4f27a790e7c41h
|
||||
DQ 3e13956a86f6ff1bh
|
||||
DQ 3e2c6748723551d9h
|
||||
DQ 3e2500de9326cdfch
|
||||
DQ 3e1086c848df1b59h
|
||||
DQ 3e04357ead6836ffh
|
||||
DQ 3e24832442408024h
|
||||
DQ 3e3d10da8154b13dh
|
||||
DQ 3e39e8ad68ec8260h
|
||||
DQ 3e3cfbf706abaf18h
|
||||
DQ 3e3fc56ac6326e23h
|
||||
DQ 3e39105e3185cf21h
|
||||
DQ 3e3d017fe5b19cc0h
|
||||
DQ 3e3d1f6b48dd13feh
|
||||
DQ 3e20b63358a7e73ah
|
||||
DQ 3e263063028c211ch
|
||||
DQ 3e2e6a6886b09760h
|
||||
DQ 3e3c138bb891cd03h
|
||||
DQ 3e369f7722b7221ah
|
||||
DQ 3df57d8fac1a628ch
|
||||
DQ 3e3c55e5cbd3d50fh
|
||||
DQ 3e1552d2ff48fe2eh
|
||||
DQ 3e37b8b26ca431bch
|
||||
DQ 3e292decdc1c5f6dh
|
||||
DQ 3e3abc7c551aaa8ch
|
||||
DQ 3e36b540731a354bh
|
||||
DQ 3e32d341036b89efh
|
||||
DQ 3e4f9ab21a3a2e0fh
|
||||
DQ 3e239c871afb9fbdh
|
||||
DQ 3e3e6add2c81f640h
|
||||
DQ 3e435c95aa313f41h
|
||||
DQ 3e249d4582f6cc53h
|
||||
DQ 3e47574c1c07398fh
|
||||
DQ 3e4ba846dece9e8dh
|
||||
DQ 3e16999fafbc68e7h
|
||||
DQ 3e4c9145e51b0103h
|
||||
DQ 3e479ef2cb44850ah
|
||||
DQ 3e0beec73de11275h
|
||||
DQ 3e2ef4351af5a498h
|
||||
DQ 3e45713a493b4a50h
|
||||
DQ 3e45c23a61385992h
|
||||
DQ 3e42a88309f57299h
|
||||
DQ 3e4530faa9ac8aceh
|
||||
DQ 3e25fec2d792a758h
|
||||
DQ 3e35a517a71cbcd7h
|
||||
DQ 3e3707dc3e1cd9a3h
|
||||
DQ 3e3a1a9f8ef43049h
|
||||
DQ 3e4409d0276b3674h
|
||||
DQ 3e20e2f613e85bd9h
|
||||
DQ 3df0027433001e5fh
|
||||
DQ 3e35dde2836d3265h
|
||||
DQ 3e2300134d7aaf04h
|
||||
DQ 3e3cb7e0b42724f5h
|
||||
DQ 3e2d6e93167e6308h
|
||||
DQ 3e3d1569b1526adbh
|
||||
DQ 3e0e99fc338a1a41h
|
||||
DQ 3e4eb01394a11b1ch
|
||||
DQ 3e04f27a790e7c41h
|
||||
DQ 3e25ce3ca97b7af9h
|
||||
DQ 3e281f0f940ed857h
|
||||
DQ 3e4d36295d88857ch
|
||||
DQ 3e21aca1ec4af526h
|
||||
DQ 3e445743c7182726h
|
||||
DQ 3e23c491aead337eh
|
||||
DQ 3e3aef401a738931h
|
||||
DQ 3e21cede76092a29h
|
||||
DQ 3e4fba8f44f82bb4h
|
||||
DQ 3e446f5f7f3c3e1ah
|
||||
DQ 3e47055f86c9674bh
|
||||
DQ 3e4b41a92b6b6e1ah
|
||||
DQ 3e443d162e927628h
|
||||
DQ 3e4466174013f9b1h
|
||||
DQ 3e3b05096ad69c62h
|
||||
DQ 3e40b169150faa58h
|
||||
DQ 3e3cd98b1df85da7h
|
||||
DQ 3e468b507b0f8fa8h
|
||||
DQ 3e48422df57499bah
|
||||
DQ 3e11351586970274h
|
||||
DQ 3e117e08acba92eeh
|
||||
DQ 3e26e04314dd0229h
|
||||
DQ 3e497f3097e56d1ah
|
||||
DQ 3e3356e655901286h
|
||||
DQ 3e0cb761457f94d6h
|
||||
DQ 3e39af67a85a9dach
|
||||
DQ 3e453410931a909fh
|
||||
DQ 3e22c587206058f5h
|
||||
DQ 3e223bc358899c22h
|
||||
DQ 3e4d7bf8b6d223cbh
|
||||
DQ 3e47991ec5197ddbh
|
||||
DQ 3e4a79e6bb3a9219h
|
||||
DQ 3e3a4c43ed663ec5h
|
||||
DQ 3e461b5a1484f438h
|
||||
DQ 3e4b4e36f7ef0c3ah
|
||||
DQ 3e115f026acd0d1bh
|
||||
DQ 3e3f36b535cecf05h
|
||||
DQ 3e2ffb7fbf3eb5c6h
|
||||
DQ 3e3e6a6886b09760h
|
||||
DQ 3e3135eb27f5bbc3h
|
||||
DQ 3e470be7d6f6fa57h
|
||||
DQ 3e4ce43cc84ab338h
|
||||
DQ 3e4c01d7aac3bd91h
|
||||
DQ 3e45c58d07961060h
|
||||
DQ 3e3628bcf941456eh
|
||||
DQ 3e4c58b2a8461cd2h
|
||||
DQ 3e33071282fb989ah
|
||||
DQ 3e420dab6a80f09ch
|
||||
DQ 3e44f8d84c397b1eh
|
||||
DQ 3e40d0ee08599e48h
|
||||
DQ 3e1d68787e37da36h
|
||||
DQ 3e366187d591bafch
|
||||
DQ 3e22346600bae772h
|
||||
DQ 3e390377d0d61b8eh
|
||||
DQ 3e4f5e0dd966b907h
|
||||
DQ 3e49023cb79a00e2h
|
||||
DQ 3e44e05158c28ad8h
|
||||
DQ 3e3bfa7b08b18ae4h
|
||||
DQ 3e4ef1e63db35f67h
|
||||
DQ 3e0ec2ae39493d4fh
|
||||
DQ 3e40afe930ab2fa0h
|
||||
DQ 3e225ff8a1810dd4h
|
||||
DQ 3e469743fb1a71a5h
|
||||
DQ 3e5f9cc676785571h
|
||||
DQ 3e5b524da4cbf982h
|
||||
DQ 3e5a4c8b381535b8h
|
||||
DQ 3e5839be809caf2ch
|
||||
DQ 3e50968a1cb82c13h
|
||||
DQ 3e5eae6a41723fb5h
|
||||
DQ 3e5d9c29a380a4dbh
|
||||
DQ 3e4094aa0ada625eh
|
||||
DQ 3e5973ad6fc108cah
|
||||
DQ 3e4747322fdbab97h
|
||||
DQ 3e593692fa9d4221h
|
||||
DQ 3e5c5a992dfbc7d9h
|
||||
DQ 3e4e1f33e102387ah
|
||||
DQ 3e464fbef14c048ch
|
||||
DQ 3e4490f513ca5e3bh
|
||||
DQ 3e37a6af4d4c799dh
|
||||
DQ 3e57574c1c07398fh
|
||||
DQ 3e57b133417f8c1ch
|
||||
DQ 3e5feb9e0c176514h
|
||||
DQ 3e419f25bb3172f7h
|
||||
DQ 3e45f68a7bbfb852h
|
||||
DQ 3e5ee278497929f1h
|
||||
DQ 3e5ccee006109d58h
|
||||
DQ 3e5ce081a07bd8b3h
|
||||
DQ 3e570e12981817b8h
|
||||
DQ 3e292ab6d93503d0h
|
||||
DQ 3e58cb7dd7c3b61eh
|
||||
DQ 3e4efafd0a0b78dah
|
||||
DQ 3e5e907267c4288eh
|
||||
DQ 3e5d31ef96780875h
|
||||
DQ 3e23430dfcd2ad50h
|
||||
DQ 3e344d88d75bc1f9h
|
||||
DQ 3e5bec0f055e04fch
|
||||
DQ 3e5d85611590b9adh
|
||||
DQ 3df320568e583229h
|
||||
DQ 3e5a891d1772f538h
|
||||
DQ 3e22edc9dabba74dh
|
||||
DQ 3e4b9009a1015086h
|
||||
DQ 3e52a12a8c5b1a19h
|
||||
DQ 3e3a7885f0fdac85h
|
||||
DQ 3e5f4ffcd43ac691h
|
||||
DQ 3e52243ae2640aadh
|
||||
DQ 3e546513299035d3h
|
||||
DQ 3e5b39c3a62dd725h
|
||||
DQ 3e5ba6dd40049f51h
|
||||
DQ 3e451d1ed7177409h
|
||||
DQ 3e5cb0f2fd7f5216h
|
||||
DQ 3e3ab150cd4e2213h
|
||||
DQ 3e5cfd7bf3193844h
|
||||
DQ 3e53fff8455f1dbdh
|
||||
DQ 3e5fee640b905fc9h
|
||||
DQ 3e54e2adf548084ch
|
||||
DQ 3e3b597adc1ecdd2h
|
||||
DQ 3e4345bd096d3a75h
|
||||
DQ 3e5101b9d2453c8bh
|
||||
DQ 3e508ce55cc8c979h
|
||||
DQ 3e5bbf017e595f71h
|
||||
DQ 3e37ce733bd393dch
|
||||
DQ 3e233bb0a503f8a1h
|
||||
DQ 3e30e2f613e85bd9h
|
||||
DQ 3e5e67555a635b3ch
|
||||
DQ 3e2ea88df73d5e8bh
|
||||
DQ 3e3d17e03bda18a8h
|
||||
DQ 3e5b607d76044f7eh
|
||||
DQ 3e52adc4e71bc2fch
|
||||
DQ 3e5f99dc7362d1d9h
|
||||
DQ 3e5473fa008e6a6ah
|
||||
DQ 3e2b75bb09cb0985h
|
||||
DQ 3e5ea04dd10b9abah
|
||||
DQ 3e5802d0d6979674h
|
||||
DQ 3e174688ccd99094h
|
||||
DQ 3e496f16abb9df22h
|
||||
DQ 3e46e66df2aa374fh
|
||||
DQ 3e4e66525ea4550ah
|
||||
DQ 3e42d02f34f20cbdh
|
||||
DQ 3e46cfce65047188h
|
||||
DQ 3e39b78c842d58b8h
|
||||
DQ 3e4735e624c24bc9h
|
||||
DQ 3e47eba1f7dd1adfh
|
||||
DQ 3e586b3e59f65355h
|
||||
DQ 3e1ce38e637f1b4dh
|
||||
DQ 3e58d82ec919edc7h
|
||||
DQ 3e4c52648ddcfa37h
|
||||
DQ 3e52482ceae1ac12h
|
||||
DQ 3e55a312311aba4fh
|
||||
DQ 3e411e236329f225h
|
||||
DQ 3e5b48c8cd2f246ch
|
||||
DQ 3e6efa39ef35793ch
|
||||
DQ 0000000000000000h
|
||||
|
||||
END
|
164
sdk/lib/crt/math/libm_sse2/log_F_inv_dword_table.asm
Normal file
164
sdk/lib/crt/math/libm_sse2/log_F_inv_dword_table.asm
Normal file
|
@ -0,0 +1,164 @@
|
|||
;;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines __log_F_inv_dword
|
||||
;; Used in log10f and logf
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
;; Table layout: 129 consecutive DWORDs, each the bit pattern of an IEEE-754
;; single-precision value.  Entry j (j = 0..128) is 256/(128+j) rounded to
;; single precision, i.e. the reciprocal of F = (128+j)/256, so the values
;; fall from 2.0 (entry 0) to 1.0 (entry 128).
ALIGN 16
|
||||
PUBLIC __log_F_inv_dword
|
||||
__log_F_inv_dword DD 40000000h ; entry 0: 2.0
|
||||
DD 3ffe03f8h
|
||||
DD 3ffc0fc1h
|
||||
DD 3ffa232dh
|
||||
DD 3ff83e10h
|
||||
DD 3ff6603eh
|
||||
DD 3ff4898dh
|
||||
DD 3ff2b9d6h
|
||||
DD 3ff0f0f1h
|
||||
DD 3fef2eb7h
|
||||
DD 3fed7304h
|
||||
DD 3febbdb3h
|
||||
DD 3fea0ea1h
|
||||
DD 3fe865ach
|
||||
DD 3fe6c2b4h
|
||||
DD 3fe52598h
|
||||
DD 3fe38e39h
|
||||
DD 3fe1fc78h
|
||||
DD 3fe07038h
|
||||
DD 3fdee95ch
|
||||
DD 3fdd67c9h
|
||||
DD 3fdbeb62h
|
||||
DD 3fda740eh
|
||||
DD 3fd901b2h
|
||||
DD 3fd79436h
|
||||
DD 3fd62b81h
|
||||
DD 3fd4c77bh
|
||||
DD 3fd3680dh
|
||||
DD 3fd20d21h
|
||||
DD 3fd0b6a0h
|
||||
DD 3fcf6475h
|
||||
DD 3fce168ah
|
||||
DD 3fcccccdh
|
||||
DD 3fcb8728h
|
||||
DD 3fca4588h
|
||||
DD 3fc907dah
|
||||
DD 3fc7ce0ch
|
||||
DD 3fc6980ch
|
||||
DD 3fc565c8h
|
||||
DD 3fc43730h
|
||||
DD 3fc30c31h
|
||||
DD 3fc1e4bch
|
||||
DD 3fc0c0c1h
|
||||
DD 3fbfa030h
|
||||
DD 3fbe82fah
|
||||
DD 3fbd6910h
|
||||
DD 3fbc5264h
|
||||
DD 3fbb3ee7h
|
||||
DD 3fba2e8ch
|
||||
DD 3fb92144h
|
||||
DD 3fb81703h
|
||||
DD 3fb70fbbh
|
||||
DD 3fb60b61h
|
||||
DD 3fb509e7h
|
||||
DD 3fb40b41h
|
||||
DD 3fb30f63h
|
||||
DD 3fb21643h
|
||||
DD 3fb11fd4h
|
||||
DD 3fb02c0bh
|
||||
DD 3faf3adeh
|
||||
DD 3fae4c41h
|
||||
DD 3fad602bh
|
||||
DD 3fac7692h
|
||||
DD 3fab8f6ah
|
||||
DD 3faaaaabh
|
||||
DD 3fa9c84ah
|
||||
DD 3fa8e83fh
|
||||
DD 3fa80a81h
|
||||
DD 3fa72f05h
|
||||
DD 3fa655c4h
|
||||
DD 3fa57eb5h
|
||||
DD 3fa4a9cfh
|
||||
DD 3fa3d70ah
|
||||
DD 3fa3065eh
|
||||
DD 3fa237c3h
|
||||
DD 3fa16b31h
|
||||
DD 3fa0a0a1h
|
||||
DD 3f9fd80ah
|
||||
DD 3f9f1166h
|
||||
DD 3f9e4cadh
|
||||
DD 3f9d89d9h
|
||||
DD 3f9cc8e1h
|
||||
DD 3f9c09c1h
|
||||
DD 3f9b4c70h
|
||||
DD 3f9a90e8h
|
||||
DD 3f99d723h
|
||||
DD 3f991f1ah
|
||||
DD 3f9868c8h
|
||||
DD 3f97b426h
|
||||
DD 3f97012eh
|
||||
DD 3f964fdah
|
||||
DD 3f95a025h
|
||||
DD 3f94f209h
|
||||
DD 3f944581h
|
||||
DD 3f939a86h
|
||||
DD 3f92f114h
|
||||
DD 3f924925h
|
||||
DD 3f91a2b4h
|
||||
DD 3f90fdbch
|
||||
DD 3f905a38h
|
||||
DD 3f8fb824h
|
||||
DD 3f8f177ah
|
||||
DD 3f8e7835h
|
||||
DD 3f8dda52h
|
||||
DD 3f8d3dcbh
|
||||
DD 3f8ca29ch
|
||||
DD 3f8c08c1h
|
||||
DD 3f8b7034h
|
||||
DD 3f8ad8f3h
|
||||
DD 3f8a42f8h
|
||||
DD 3f89ae41h
|
||||
DD 3f891ac7h
|
||||
DD 3f888889h
|
||||
DD 3f87f781h
|
||||
DD 3f8767abh
|
||||
DD 3f86d905h
|
||||
DD 3f864b8ah
|
||||
DD 3f85bf37h
|
||||
DD 3f853408h
|
||||
DD 3f84a9fah
|
||||
DD 3f842108h
|
||||
DD 3f839930h
|
||||
DD 3f83126fh
|
||||
DD 3f828cc0h
|
||||
DD 3f820821h
|
||||
DD 3f81848eh
|
||||
DD 3f810204h
|
||||
DD 3f808081h
|
||||
DD 3f800000h
|
||||
|
||||
END
|
294
sdk/lib/crt/math/libm_sse2/log_F_inv_qword_table.asm
Normal file
294
sdk/lib/crt/math/libm_sse2/log_F_inv_qword_table.asm
Normal file
|
@ -0,0 +1,294 @@
|
|||
;;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines __log_F_inv_qword
|
||||
;; Used in log10 and log
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
;; Table layout: 257 consecutive QWORDs, each the bit pattern of an IEEE-754
;; double-precision value, followed by one all-zero QWORD.  Entry j
;; (j = 0..256) is 512/(256+j) rounded to double precision, so the values
;; fall from 2.0 (entry 0) to 1.0 (entry 256).
ALIGN 16
|
||||
PUBLIC __log_F_inv_qword
|
||||
__log_F_inv_qword DQ 4000000000000000h ; entry 0: 2.0
|
||||
DQ 3fffe01fe01fe020h
|
||||
DQ 3fffc07f01fc07f0h
|
||||
DQ 3fffa11caa01fa12h
|
||||
DQ 3fff81f81f81f820h
|
||||
DQ 3fff6310aca0dbb5h
|
||||
DQ 3fff44659e4a4271h
|
||||
DQ 3fff25f644230ab5h
|
||||
DQ 3fff07c1f07c1f08h
|
||||
DQ 3ffee9c7f8458e02h
|
||||
DQ 3ffecc07b301ecc0h
|
||||
DQ 3ffeae807aba01ebh
|
||||
DQ 3ffe9131abf0b767h
|
||||
DQ 3ffe741aa59750e4h
|
||||
DQ 3ffe573ac901e574h
|
||||
DQ 3ffe3a9179dc1a73h
|
||||
DQ 3ffe1e1e1e1e1e1eh
|
||||
DQ 3ffe01e01e01e01eh
|
||||
DQ 3ffde5d6e3f8868ah
|
||||
DQ 3ffdca01dca01dcah
|
||||
DQ 3ffdae6076b981dbh
|
||||
DQ 3ffd92f2231e7f8ah
|
||||
DQ 3ffd77b654b82c34h
|
||||
DQ 3ffd5cac807572b2h
|
||||
DQ 3ffd41d41d41d41dh
|
||||
DQ 3ffd272ca3fc5b1ah
|
||||
DQ 3ffd0cb58f6ec074h
|
||||
DQ 3ffcf26e5c44bfc6h
|
||||
DQ 3ffcd85689039b0bh
|
||||
DQ 3ffcbe6d9601cbe7h
|
||||
DQ 3ffca4b3055ee191h
|
||||
DQ 3ffc8b265afb8a42h
|
||||
DQ 3ffc71c71c71c71ch
|
||||
DQ 3ffc5894d10d4986h
|
||||
DQ 3ffc3f8f01c3f8f0h
|
||||
DQ 3ffc26b5392ea01ch
|
||||
DQ 3ffc0e070381c0e0h
|
||||
DQ 3ffbf583ee868d8bh
|
||||
DQ 3ffbdd2b899406f7h
|
||||
DQ 3ffbc4fd65883e7bh
|
||||
DQ 3ffbacf914c1bad0h
|
||||
DQ 3ffb951e2b18ff23h
|
||||
DQ 3ffb7d6c3dda338bh
|
||||
DQ 3ffb65e2e3beee05h
|
||||
DQ 3ffb4e81b4e81b4fh
|
||||
DQ 3ffb37484ad806ceh
|
||||
DQ 3ffb2036406c80d9h
|
||||
DQ 3ffb094b31d922a4h
|
||||
DQ 3ffaf286bca1af28h
|
||||
DQ 3ffadbe87f94905eh
|
||||
DQ 3ffac5701ac5701bh
|
||||
DQ 3ffaaf1d2f87ebfdh
|
||||
DQ 3ffa98ef606a63beh
|
||||
DQ 3ffa82e65130e159h
|
||||
DQ 3ffa6d01a6d01a6dh
|
||||
DQ 3ffa574107688a4ah
|
||||
DQ 3ffa41a41a41a41ah
|
||||
DQ 3ffa2c2a87c51ca0h
|
||||
DQ 3ffa16d3f97a4b02h
|
||||
DQ 3ffa01a01a01a01ah
|
||||
DQ 3ff9ec8e951033d9h
|
||||
DQ 3ff9d79f176b682dh
|
||||
DQ 3ff9c2d14ee4a102h
|
||||
DQ 3ff9ae24ea5510dah
|
||||
DQ 3ff999999999999ah
|
||||
DQ 3ff9852f0d8ec0ffh
|
||||
DQ 3ff970e4f80cb872h
|
||||
DQ 3ff95cbb0be377aeh
|
||||
DQ 3ff948b0fcd6e9e0h
|
||||
DQ 3ff934c67f9b2ce6h
|
||||
DQ 3ff920fb49d0e229h
|
||||
DQ 3ff90d4f120190d5h
|
||||
DQ 3ff8f9c18f9c18fah
|
||||
DQ 3ff8e6527af1373fh
|
||||
DQ 3ff8d3018d3018d3h
|
||||
DQ 3ff8bfce8062ff3ah
|
||||
DQ 3ff8acb90f6bf3aah
|
||||
DQ 3ff899c0f601899ch
|
||||
DQ 3ff886e5f0abb04ah
|
||||
DQ 3ff87427bcc092b9h
|
||||
DQ 3ff8618618618618h
|
||||
DQ 3ff84f00c2780614h
|
||||
DQ 3ff83c977ab2beddh
|
||||
DQ 3ff82a4a0182a4a0h
|
||||
DQ 3ff8181818181818h
|
||||
DQ 3ff8060180601806h
|
||||
DQ 3ff7f405fd017f40h
|
||||
DQ 3ff7e225515a4f1dh
|
||||
DQ 3ff7d05f417d05f4h
|
||||
DQ 3ff7beb3922e017ch
|
||||
DQ 3ff7ad2208e0ecc3h
|
||||
DQ 3ff79baa6bb6398bh
|
||||
DQ 3ff78a4c8178a4c8h
|
||||
DQ 3ff77908119ac60dh
|
||||
DQ 3ff767dce434a9b1h
|
||||
DQ 3ff756cac201756dh
|
||||
DQ 3ff745d1745d1746h
|
||||
DQ 3ff734f0c541fe8dh
|
||||
DQ 3ff724287f46debch
|
||||
DQ 3ff713786d9c7c09h
|
||||
DQ 3ff702e05c0b8170h
|
||||
DQ 3ff6f26016f26017h
|
||||
DQ 3ff6e1f76b4337c7h
|
||||
DQ 3ff6d1a62681c861h
|
||||
DQ 3ff6c16c16c16c17h
|
||||
DQ 3ff6b1490aa31a3dh
|
||||
DQ 3ff6a13cd1537290h
|
||||
DQ 3ff691473a88d0c0h
|
||||
DQ 3ff6816816816817h
|
||||
DQ 3ff6719f3601671ah
|
||||
DQ 3ff661ec6a5122f9h
|
||||
DQ 3ff6524f853b4aa3h
|
||||
DQ 3ff642c8590b2164h
|
||||
DQ 3ff63356b88ac0deh
|
||||
DQ 3ff623fa77016240h
|
||||
DQ 3ff614b36831ae94h
|
||||
DQ 3ff6058160581606h
|
||||
DQ 3ff5f66434292dfch
|
||||
DQ 3ff5e75bb8d015e7h
|
||||
DQ 3ff5d867c3ece2a5h
|
||||
DQ 3ff5c9882b931057h
|
||||
DQ 3ff5babcc647fa91h
|
||||
DQ 3ff5ac056b015ac0h
|
||||
DQ 3ff59d61f123ccaah
|
||||
DQ 3ff58ed2308158edh
|
||||
DQ 3ff5805601580560h
|
||||
DQ 3ff571ed3c506b3ah
|
||||
DQ 3ff56397ba7c52e2h
|
||||
DQ 3ff5555555555555h
|
||||
DQ 3ff54725e6bb82feh
|
||||
DQ 3ff5390948f40febh
|
||||
DQ 3ff52aff56a8054bh
|
||||
DQ 3ff51d07eae2f815h
|
||||
DQ 3ff50f22e111c4c5h
|
||||
DQ 3ff5015015015015h
|
||||
DQ 3ff4f38f62dd4c9bh
|
||||
DQ 3ff4e5e0a72f0539h
|
||||
DQ 3ff4d843bedc2c4ch
|
||||
DQ 3ff4cab88725af6eh
|
||||
DQ 3ff4bd3edda68fe1h
|
||||
DQ 3ff4afd6a052bf5bh
|
||||
DQ 3ff4a27fad76014ah
|
||||
DQ 3ff49539e3b2d067h
|
||||
DQ 3ff4880522014880h
|
||||
DQ 3ff47ae147ae147bh
|
||||
DQ 3ff46dce34596066h
|
||||
DQ 3ff460cbc7f5cf9ah
|
||||
DQ 3ff453d9e2c776cah
|
||||
DQ 3ff446f86562d9fbh
|
||||
DQ 3ff43a2730abee4dh
|
||||
DQ 3ff42d6625d51f87h
|
||||
DQ 3ff420b5265e5951h
|
||||
DQ 3ff4141414141414h
|
||||
DQ 3ff40782d10e6566h
|
||||
DQ 3ff3fb013fb013fbh
|
||||
DQ 3ff3ee8f42a5af07h
|
||||
DQ 3ff3e22cbce4a902h
|
||||
DQ 3ff3d5d991aa75c6h
|
||||
DQ 3ff3c995a47babe7h
|
||||
DQ 3ff3bd60d9232955h
|
||||
DQ 3ff3b13b13b13b14h
|
||||
DQ 3ff3a524387ac822h
|
||||
DQ 3ff3991c2c187f63h
|
||||
DQ 3ff38d22d366088eh
|
||||
DQ 3ff3813813813814h
|
||||
DQ 3ff3755bd1c945eeh
|
||||
DQ 3ff3698df3de0748h
|
||||
DQ 3ff35dce5f9f2af8h
|
||||
DQ 3ff3521cfb2b78c1h
|
||||
DQ 3ff34679ace01346h
|
||||
DQ 3ff33ae45b57bcb2h
|
||||
DQ 3ff32f5ced6a1dfah
|
||||
DQ 3ff323e34a2b10bfh
|
||||
DQ 3ff3187758e9ebb6h
|
||||
DQ 3ff30d190130d190h
|
||||
DQ 3ff301c82ac40260h
|
||||
DQ 3ff2f684bda12f68h
|
||||
DQ 3ff2eb4ea1fed14bh
|
||||
DQ 3ff2e025c04b8097h
|
||||
DQ 3ff2d50a012d50a0h
|
||||
DQ 3ff2c9fb4d812ca0h
|
||||
DQ 3ff2bef98e5a3711h
|
||||
DQ 3ff2b404ad012b40h
|
||||
DQ 3ff2a91c92f3c105h
|
||||
DQ 3ff29e4129e4129eh
|
||||
DQ 3ff293725bb804a5h
|
||||
DQ 3ff288b01288b013h
|
||||
DQ 3ff27dfa38a1ce4dh
|
||||
DQ 3ff27350b8812735h
|
||||
DQ 3ff268b37cd60127h
|
||||
DQ 3ff25e22708092f1h
|
||||
DQ 3ff2539d7e9177b2h
|
||||
DQ 3ff2492492492492h
|
||||
DQ 3ff23eb79717605bh
|
||||
DQ 3ff23456789abcdfh
|
||||
DQ 3ff22a0122a0122ah
|
||||
DQ 3ff21fb78121fb78h
|
||||
DQ 3ff21579804855e6h
|
||||
DQ 3ff20b470c67c0d9h
|
||||
DQ 3ff2012012012012h
|
||||
DQ 3ff1f7047dc11f70h
|
||||
DQ 3ff1ecf43c7fb84ch
|
||||
DQ 3ff1e2ef3b3fb874h
|
||||
DQ 3ff1d8f5672e4abdh
|
||||
DQ 3ff1cf06ada2811dh
|
||||
DQ 3ff1c522fc1ce059h
|
||||
DQ 3ff1bb4a4046ed29h
|
||||
DQ 3ff1b17c67f2bae3h
|
||||
DQ 3ff1a7b9611a7b96h
|
||||
DQ 3ff19e0119e0119eh
|
||||
DQ 3ff19453808ca29ch
|
||||
DQ 3ff18ab083902bdbh
|
||||
DQ 3ff1811811811812h
|
||||
DQ 3ff1778a191bd684h
|
||||
DQ 3ff16e0689427379h
|
||||
DQ 3ff1648d50fc3201h
|
||||
DQ 3ff15b1e5f75270dh
|
||||
DQ 3ff151b9a3fdd5c9h
|
||||
DQ 3ff1485f0e0acd3bh
|
||||
DQ 3ff13f0e8d344724h
|
||||
DQ 3ff135c81135c811h
|
||||
DQ 3ff12c8b89edc0ach
|
||||
DQ 3ff12358e75d3033h
|
||||
DQ 3ff11a3019a74826h
|
||||
DQ 3ff1111111111111h
|
||||
DQ 3ff107fbbe011080h
|
||||
DQ 3ff0fef010fef011h
|
||||
DQ 3ff0f5edfab325a2h
|
||||
DQ 3ff0ecf56be69c90h
|
||||
DQ 3ff0e40655826011h
|
||||
DQ 3ff0db20a88f4696h
|
||||
DQ 3ff0d24456359e3ah
|
||||
DQ 3ff0c9714fbcda3bh
|
||||
DQ 3ff0c0a7868b4171h
|
||||
DQ 3ff0b7e6ec259dc8h
|
||||
DQ 3ff0af2f722eecb5h
|
||||
DQ 3ff0a6810a6810a7h
|
||||
DQ 3ff09ddba6af8360h
|
||||
DQ 3ff0953f39010954h
|
||||
DQ 3ff08cabb37565e2h
|
||||
DQ 3ff0842108421084h
|
||||
DQ 3ff07b9f29b8eae2h
|
||||
DQ 3ff073260a47f7c6h
|
||||
DQ 3ff06ab59c7912fbh
|
||||
DQ 3ff0624dd2f1a9fch
|
||||
DQ 3ff059eea0727586h
|
||||
DQ 3ff05197f7d73404h
|
||||
DQ 3ff04949cc1664c5h
|
||||
DQ 3ff0410410410410h
|
||||
DQ 3ff038c6b78247fch
|
||||
DQ 3ff03091b51f5e1ah
|
||||
DQ 3ff02864fc7729e9h
|
||||
DQ 3ff0204081020408h
|
||||
DQ 3ff0182436517a37h
|
||||
DQ 3ff0101010101010h
|
||||
DQ 3ff0080402010080h
|
||||
DQ 3ff0000000000000h
|
||||
DQ 0000000000000000h
|
||||
|
||||
|
||||
END
|
133
sdk/lib/crt/math/libm_sse2/log_special.c
Normal file
133
sdk/lib/crt/math/libm_sse2/log_special.c
Normal file
|
@ -0,0 +1,133 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <fpieee.h>
|
||||
#include <excpt.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "libm_new.h"
|
||||
|
||||
// y = log10f(x)
|
||||
// y = log10(x)
|
||||
// y = logf(x)
|
||||
// y = log(x)
|
||||
|
||||
// these codes and the ones in the related .asm files have to match
|
||||
#define LOG_X_ZERO 1
|
||||
#define LOG_X_NEG 2
|
||||
#define LOG_X_NAN 3
|
||||
|
||||
/*
 * Shared special-case path for the single-precision log family.
 *
 *   x    - the argument that triggered the special case
 *   y    - the result the caller has already chosen; it is returned as is
 *   code - one of LOG_X_ZERO, LOG_X_NEG or LOG_X_NAN (must match the codes
 *          used by the related .asm files)
 *   op   - operation code forwarded to _handle_errorf
 *   name - function name forwarded to _handle_errorf
 *
 * For a recognised code the condition is forwarded to _handle_errorf; any
 * other code is ignored.  The value returned by _handle_errorf is not used.
 */
static float _logf_special_common(float x, float y, U32 code, unsigned int op, char *name)
{
    UT64 result_bits;
    UT32 arg_bits;

    /* _handle_errorf takes the result as a 64-bit pattern: put the float in
       the first 32-bit slot of a zeroed 64-bit word. */
    result_bits.u64 = 0;
    result_bits.f32[0] = y;

    if (code == LOG_X_ZERO)
    {
        /* x == 0: singularity, divide-by-zero flag, ERANGE */
        _handle_errorf(name, op, result_bits.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1);
    }
    else if (code == LOG_X_NEG)
    {
        /* x < 0: domain error, invalid flag, EDOM */
        _handle_errorf(name, op, result_bits.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
    }
    else if (code == LOG_X_NAN)
    {
        arg_bits.f32 = x;

        if ((arg_bits.u32 & QNAN_MASK_32) == QNAN_SET_32)
        {
            /* quiet NaN: domain error without raising a flag */
            _handle_errorf(name, op, result_bits.u64, _DOMAIN, 0, EDOM, x, 0.0, 1);
        }
        else
        {
            /* signalling NaN: domain error plus the invalid flag */
            _handle_errorf(name, op, result_bits.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
        }
    }

    return y;
}
|
||||
|
||||
/* Special-case entry point for logf: delegates to the shared single-precision
   handler with the logf operation code and name. */
float _logf_special(float x, float y, U32 code)
{
    float result = _logf_special_common(x, y, code, _FpCodeLog, "logf");

    return result;
}
|
||||
|
||||
/* Special-case entry point for log10f: delegates to the shared
   single-precision handler with the log10f operation code and name. */
float _log10f_special(float x, float y, U32 code)
{
    float result = _logf_special_common(x, y, code, _FpCodeLog10, "log10f");

    return result;
}
|
||||
|
||||
/*
 * Shared special-case path for the double-precision log family.
 *
 *   x    - the argument that triggered the special case
 *   y    - the result the caller has already chosen; it is returned as is
 *   code - one of LOG_X_ZERO, LOG_X_NEG or LOG_X_NAN (must match the codes
 *          used by the related .asm files)
 *   op   - operation code forwarded to _handle_error
 *   name - function name forwarded to _handle_error
 *
 * For a recognised code the condition is forwarded to _handle_error; any
 * other code is ignored.  The value returned by _handle_error is not used.
 */
static double _log_special_common(double x, double y, U32 code, unsigned int op, char *name)
{
    UT64 result_bits;

    /* _handle_error takes the result as its raw 64-bit pattern. */
    result_bits.f64 = y;

    if (code == LOG_X_ZERO)
    {
        /* x == 0: singularity, divide-by-zero flag, ERANGE */
        _handle_error(name, op, result_bits.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1);
    }
    else if (code == LOG_X_NEG)
    {
        /* x < 0: domain error, invalid flag, EDOM */
        _handle_error(name, op, result_bits.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
    }
    else if (code == LOG_X_NAN)
    {
        /* NaN argument: domain error, no flag raised.
           NOTE(review): unlike _logf_special_common, quiet and signalling
           NaNs are not told apart here - confirm this is intended. */
        _handle_error(name, op, result_bits.u64, _DOMAIN, 0, EDOM, x, 0.0, 1);
    }

    return y;
}
|
||||
|
||||
/* Special-case entry point for log: delegates to the shared double-precision
   handler with the log operation code and name. */
double _log_special(double x, double y, U32 code)
{
    double result = _log_special_common(x, y, code, _FpCodeLog, "log");

    return result;
}
|
||||
|
||||
/* Special-case entry point for log10: delegates to the shared
   double-precision handler with the log10 operation code and name. */
double _log10_special(double x, double y, U32 code)
{
    double result = _log_special_common(x, y, code, _FpCodeLog10, "log10");

    return result;
}
|
84
sdk/lib/crt/math/libm_sse2/logb.c
Normal file
84
sdk/lib/crt/math/libm_sse2/logb.c
Normal file
|
@ -0,0 +1,84 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_INFINITY_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_INFINITY_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
double _logb(double x)
|
||||
{
|
||||
|
||||
unsigned long ux;
|
||||
long u;
|
||||
GET_BITS_DP64(x, ux);
|
||||
u = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
|
||||
if ((ux & ~SIGNBIT_DP64) == 0)
|
||||
/* x is +/-zero. Return -infinity with div-by-zero flag. */
|
||||
return _handle_error("_logb", OP_LOGB, NINFBITPATT_DP64, _SING,
|
||||
AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1);
|
||||
else if (EMIN_DP64 <= u && u <= EMAX_DP64)
|
||||
/* x is a normal number */
|
||||
return (double)u;
|
||||
else if (u > EMAX_DP64)
|
||||
{
|
||||
/* x is infinity or NaN */
|
||||
if ((ux & MANTBITS_DP64) == 0)
|
||||
/* x is +/-infinity. For VC++, return infinity of same sign. */
|
||||
return x;
|
||||
else
|
||||
/* x is NaN, result is NaN */
|
||||
return _handle_error("_logb", OP_LOGB, ux|0x0008000000000000, _DOMAIN,
|
||||
0, EDOM, x, 0.0, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is denormalized. */
|
||||
#ifdef FOLLOW_IEEE754_LOGB
|
||||
/* Return the value of the minimum exponent to ensure that
|
||||
the relationship between logb and scalb, defined in
|
||||
IEEE 754, holds. */
|
||||
return EMIN_DP64;
|
||||
#else
|
||||
/* Follow the rule set by IEEE 854 for logb */
|
||||
ux &= MANTBITS_DP64;
|
||||
u = EMIN_DP64;
|
||||
while (ux < IMPBIT_DP64)
|
||||
{
|
||||
ux <<= 1;
|
||||
u--;
|
||||
}
|
||||
return (double)u;
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
82
sdk/lib/crt/math/libm_sse2/logbf.c
Normal file
82
sdk/lib/crt/math/libm_sse2/logbf.c
Normal file
|
@ -0,0 +1,82 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_INFINITYF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_INFINITYF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
float _logbf(float x)
|
||||
{
|
||||
unsigned int ux;
|
||||
int u;
|
||||
GET_BITS_SP32(x, ux);
|
||||
u = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
|
||||
if ((ux & ~SIGNBIT_SP32) == 0)
|
||||
/* x is +/-zero. Return -infinity with div-by-zero flag. */
|
||||
return _handle_errorf("_logbf", OP_LOGB, NINFBITPATT_SP32, _SING,
|
||||
AMD_F_DIVBYZERO, ERANGE, x, 0.0F, 1);
|
||||
else if (EMIN_SP32 <= u && u <= EMAX_SP32)
|
||||
/* x is a normal number */
|
||||
return (float)u;
|
||||
else if (u > EMAX_SP32)
|
||||
{
|
||||
/* x is infinity or NaN */
|
||||
if ((ux & MANTBITS_SP32) == 0)
|
||||
/* x is +/-infinity. For VC++, return infinity of same sign. */
|
||||
return x;
|
||||
else
|
||||
/* x is NaN, result is NaN */
|
||||
return _handle_errorf("_logbf", OP_LOGB, ux|0x00400000, _DOMAIN,
|
||||
0, EDOM, x, 0.0F, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is denormalized. */
|
||||
#ifdef FOLLOW_IEEE754_LOGB
|
||||
/* Return the value of the minimum exponent to ensure that
|
||||
the relationship between logb and scalb, defined in
|
||||
IEEE 754, holds. */
|
||||
return EMIN_SP32;
|
||||
#else
|
||||
/* Follow the rule set by IEEE 854 for logb */
|
||||
ux &= MANTBITS_SP32;
|
||||
u = EMIN_SP32;
|
||||
while (ux < IMPBIT_SP32)
|
||||
{
|
||||
ux <<= 1;
|
||||
u--;
|
||||
}
|
||||
return (float)u;
|
||||
#endif
|
||||
}
|
||||
}
|
451
sdk/lib/crt/math/libm_sse2/logf.asm
Normal file
451
sdk/lib/crt/math/libm_sse2/logf.asm
Normal file
|
@ -0,0 +1,451 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;
|
||||
; logf.asm
|
||||
;
|
||||
; An implementation of the logf libm function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; float logf(float x);
|
||||
;
|
||||
|
||||
;
|
||||
; Algorithm:
|
||||
; Similar to the one presented in log.asm
|
||||
;
|
||||
.const
|
||||
|
||||
|
||||
; Read-only constants for logf.  Values commented as reals are IEEE-754
; single-precision bit patterns held in the low dword of the storage.
ALIGN 16
|
||||
|
||||
L_real_one DQ 0000000003f800000h ; 1.0
|
||||
DQ 0000000000000000h
|
||||
L_real_two DQ 00000000040000000h ; 2.0
|
||||
DQ 00000000000000000h
|
||||
L_real_ninf DQ 000000000ff800000h ; -inf
|
||||
DQ 0000000000000000h
|
||||
L_real_inf DQ 0000000007f800000h ; +inf
|
||||
DQ 0000000000000000h
|
||||
L_real_nan DQ 0000000007fc00000h ; NaN
|
||||
DQ 0000000000000000h
|
||||
L_real_neg_qnan DQ 000000000ffc00000h ; negative quiet NaN
|
||||
DQ 0000000000000000h
|
||||
L_real_notsign DQ 0000000007ffFFFFFh ; ^sign bit
|
||||
DQ 0000000000000000h
|
||||
L_real_mant DQ 0007FFFFF007FFFFFh ; mantissa bits
|
||||
DQ 0007FFFFF007FFFFFh
|
||||
L_mask_127 DQ 00000007f0000007fh ; 127 (exponent bias) in every dword
|
||||
DQ 00000007f0000007fh
|
||||
L_mask_253 DQ 000000000000000fdh ; 253, subtracted from the exponent on the subnormal path
|
||||
DQ 00000000000000000h
|
||||
L_mask_mant_all7 DQ 00000000007f0000h ; 7 most significant mantissa bits
|
||||
DQ 00000000007f0000h
|
||||
L_mask_mant8 DQ 0000000000008000h ; 8th mantissa bit, used to round the 7-bit index
|
||||
DQ 0000000000000000h
|
||||
L_real_ca1 DQ 0000000003DAAAAABh ; 8.33333333333317923934e-02
|
||||
DQ 00000000000000000h
|
||||
L_real_ca2 DQ 0000000003C4CCCCDh ; 1.25000000037717509602e-02
|
||||
DQ 00000000000000000h
|
||||
L_real_log2_lead DQ 03F3170003F317000h ; 0.693115234375
|
||||
DQ 00000000000000000h
|
||||
L_real_log2_tail DQ 0000000003805FDF4h ; 0.000031946183
|
||||
DQ 00000000000000000h
|
||||
L_real_half DQ 0000000003f000000h ; 1/2
|
||||
DQ 00000000000000000h
|
||||
L_real_1_over_3 DQ 0000000003eaaaaabh ; 1/3 (rounded)
|
||||
DQ 00000000000000000h
|
||||
|
||||
L_real_1_over_2 DD 03f000000h ; 0.5
|
||||
L_real_neg127 DD 0c2fe0000h ; -127.0
|
||||
L_real_qnanbit DD 000400000h ; quiet nan bit
|
||||
L_real_threshold DD 03d800000h ; 0.0625 (1/16)
|
||||
|
||||
; these codes and the ones in the corresponding .c file have to match
|
||||
L_flag_x_zero DD 00000001
|
||||
L_flag_x_neg DD 00000002
|
||||
L_flag_x_nan DD 00000003
|
||||
|
||||
EXTRN __log_128_lead:DWORD
|
||||
EXTRN __log_128_tail:DWORD
|
||||
EXTRN __log_F_inv_dword:DWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
fname TEXTEQU <logf>
|
||||
fname_special TEXTEQU <_logf_special>
|
||||
|
||||
; define local variable storage offsets
|
||||
|
||||
dummy_space EQU 020h
|
||||
stack_size EQU 038h
|
||||
|
||||
include fm.inc
|
||||
|
||||
; external function
|
||||
EXTERN fname_special:PROC
|
||||
|
||||
.code
|
||||
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
.ENDPROLOG
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Llogf_fma3
|
||||
|
||||
; Some of the placement of instructions below will be odd.
|
||||
; We are attempting to have no more than one branch per 32-byte block.
|
||||
Llogf_sse2:
|
||||
; Zero the high bits of rax because it will be used as an index later.
|
||||
xor rax, rax
|
||||
movdqa xmm3, xmm0
|
||||
movaps xmm4, xmm0
|
||||
|
||||
; This computation of the exponent of x will produce nonsense if x <= 0.,
|
||||
; but those cases are eliminated below, so it does no harm.
|
||||
psrld xmm3, 23 ; xmm3 <-- biased exp if x > 0.
|
||||
|
||||
; Is x Inf or NaN?
|
||||
movd eax, xmm0 ; eax <-- x
|
||||
mov ecx, eax
|
||||
btr ecx, 31 ; ecx <-- |x|
|
||||
cmp ecx, DWORD PTR L_real_inf
|
||||
jae Llogf_sse2_x_is_inf_or_nan
|
||||
|
||||
; Finish computing exponent.
|
||||
psubd xmm3, XMMWORD PTR L_mask_127 ; xmm3 <-- xexp (unbiased)
|
||||
movdqa xmm2, xmm0
|
||||
cvtdq2ps xmm5, xmm3 ; (float)xexp, unless x <= 0.
|
||||
|
||||
; Is x negative or zero?
|
||||
xorps xmm1, xmm1
|
||||
comiss xmm0, xmm1
|
||||
jbe Llogf_sse2_x_is_zero_or_neg
|
||||
|
||||
pand xmm2, XMMWORD PTR L_real_mant ; xmm2 <-- x mantissa for later
|
||||
subss xmm4, DWORD PTR L_real_one ; xmm4 <-- x - 1. for later
|
||||
|
||||
comiss xmm5, DWORD PTR L_real_neg127 ; x!=0, xexp==0 ==> subnormal
|
||||
je Llogf_sse2_subnormal_adjust
|
||||
|
||||
Llogf_sse2_continue_common:
|
||||
; At this point we need |x| (possibly adjusted) in eax
|
||||
; and m = xexp (possibly adjusted) in xmm5
|
||||
; We also need the value of x - 1. computed above.
|
||||
|
||||
; compute the index into the log tables
|
||||
mov r9d, eax
|
||||
and eax, DWORD PTR L_mask_mant_all7 ; eax <-- 7 bits of x mantissa
|
||||
and r9d, DWORD PTR L_mask_mant8 ; r9d <-- 8th bit
|
||||
shl r9d, 1
|
||||
add eax, r9d ; use 8th bit to round up
|
||||
movd xmm1, eax
|
||||
|
||||
; Is x near 1.0 ?
|
||||
; Note that if x is subnormal it is perforce not near one.
|
||||
andps xmm4, XMMWORD PTR L_real_notsign ; xmm4 <-- |x-1|
|
||||
comiss xmm4, DWORD PTR L_real_threshold ; is |x-1| < 1/16?
|
||||
jb Llogf_sse2_near_one ; if so, handle elsewhere
|
||||
|
||||
; F, Y
|
||||
; F is a number in [.5,1) scaled from the rounded mantissa bits computed
|
||||
; above by oring in the exponent of .5.
|
||||
; Y is all of the mantissa bits of X scaled to [.5,1.) similarly
|
||||
shr eax, 16 ; shift eax to use as index
|
||||
por xmm2, XMMWORD PTR L_real_half ; xmm2 <-- Y
|
||||
por xmm1, XMMWORD PTR L_real_half ; xmm2 <-- F
|
||||
lea r9, QWORD PTR __log_F_inv_dword
|
||||
|
||||
|
||||
; f = F - Y, r = f * inv
|
||||
subss xmm1, xmm2 ; xmm1 <-- f
|
||||
mulss xmm1, DWORD PTR [r9+rax*4] ; xmm1 <-- r = f*inv (tabled)
|
||||
|
||||
movaps xmm2, xmm1
|
||||
movaps xmm0, xmm1
|
||||
|
||||
; poly
|
||||
mulss xmm2, DWORD PTR L_real_1_over_3 ; xmm2 <-- r/3
|
||||
mulss xmm0, xmm1 ; xmm0 <-- r^2
|
||||
addss xmm2, DWORD PTR L_real_1_over_2
|
||||
movaps xmm3, XMMWORD PTR L_real_log2_tail
|
||||
|
||||
lea r9, QWORD PTR __log_128_tail
|
||||
lea r10, QWORD PTR __log_128_lead
|
||||
|
||||
mulss xmm2, xmm0 ; xmm2 <-- r^2 * (r/3 + 1/2)
|
||||
mulss xmm3, xmm5 ; xmm3 <-- (m=xexp)*log2_tail
|
||||
addss xmm1, xmm2 ; xmm1 <-- poly
|
||||
|
||||
; m*log(2) + log(G) - poly, where G is just 2*F
|
||||
; log(G) is precomputed to extra precision.
|
||||
; small pieces and large pieces are separated until the final add,
|
||||
; to preserve accuracy
|
||||
movaps xmm0, XMMWORD PTR L_real_log2_lead
|
||||
subss xmm3, xmm1 ; xmm3 <-- m*log2_tail - poly
|
||||
mulss xmm0, xmm5 ; xmm0 <-- m*log1_lead
|
||||
addss xmm3, DWORD PTR [r9+rax*4] ; xmm3 += log(G) tail
|
||||
addss xmm0, DWORD PTR [r10+rax*4] ; xmm0 += log(G) lead
|
||||
|
||||
addss xmm0, xmm3 ; xmm0 <-- m*log(2)+log(G)-poly
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Llogf_sse2_near_one:
|
||||
; Computation of the log for x near one requires special techniques.
|
||||
movaps xmm2, DWORD PTR L_real_two
|
||||
subss xmm0, DWORD PTR L_real_one ; xmm0 <-- r = x - 1.0
|
||||
addss xmm2, xmm0
|
||||
movaps xmm1, xmm0
|
||||
divss xmm1, xmm2 ; xmm1 <-- u = r/(2.0+r)
|
||||
movaps xmm4, xmm0
|
||||
mulss xmm4, xmm1 ; xmm4 <-- correction = r*u
|
||||
addss xmm1, xmm1 ; xmm1 <-- u = 2.*u
|
||||
movaps xmm2, xmm1
|
||||
mulss xmm2, xmm2 ; xmm2 <-- u^2
|
||||
|
||||
; r2 = (u^3 * (ca_1 + u^2 * ca_2) - correction)
|
||||
movaps xmm3, xmm1
|
||||
mulss xmm3, xmm2 ; xmm3 <-- u^3
|
||||
mulss xmm2, DWORD PTR L_real_ca2 ; xmm2 <-- ca2*u^2
|
||||
addss xmm2, DWORD PTR L_real_ca1 ; xmm2 <-- ca2*u^2 + ca1
|
||||
mulss xmm2, xmm3 ; xmm2 <-- u^3*(ca1+u^2*ca2)
|
||||
subss xmm2, xmm4 ; xmm2 <-- r2
|
||||
|
||||
; return r + r2
|
||||
addss xmm0, xmm2
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Llogf_sse2_subnormal_adjust:
|
||||
; This code adjusts eax and xmm5.
|
||||
; It must preserve xmm4.
|
||||
por xmm2, XMMWORD PTR L_real_one
|
||||
subss xmm2, DWORD PTR L_real_one
|
||||
movdqa xmm5, xmm2
|
||||
pand xmm2, XMMWORD PTR L_real_mant
|
||||
movd eax, xmm2
|
||||
psrld xmm5, 23
|
||||
psubd xmm5, XMMWORD PTR L_mask_253
|
||||
cvtdq2ps xmm5, xmm5
|
||||
jmp Llogf_sse2_continue_common
|
||||
|
||||
; Until we get to the FMA3 code, the rest of this is special case handling.
|
||||
Llogf_sse2_x_is_zero_or_neg:
|
||||
jne Llogf_sse2_x_is_neg
|
||||
|
||||
movaps xmm1, XMMWORD PTR L_real_ninf
|
||||
mov r8d, DWORD PTR L_flag_x_zero
|
||||
call fname_special
|
||||
jmp Llogf_sse2_finish
|
||||
|
||||
Llogf_sse2_x_is_neg:
|
||||
|
||||
movaps xmm1, XMMWORD PTR L_real_neg_qnan
|
||||
mov r8d, DWORD PTR L_flag_x_neg
|
||||
call fname_special
|
||||
jmp Llogf_sse2_finish
|
||||
|
||||
Llogf_sse2_x_is_inf_or_nan:
|
||||
|
||||
cmp eax, DWORD PTR L_real_inf
|
||||
je Llogf_sse2_finish
|
||||
|
||||
cmp eax, DWORD PTR L_real_ninf
|
||||
je Llogf_sse2_x_is_neg
|
||||
|
||||
or eax, DWORD PTR L_real_qnanbit
|
||||
movd xmm1, eax
|
||||
mov r8d, DWORD PTR L_flag_x_nan
|
||||
call fname_special
|
||||
jmp Llogf_sse2_finish
|
||||
|
||||
Llogf_sse2_finish:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Llogf_fma3:
|
||||
; compute exponent part
|
||||
vmovaps xmm4,XMMWORD PTR L_real_inf ; preload for inf/nan test
|
||||
xor rax,rax
|
||||
vpsrld xmm3,xmm0,23 ; xmm3 <-- (ux>>23)
|
||||
vmovd eax,xmm0 ;eax = x
|
||||
vpsubd xmm3,xmm3,DWORD PTR L_mask_127 ; xmm3 <-- (ux>>23) - 127
|
||||
vcvtdq2ps xmm5,xmm3 ; xmm5 <-- float((ux>>23)-127) = xexp
|
||||
|
||||
; NaN or inf
|
||||
vpand xmm1,xmm0,xmm4 ; xmm1 <-- (ux & 07f800000h)
|
||||
vcomiss xmm1,xmm4
|
||||
je Llogf_fma3_x_is_inf_or_nan
|
||||
|
||||
; check for negative numbers or zero
|
||||
vpxor xmm1,xmm1,xmm1
|
||||
vcomiss xmm0,xmm1
|
||||
jbe Llogf_fma3_x_is_zero_or_neg
|
||||
|
||||
vpand xmm2,xmm0,DWORD PTR L_real_mant ; xmm2 <-- ux & 0007FFFFFh
|
||||
vsubss xmm4,xmm0,DWORD PTR L_real_one ; xmm4 <-- x - 1.0
|
||||
|
||||
vcomiss xmm5,DWORD PTR L_real_neg127
|
||||
je Llogf_fma3_subnormal_adjust
|
||||
|
||||
Llogf_fma3_continue_common:
|
||||
|
||||
; compute the index into the log tables
|
||||
vpand xmm1,xmm0,DWORD PTR L_mask_mant_all7 ; xmm1 = ux & 0007f0000h
|
||||
vpand xmm3,xmm0,DWORD PTR L_mask_mant8 ; xmm3 = ux & 000008000h
|
||||
vpslld xmm3,xmm3,1 ; xmm3 = (ux & 000008000h) << 1
|
||||
vpaddd xmm1,xmm3,xmm1
|
||||
; eax = (ux & 0007f0000h) + ((ux & 000008000h) << 1)
|
||||
; eax <-- x/127., rounded to nearest
|
||||
vmovd eax,xmm1
|
||||
|
||||
; near one codepath
|
||||
vandps xmm4,xmm4,DWORD PTR L_real_notsign ; xmm4 <-- fabs (x - 1.0)
|
||||
vcomiss xmm4,DWORD PTR L_real_threshold
|
||||
jb Llogf_fma3_near_one
|
||||
|
||||
; F,Y
|
||||
shr eax,16
|
||||
vpor xmm2,xmm2,DWORD PTR L_real_half ; xmm2 <-- Y
|
||||
vpor xmm1,xmm1,DWORD PTR L_real_half ; xmm1 <-- F
|
||||
lea r9,QWORD PTR __log_F_inv_dword
|
||||
|
||||
; f = F - Y
|
||||
vsubss xmm1,xmm1,xmm2 ; f = F - Y
|
||||
; r = f * log_F_inv_dword[index]
|
||||
vmulss xmm1,xmm1,DWORD PTR [r9 + rax * 4]
|
||||
|
||||
; poly
|
||||
vmovaps xmm2,XMMWORD PTR L_real_1_over_3
|
||||
vfmadd213ss xmm2,xmm1,DWORD PTR L_real_1_over_2 ; 1/3*r + 1/2
|
||||
vmulss xmm0,xmm1,xmm1 ; r*r
|
||||
vmovaps xmm3,DWORD PTR L_real_log2_tail;
|
||||
|
||||
lea r9,DWORD PTR __log_128_tail
|
||||
lea r10,DWORD PTR __log_128_lead
|
||||
|
||||
vfmadd231ss xmm1,xmm2,xmm0 ; poly = r + 1/2*r*r + 1/3*r*r*r
|
||||
vfmsub213ss xmm3,xmm5,xmm1 ; (xexp * log2_tail) - poly
|
||||
|
||||
; m*log(2) + log(G) - poly
|
||||
vmovaps xmm0,DWORD PTR L_real_log2_lead
|
||||
vfmadd213ss xmm0,xmm5,[r10 + rax * 4]
|
||||
; z2 = (xexp * log2_tail) - poly + log_128_tail[index]
|
||||
vaddss xmm3,xmm3,DWORD PTR [r9 + rax * 4]
|
||||
vaddss xmm0,xmm0,xmm3 ; return z1 + z2
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Llogf_fma3_near_one:
|
||||
; r = x - 1.0;
|
||||
vmovaps xmm2,DWORD PTR L_real_two
|
||||
vsubss xmm0,xmm0,DWORD PTR L_real_one ; xmm0 = r = x - 1.0
|
||||
|
||||
; u = r / (2.0 + r)
|
||||
vaddss xmm2,xmm2,xmm0 ; (r+2.0)
|
||||
vdivss xmm1,xmm0,xmm2 ; u = r / (2.0 + r)
|
||||
|
||||
; correction = r * u
|
||||
vmulss xmm4,xmm0,xmm1 ; correction = u*r
|
||||
|
||||
; u = u + u;
|
||||
vaddss xmm1,xmm1,xmm1 ; u = u+u
|
||||
vmulss xmm2,xmm1,xmm1 ; v = u^2
|
||||
|
||||
; r2 = (u * v * (ca_1 + v * ca_2) - correction)
|
||||
vmulss xmm3,xmm1,xmm2 ; u^3
|
||||
vmovaps xmm5,DWORD PTR L_real_ca2
|
||||
vfmadd213ss xmm2,xmm5,DWORD PTR L_real_ca1
|
||||
vfmsub213ss xmm2,xmm3,xmm4 ; r2 = (ca1 + ca2 * v) * u^3 - correction
|
||||
|
||||
; r + r2
|
||||
vaddss xmm0,xmm0,xmm2
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Llogf_fma3_subnormal_adjust:
|
||||
vmovaps xmm3,DWORD PTR L_real_one
|
||||
vpor xmm2,xmm2,xmm3 ; xmm2 = temp = ((ux &0007FFFFFh) | 03f800000h)
|
||||
vsubss xmm2,xmm2,xmm3 ; xmm2 = temp -1.0
|
||||
vpsrld xmm5,xmm2,23 ; xmm5 = (utemp >> 23)
|
||||
vpand xmm2,xmm2,DWORD PTR L_real_mant ; xmm2 = (utemp & 0007FFFFFh)
|
||||
vmovaps xmm0,xmm2
|
||||
vpsubd xmm5,xmm5,DWORD PTR L_mask_253 ; xmm5 = (utemp >> 23) - 253
|
||||
vcvtdq2ps xmm5,xmm5 ; xmm5 = (float) ((utemp >> 23) - 253)
|
||||
jmp Llogf_fma3_continue_common
|
||||
|
||||
Llogf_fma3_x_is_zero_or_neg:
|
||||
jne Llogf_fma3_x_is_neg
|
||||
|
||||
vmovaps xmm1,DWORD PTR L_real_ninf
|
||||
mov r8d,DWORD PTR L_flag_x_zero
|
||||
call fname_special
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
Llogf_fma3_x_is_neg:
|
||||
|
||||
vmovaps xmm1,DWORD PTR L_real_neg_qnan
|
||||
mov r8d,DWORD PTR L_flag_x_neg
|
||||
call fname_special
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Llogf_fma3_x_is_inf_or_nan:
|
||||
|
||||
cmp eax,DWORD PTR L_real_inf
|
||||
je Llogf_fma3_finish
|
||||
|
||||
cmp eax,DWORD PTR L_real_ninf
|
||||
je Llogf_fma3_x_is_neg
|
||||
|
||||
or eax,DWORD PTR L_real_qnanbit
|
||||
vmovd xmm1,eax
|
||||
mov r8d,DWORD PTR L_flag_x_nan
|
||||
call fname_special
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Llogf_fma3_finish:
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
|
||||
fname endp
|
||||
END
|
76
sdk/lib/crt/math/libm_sse2/modf.c
Normal file
76
sdk/lib/crt/math/libm_sse2/modf.c
Normal file
|
@ -0,0 +1,76 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
double modf(double x, double *iptr)
|
||||
{
|
||||
/* modf splits the argument x into integer and fraction parts,
|
||||
each with the same sign as x. */
|
||||
|
||||
|
||||
long xexp;
|
||||
unsigned long ux, ax, mask;
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
ax = ux & (~SIGNBIT_DP64);
|
||||
|
||||
if (ax >= 0x4340000000000000)
|
||||
{
|
||||
/* abs(x) is either NaN, infinity, or >= 2^53 */
|
||||
if (ax > 0x7ff0000000000000)
|
||||
{
|
||||
/* x is NaN */
|
||||
*iptr = x;
|
||||
return x + x; /* Raise invalid if it is a signalling NaN */
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is infinity or large. Return zero with the sign of x */
|
||||
*iptr = x;
|
||||
PUT_BITS_DP64(ux & SIGNBIT_DP64, x);
|
||||
return x;
|
||||
}
|
||||
}
|
||||
else if (ax < 0x3ff0000000000000)
|
||||
{
|
||||
/* abs(x) < 1.0. Set iptr to zero with the sign of x
|
||||
and return x. */
|
||||
PUT_BITS_DP64(ux & SIGNBIT_DP64, *iptr);
|
||||
return x;
|
||||
}
|
||||
else
|
||||
{
|
||||
xexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
|
||||
/* Mask out the bits of x that we don't want */
|
||||
mask = 1;
|
||||
mask = (mask << (EXPSHIFTBITS_DP64 - xexp)) - 1;
|
||||
PUT_BITS_DP64(ux & ~mask, *iptr);
|
||||
return x - *iptr;
|
||||
}
|
||||
|
||||
}
|
70
sdk/lib/crt/math/libm_sse2/modff.c
Normal file
70
sdk/lib/crt/math/libm_sse2/modff.c
Normal file
|
@ -0,0 +1,70 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
float modff(float x, float *iptr)
|
||||
{
|
||||
/* modff splits the argument x into integer and fraction parts,
|
||||
each with the same sign as x. */
|
||||
|
||||
unsigned int ux, mask;
|
||||
int xexp;
|
||||
|
||||
GET_BITS_SP32(x, ux);
|
||||
xexp = ((ux & (~SIGNBIT_SP32)) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
|
||||
|
||||
if (xexp < 0)
|
||||
{
|
||||
/* abs(x) < 1.0. Set iptr to zero with the sign of x
|
||||
and return x. */
|
||||
PUT_BITS_SP32(ux & SIGNBIT_SP32, *iptr);
|
||||
return x;
|
||||
}
|
||||
else if (xexp < EXPSHIFTBITS_SP32)
|
||||
{
|
||||
/* x lies between 1.0 and 2**(24) */
|
||||
/* Mask out the bits of x that we don't want */
|
||||
mask = (1 << (EXPSHIFTBITS_SP32 - xexp)) - 1;
|
||||
PUT_BITS_SP32(ux & ~mask, *iptr);
|
||||
return x - *iptr;
|
||||
}
|
||||
else if ((ux & (~SIGNBIT_SP32)) > 0x7f800000)
|
||||
{
|
||||
/* x is NaN */
|
||||
*iptr = x;
|
||||
return x + x; /* Raise invalid if it is a signalling NaN */
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is infinity or large. Set iptr to x and return zero
|
||||
with the sign of x. */
|
||||
*iptr = x;
|
||||
PUT_BITS_SP32(ux & SIGNBIT_SP32, x);
|
||||
return x;
|
||||
}
|
||||
}
|
2411
sdk/lib/crt/math/libm_sse2/pow.asm
Normal file
2411
sdk/lib/crt/math/libm_sse2/pow.asm
Normal file
File diff suppressed because it is too large
Load diff
130
sdk/lib/crt/math/libm_sse2/pow_special.c
Normal file
130
sdk/lib/crt/math/libm_sse2/pow_special.c
Normal file
|
@ -0,0 +1,130 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <fpieee.h>
|
||||
#include <excpt.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "libm_new.h"
|
||||
|
||||
// these codes and the ones in the related .asm files have to match
|
||||
#define POW_X_ONE_Y_SNAN 1
|
||||
#define POW_X_ZERO_Z_INF 2
|
||||
#define POW_X_NAN 3
|
||||
#define POW_Y_NAN 4
|
||||
#define POW_X_NAN_Y_NAN 5
|
||||
#define POW_X_NEG_Y_NOTINT 6
|
||||
#define POW_Z_ZERO 7
|
||||
#define POW_Z_DENORMAL 8
|
||||
#define POW_Z_INF 9
|
||||
|
||||
/* Report a special-case powf result through _handle_errorf.
   x, y  : the original powf arguments, forwarded to the handler
   z     : the result chosen by the caller; always returned unchanged
   code  : one of the POW_* codes defined above; a code with no case
           below reports nothing and simply returns z */
float _powf_special(float x, float y, float z, U32 code)
{
    UT64 packed;

    /* Every reporting case passes the same value: z stored in element 0
       of the float view of an otherwise zeroed 64-bit union. */
    packed.u64 = 0;
    packed.f32[0] = z;

    switch(code)
    {
    case POW_X_ONE_Y_SNAN:
        _handle_errorf("powf", _FpCodePow, packed.u64, 0, AMD_F_INVALID, 0, x, y, 2);
        break;

    case POW_X_ZERO_Z_INF:
        _handle_errorf("powf", _FpCodePow, packed.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, y, 2);
        break;

    case POW_X_NAN:
    case POW_Y_NAN:
    case POW_X_NAN_Y_NAN:
    case POW_X_NEG_Y_NOTINT:
        _handle_errorf("powf", _FpCodePow, packed.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, y, 2);
        break;

    case POW_Z_ZERO:
        _handle_errorf("powf", _FpCodePow, packed.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, y, 2);
        break;

    case POW_Z_INF:
        _handle_errorf("powf", _FpCodePow, packed.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, y, 2);
        break;
    }

    return z;
}
|
||||
|
||||
/* Report a special-case pow result through _handle_error.
   x, y  : the original pow arguments, forwarded to the handler
   z     : the result chosen by the caller; always returned unchanged
   code  : one of the POW_* codes defined above; a code with no case
           below reports nothing and simply returns z */
double _pow_special(double x, double y, double z, U32 code)
{
    UT64 packed;

    /* Every reporting case passes the bit image of z. */
    packed.f64 = z;

    switch(code)
    {
    case POW_X_ZERO_Z_INF:
        _handle_error("pow", _FpCodePow, packed.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, y, 2);
        break;

    case POW_X_NAN:
    case POW_Y_NAN:
    case POW_X_NAN_Y_NAN:
    case POW_X_NEG_Y_NOTINT:
        _handle_error("pow", _FpCodePow, packed.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, y, 2);
        break;

    case POW_Z_ZERO:
    case POW_Z_DENORMAL:
        _handle_error("pow", _FpCodePow, packed.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, y, 2);
        break;

    case POW_Z_INF:
        _handle_error("pow", _FpCodePow, packed.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, y, 2);
        break;
    }

    return z;
}
|
319
sdk/lib/crt/math/libm_sse2/remainder.c
Normal file
319
sdk/lib/crt/math/libm_sse2/remainder.c
Normal file
|
@ -0,0 +1,319 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_SCALEDOUBLE_3
|
||||
#define USE_GET_FPSW_INLINE
|
||||
#define USE_SET_FPSW_INLINE
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_SCALEDOUBLE_3
|
||||
#undef USE_GET_FPSW_INLINE
|
||||
#undef USE_SET_FPSW_INLINE
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#if !defined(_CRTBLD_C9X)
|
||||
#define _CRTBLD_C9X
|
||||
#endif
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
/* Multiply x by y and deliver the product as the nearly double-length
   pair (z, zz): *z receives x * y as computed in double precision and
   *zz receives the correction term built from the split operands. */
static inline void dekker_mul12(double x, double y,
                                double *z, double *zz)
{
  double x_head, x_tail, y_head, y_tail;
  unsigned long bits;

  /* Split y: the head keeps the sign, exponent and upper mantissa bits
     (the low 27 bits of the bit pattern are cleared); the tail is
     whatever is left over. */
  GET_BITS_DP64(y, bits);
  bits &= 0xfffffffff8000000;
  PUT_BITS_DP64(bits, y_head);
  y_tail = y - y_head;

  /* Split x the same way. */
  GET_BITS_DP64(x, bits);
  bits &= 0xfffffffff8000000;
  PUT_BITS_DP64(bits, x_head);
  x_tail = x - x_head;

  *z = x * y;

  /* The grouping of this sum is deliberate; keep the evaluation order. */
  *zz = (((x_head * y_head - *z) + x_head * y_tail) + x_tail * y_head) + x_tail * y_tail;
}
|
||||
|
||||
#pragma function(fmod)
|
||||
#undef _FUNCNAME
|
||||
#if defined(COMPILING_FMOD)
|
||||
double fmod(double x, double y)
|
||||
#define _FUNCNAME "fmod"
|
||||
#define _OPERATION OP_FMOD
|
||||
#else
|
||||
double remainder(double x, double y)
|
||||
#define _FUNCNAME "remainder"
|
||||
#define _OPERATION OP_REM
|
||||
#endif
|
||||
{
|
||||
double dx, dy, scale, w, t, v, c, cc;
|
||||
int i, ntimes, xexp, yexp;
|
||||
unsigned long u, ux, uy, ax, ay, todd;
|
||||
unsigned int sw;
|
||||
|
||||
dx = x;
|
||||
dy = y;
|
||||
|
||||
|
||||
GET_BITS_DP64(dx, ux);
|
||||
GET_BITS_DP64(dy, uy);
|
||||
ax = ux & ~SIGNBIT_DP64;
|
||||
ay = uy & ~SIGNBIT_DP64;
|
||||
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
|
||||
if (xexp < 1 || xexp > BIASEDEMAX_DP64 ||
|
||||
yexp < 1 || yexp > BIASEDEMAX_DP64)
|
||||
{
|
||||
/* x or y is zero, denormalized, NaN or infinity */
|
||||
if (xexp > BIASEDEMAX_DP64)
|
||||
{
|
||||
/* x is NaN or infinity */
|
||||
if (ux & MANTBITS_DP64)
|
||||
{
|
||||
/* x is NaN */
|
||||
return _handle_error(_FUNCNAME, _OPERATION, ux|0x0008000000000000, _DOMAIN, 0,
|
||||
EDOM, x, y, 2);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is infinity; result is NaN */
|
||||
return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
}
|
||||
}
|
||||
else if (yexp > BIASEDEMAX_DP64)
|
||||
{
|
||||
/* y is NaN or infinity */
|
||||
if (uy & MANTBITS_DP64)
|
||||
{
|
||||
/* y is NaN */
|
||||
return _handle_error(_FUNCNAME, _OPERATION, uy|0x0008000000000000, _DOMAIN, 0,
|
||||
EDOM, x, y, 2);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef _CRTBLD_C9X
|
||||
/* C99 return for y = +-inf is x */
|
||||
return x;
|
||||
#else
|
||||
/* y is infinity; result is indefinite */
|
||||
return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if (ax == 0x0000000000000000)
|
||||
{
|
||||
/* x is zero */
|
||||
if (ay == 0x0000000000000000)
|
||||
{
|
||||
/* y is zero */
|
||||
return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
}
|
||||
else
|
||||
/* C99 return for x = 0 must preserve sign */
|
||||
return x;
|
||||
}
|
||||
else if (ay == 0x0000000000000000)
|
||||
{
|
||||
/* y is zero */
|
||||
return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
}
|
||||
|
||||
/* We've exhausted all other possibilities. One or both of x and
|
||||
y must be denormalized */
|
||||
if (xexp < 1)
|
||||
{
|
||||
/* x is denormalized. Figure out its exponent. */
|
||||
u = ax;
|
||||
while (u < IMPBIT_DP64)
|
||||
{
|
||||
xexp--;
|
||||
u <<= 1;
|
||||
}
|
||||
}
|
||||
if (yexp < 1)
|
||||
{
|
||||
/* y is denormalized. Figure out its exponent. */
|
||||
u = ay;
|
||||
while (u < IMPBIT_DP64)
|
||||
{
|
||||
yexp--;
|
||||
u <<= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (ax == ay)
|
||||
{
|
||||
/* abs(x) == abs(y); return zero with the sign of x */
|
||||
PUT_BITS_DP64(ux & SIGNBIT_DP64, dx);
|
||||
return dx;
|
||||
}
|
||||
|
||||
/* Set x = abs(x), y = abs(y) */
|
||||
PUT_BITS_DP64(ax, dx);
|
||||
PUT_BITS_DP64(ay, dy);
|
||||
|
||||
if (ax < ay)
|
||||
{
|
||||
/* abs(x) < abs(y) */
|
||||
#if !defined(COMPILING_FMOD)
|
||||
if (dx > 0.5*dy)
|
||||
dx -= dy;
|
||||
#endif
|
||||
return x < 0.0? -dx : dx;
|
||||
}
|
||||
|
||||
/* Save the current floating-point status word. We need
|
||||
to do this because the remainder function is always
|
||||
exact for finite arguments, but our algorithm causes
|
||||
the inexact flag to be raised. We therefore need to
|
||||
restore the entry status before exiting. */
|
||||
sw = get_fpsw_inline();
|
||||
|
||||
/* Set ntimes to the number of times we need to do a
|
||||
partial remainder. If the exponent of x is an exact multiple
|
||||
of 52 larger than the exponent of y, and the mantissa of x is
|
||||
less than the mantissa of y, ntimes will be one too large
|
||||
but it doesn't matter - it just means that we'll go round
|
||||
the loop below one extra time. */
|
||||
if (xexp <= yexp)
|
||||
ntimes = 0;
|
||||
else
|
||||
ntimes = (xexp - yexp) / 52;
|
||||
|
||||
if (ntimes == 0)
|
||||
{
|
||||
w = dy;
|
||||
scale = 1.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Set w = y * 2^(52*ntimes) */
|
||||
w = scaleDouble_3(dy, ntimes * 52);
|
||||
|
||||
/* Set scale = 2^(-52) */
|
||||
PUT_BITS_DP64((unsigned long)(-52 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64,
|
||||
scale);
|
||||
}
|
||||
|
||||
|
||||
/* Each time round the loop we compute a partial remainder.
|
||||
This is done by subtracting a large multiple of w
|
||||
from x each time, where w is a scaled up version of y.
|
||||
The subtraction must be performed exactly in quad
|
||||
precision, though the result at each stage can
|
||||
fit exactly in a double precision number. */
|
||||
for (i = 0; i < ntimes; i++)
|
||||
{
|
||||
/* t is the integer multiple of w that we will subtract.
|
||||
We use a truncated value for t.
|
||||
|
||||
N.B. w has been chosen so that the integer t will have
|
||||
at most 52 significant bits. This is the amount by
|
||||
which the exponent of the partial remainder dx gets reduced
|
||||
every time around the loop. In theory we could use
|
||||
53 bits in t, but the quad precision multiplication
|
||||
routine dekker_mul12 does not allow us to do that because
|
||||
it loses the last (106th) bit of its quad precision result. */
|
||||
|
||||
/* Set dx = dx - w * t, where t is equal to trunc(dx/w). */
|
||||
t = (double)(long)(dx / w);
|
||||
/* At this point, t may be one too large due to
|
||||
rounding of dx/w */
|
||||
|
||||
/* Compute w * t in quad precision */
|
||||
dekker_mul12(w, t, &c, &cc);
|
||||
|
||||
/* Subtract w * t from dx */
|
||||
v = dx - c;
|
||||
dx = v + (((dx - v) - c) - cc);
|
||||
|
||||
/* If t was one too large, dx will be negative. Add back
|
||||
one w */
|
||||
/* It might be possible to speed up this loop by finding
|
||||
a way to compute correctly truncated t directly from dx and w.
|
||||
We would then avoid the need for this check on negative dx. */
|
||||
if (dx < 0.0)
|
||||
dx += w;
|
||||
|
||||
/* Scale w down by 2^(-52) for the next iteration */
|
||||
w *= scale;
|
||||
}
|
||||
|
||||
/* One more time */
|
||||
/* Variable todd says whether the integer t is odd or not */
|
||||
t = (double)(long)(dx / w);
|
||||
todd = ((long)(dx / w)) & 1;
|
||||
dekker_mul12(w, t, &c, &cc);
|
||||
v = dx - c;
|
||||
dx = v + (((dx - v) - c) - cc);
|
||||
if (dx < 0.0)
|
||||
{
|
||||
todd = !todd;
|
||||
dx += w;
|
||||
}
|
||||
|
||||
/* At this point, dx lies in the range [0,dy) */
|
||||
#if !defined(COMPILING_FMOD)
|
||||
/* For the fmod function, we're done apart from setting
|
||||
the correct sign. */
|
||||
/* For the remainder function, we need to adjust dx
|
||||
so that it lies in the range (-y/2, y/2] by carefully
|
||||
subtracting w (== dy == y) if necessary. The rigmarole
|
||||
with todd is to get the correct sign of the result
|
||||
when x/y lies exactly half way between two integers,
|
||||
when we need to choose the even integer. */
|
||||
if (ay < 0x7fd0000000000000)
|
||||
{
|
||||
if (dx + dx > w || (todd && (dx + dx == w)))
|
||||
dx -= w;
|
||||
}
|
||||
else if (dx > 0.5 * w || (todd && (dx == 0.5 * w)))
|
||||
dx -= w;
|
||||
|
||||
#endif
|
||||
|
||||
/* **** N.B. for some reason this breaks the 32 bit version
|
||||
of remainder when compiling with optimization. */
|
||||
/* Restore the entry status flags */
|
||||
set_fpsw_inline(sw);
|
||||
|
||||
/* Set the result sign according to input argument x */
|
||||
return x < 0.0? -dx : dx;
|
||||
|
||||
}
|
251
sdk/lib/crt/math/libm_sse2/remainder_piby2.c
Normal file
251
sdk/lib/crt/math/libm_sse2/remainder_piby2.c
Normal file
|
@ -0,0 +1,251 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
|
||||
/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
|
||||
extra precision, and return the result in r, rr.
|
||||
Return value "region" tells how many lots of pi/2 were subtracted
|
||||
from x to put it in the range [-pi/4,pi/4], mod 4. */
|
||||
void __remainder_piby2(double x, double *r, double *rr, int *region)
|
||||
{
|
||||
/* This method simulates multi-precision floating-point
|
||||
arithmetic and is accurate for all 1 <= x < infinity */
|
||||
static const double
|
||||
piby2_lead = 1.57079632679489655800e+00, /* 0x3ff921fb54442d18 */
|
||||
piby2_part1 = 1.57079631090164184570e+00, /* 0x3ff921fb50000000 */
|
||||
piby2_part2 = 1.58932547122958567343e-08, /* 0x3e5110b460000000 */
|
||||
piby2_part3 = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */
|
||||
const int bitsper = 10;
|
||||
unsigned long res[500];
|
||||
unsigned long ux, u, carry, mask, mant, highbitsrr;
|
||||
int first, last, i, rexp, xexp, resexp, ltb, determ;
|
||||
double xx, t;
|
||||
static unsigned long pibits[] =
|
||||
{
|
||||
0, 0, 0, 0, 0, 0,
|
||||
162, 998, 54, 915, 580, 84, 671, 777, 855, 839,
|
||||
851, 311, 448, 877, 553, 358, 316, 270, 260, 127,
|
||||
593, 398, 701, 942, 965, 390, 882, 283, 570, 265,
|
||||
221, 184, 6, 292, 750, 642, 465, 584, 463, 903,
|
||||
491, 114, 786, 617, 830, 930, 35, 381, 302, 749,
|
||||
72, 314, 412, 448, 619, 279, 894, 260, 921, 117,
|
||||
569, 525, 307, 637, 156, 529, 504, 751, 505, 160,
|
||||
945, 1022, 151, 1023, 480, 358, 15, 956, 753, 98,
|
||||
858, 41, 721, 987, 310, 507, 242, 498, 777, 733,
|
||||
244, 399, 870, 633, 510, 651, 373, 158, 940, 506,
|
||||
997, 965, 947, 833, 825, 990, 165, 164, 746, 431,
|
||||
949, 1004, 287, 565, 464, 533, 515, 193, 111, 798
|
||||
};
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
|
||||
|
||||
xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
|
||||
ux = (ux & MANTBITS_DP64) | IMPBIT_DP64;
|
||||
|
||||
/* Now ux is the mantissa bit pattern of x as a long integer */
|
||||
carry = 0;
|
||||
mask = 1;
|
||||
mask = (mask << bitsper) - 1;
|
||||
|
||||
/* Set first and last to the positions of the first
|
||||
and last chunks of 2/pi that we need */
|
||||
first = xexp / bitsper;
|
||||
resexp = xexp - first * bitsper;
|
||||
/* 180 is the theoretical maximum number of bits (actually
|
||||
175 for IEEE double precision) that we need to extract
|
||||
from the middle of 2/pi to compute the reduced argument
|
||||
accurately enough for our purposes */
|
||||
last = first + 180 / bitsper;
|
||||
|
||||
/* Do a long multiplication of the bits of 2/pi by the
|
||||
integer mantissa */
|
||||
#if 0
|
||||
for (i = last; i >= first; i--)
|
||||
{
|
||||
u = pibits[i] * ux + carry;
|
||||
res[i - first] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
}
|
||||
res[last - first + 1] = 0;
|
||||
#else
|
||||
/* Unroll the loop. This is only correct because we know
|
||||
that bitsper is fixed as 10. */
|
||||
res[19] = 0;
|
||||
u = pibits[last] * ux;
|
||||
res[18] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-1] * ux + carry;
|
||||
res[17] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-2] * ux + carry;
|
||||
res[16] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-3] * ux + carry;
|
||||
res[15] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-4] * ux + carry;
|
||||
res[14] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-5] * ux + carry;
|
||||
res[13] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-6] * ux + carry;
|
||||
res[12] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-7] * ux + carry;
|
||||
res[11] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-8] * ux + carry;
|
||||
res[10] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-9] * ux + carry;
|
||||
res[9] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-10] * ux + carry;
|
||||
res[8] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-11] * ux + carry;
|
||||
res[7] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-12] * ux + carry;
|
||||
res[6] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-13] * ux + carry;
|
||||
res[5] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-14] * ux + carry;
|
||||
res[4] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-15] * ux + carry;
|
||||
res[3] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-16] * ux + carry;
|
||||
res[2] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-17] * ux + carry;
|
||||
res[1] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last-18] * ux + carry;
|
||||
res[0] = u & mask;
|
||||
#endif
|
||||
|
||||
|
||||
/* Reconstruct the result */
|
||||
ltb = (int)((((res[0] << bitsper) | res[1])
|
||||
>> (bitsper - 1 - resexp)) & 7);
|
||||
|
||||
/* determ says whether the fractional part is >= 0.5 */
|
||||
determ = ltb & 1;
|
||||
|
||||
|
||||
i = 1;
|
||||
if (determ)
|
||||
{
|
||||
/* The mantissa is >= 0.5. We want to subtract it
|
||||
from 1.0 by negating all the bits */
|
||||
*region = ((ltb >> 1) + 1) & 3;
|
||||
mant = 1;
|
||||
mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
|
||||
while (mant < 0x0020000000000000)
|
||||
{
|
||||
i++;
|
||||
mant = (mant << bitsper) | (~(res[i]) & mask);
|
||||
}
|
||||
highbitsrr = ~(res[i + 1]) << (64 - bitsper);
|
||||
}
|
||||
else
|
||||
{
|
||||
*region = (ltb >> 1);
|
||||
mant = 1;
|
||||
mant = res[1] & ((mant << (bitsper - resexp)) - 1);
|
||||
while (mant < 0x0020000000000000)
|
||||
{
|
||||
i++;
|
||||
mant = (mant << bitsper) | res[i];
|
||||
}
|
||||
highbitsrr = res[i + 1] << (64 - bitsper);
|
||||
}
|
||||
|
||||
rexp = 52 + resexp - i * bitsper;
|
||||
|
||||
while (mant >= 0x0020000000000000)
|
||||
{
|
||||
rexp++;
|
||||
highbitsrr = (highbitsrr >> 1) | ((mant & 1) << 63);
|
||||
mant >>= 1;
|
||||
}
|
||||
|
||||
|
||||
/* Put the result exponent rexp onto the mantissa pattern */
|
||||
u = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
|
||||
ux = (mant & MANTBITS_DP64) | u;
|
||||
if (determ)
|
||||
/* If we negated the mantissa we negate x too */
|
||||
ux |= SIGNBIT_DP64;
|
||||
PUT_BITS_DP64(ux, x);
|
||||
|
||||
/* Create the bit pattern for rr */
|
||||
highbitsrr >>= 12; /* Note this is shifted one place too far */
|
||||
u = ((unsigned long)rexp + EXPBIAS_DP64 - 53) << EXPSHIFTBITS_DP64;
|
||||
PUT_BITS_DP64(u, t);
|
||||
u |= highbitsrr;
|
||||
PUT_BITS_DP64(u, xx);
|
||||
|
||||
/* Subtract the implicit bit we accidentally added */
|
||||
xx -= t;
|
||||
/* Set the correct sign, and double to account for the
|
||||
"one place too far" shift */
|
||||
if (determ)
|
||||
xx *= -2.0;
|
||||
else
|
||||
xx *= 2.0;
|
||||
|
||||
|
||||
/* (x,xx) is an extra-precise version of the fractional part of
|
||||
x * 2 / pi. Multiply (x,xx) by pi/2 in extra precision
|
||||
to get the reduced argument (r,rr). */
|
||||
{
|
||||
double hx, tx, c, cc;
|
||||
/* Split x into hx (head) and tx (tail) */
|
||||
GET_BITS_DP64(x, ux);
|
||||
ux &= 0xfffffffff8000000;
|
||||
PUT_BITS_DP64(ux, hx);
|
||||
tx = x - hx;
|
||||
|
||||
c = piby2_lead * x;
|
||||
cc = ((((piby2_part1 * hx - c) + piby2_part1 * tx) +
|
||||
piby2_part2 * hx) + piby2_part2 * tx) +
|
||||
(piby2_lead * xx + piby2_part3 * x);
|
||||
*r = c + cc;
|
||||
*rr = (c - *r) + cc;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
415
sdk/lib/crt/math/libm_sse2/remainder_piby2_forAsm.asm
Normal file
415
sdk/lib/crt/math/libm_sse2/remainder_piby2_forAsm.asm
Normal file
|
@ -0,0 +1,415 @@
|
|||
;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the remainder by pi/2 function
|
||||
; This is a service routine for use by trig functions coded in asm
|
||||
;
|
||||
; On input,
|
||||
; xmm0 = x;
|
||||
; On output
|
||||
; xmm0 = r
|
||||
; xmm1 = rr
|
||||
; rax = region
|
||||
|
||||
; Read-only data for the two routines in this file
; (__remainder_piby2_forAsm and __remainder_piby2_cw_forAsm).
.const
|
||||
ALIGN 16
|
||||
; Low qword = piby2_lead, high qword = piby2_part3, so a single mulpd
; against (x,x) yields piby2_lead*x and piby2_part3*x together.
L__piby2_part3_piby2_lead DQ 03ff921fb54442d18h, 03c91a62633145c06h
|
||||
; piby2_part1 and piby2_part2, duplicated in both lanes for the packed
; multiplies against (hx,tx).
L__piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h
|
||||
L__piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h
|
||||
;; constants for CW reduction
|
||||
; L_piby2_N is the N-th piece of pi/2 and L_piby2_Ntail the remainder after
; it; L_piby2_2 and L_piby2_3 are the preceding tail with the low bits cleared.
L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h
|
||||
L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h
|
||||
L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h
|
||||
L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h
|
||||
L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h
|
||||
L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h
|
||||
; 2/pi and 0.5, used to compute npi2 = (int)(|x|*2/pi + 0.5).
L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h
|
||||
L_point_five DQ 03FE0000000000000h, 03FE0000000000000h
|
||||
; Integer mask 3 (region = npi2 & 3).
L_int_three DQ 00000000000000003h, 00000000000000003h
|
||||
; Exponent-field mask and sign-bit mask of an IEEE double.
L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h
|
||||
L_signbit DQ 08000000000000000h, 08000000000000000h
|
||||
; L_int_1 is not referenced by the code in this file.
L_int_1 DQ 00000000000000001h, 00000000000000001h
|
||||
; Thresholds (as integer bit patterns) for the exponent difference between
; |x| and the reduced value in the CW routine.
L_int_15 DQ 0000000000000000Fh
|
||||
L_int_48 DQ 00000000000000030h
|
||||
; Bit patterns of 3pi/4, 5pi/4, 7pi/4 and 9pi/4; the CW routine compares
; them as integers against the bit pattern of |x| held in r9.
L_3pio4 DQ 04002D97C7F3321D2h
|
||||
L_5pio4 DQ 0400F6A7A2955385Eh
|
||||
L_7pio4 DQ 04015FDBBE9BBA775h
|
||||
L_9pio4 DQ 0401c463abeccb2bbh
|
||||
ALIGN 16
|
||||
; Binary expansion of 2/pi as 158 bytes, least significant byte first (the
; leading digits of 2/pi are in the last non-zero bytes; the final bytes are
; zero padding).  __remainder_piby2_forAsm reads a 64-bit and a 128-bit
; window starting at byte offset 134 - (xexp >> 3).
L__2_by_pi_bits DB 224, 241, 27, 193, 12, 88, 33, 116
|
||||
DB 53, 126, 196, 126, 237, 175, 169, 75
|
||||
DB 74, 41, 222, 231, 28, 244, 236, 197
|
||||
DB 151, 175, 31, 235, 158, 212, 181, 168
|
||||
DB 127, 121, 154, 253, 24, 61, 221, 38
|
||||
DB 44, 159, 60, 251, 217, 180, 125, 180
|
||||
DB 41, 104, 45, 70, 188, 188, 63, 96
|
||||
DB 22, 120, 255, 95, 226, 127, 236, 160
|
||||
DB 228, 247, 46, 126, 17, 114, 210, 231
|
||||
DB 76, 13, 230, 88, 71, 230, 4, 249
|
||||
DB 125, 209, 154, 192, 113, 166, 19, 18
|
||||
DB 237, 186, 212, 215, 8, 162, 251, 156
|
||||
DB 166, 196, 114, 172, 119, 248, 115, 72
|
||||
DB 70, 39, 168, 187, 36, 25, 128, 75
|
||||
DB 55, 9, 233, 184, 145, 220, 134, 21
|
||||
DB 239, 122, 175, 142, 69, 249, 7, 65
|
||||
DB 14, 241, 100, 86, 138, 109, 3, 119
|
||||
DB 211, 212, 71, 95, 157, 240, 167, 84
|
||||
DB 16, 57, 185, 13, 230, 139, 2, 0
|
||||
DB 0, 0, 0, 0, 0, 0
|
||||
|
||||
|
||||
; local storage offsets
|
||||
; region: offset from rsp of the 8-byte slot where fname parks the region
; value while rax is reused as a scratch register.
region EQU 000h
|
||||
; stack_size: bytes reserved by fname's StackAllocate.
stack_size EQU 018h
|
||||
sstack_size EQU 000h ; no stack for fsname
|
||||
|
||||
; fm.inc supplies the StackAllocate / StackDeallocate macros used below.
include fm.inc
|
||||
|
||||
; Exported names of the two procedures defined in this file.
fname TEXTEQU <__remainder_piby2_forAsm>
|
||||
fsname TEXTEQU <__remainder_piby2_cw_forAsm>
|
||||
|
||||
|
||||
.code
|
||||
|
||||
;-----------------------------------------------------------------------
; fname = __remainder_piby2_forAsm
;
; Remainder of x modulo pi/2.  The 53-bit mantissa of x is multiplied by a
; 192-bit window of 2/pi taken from L__2_by_pi_bits; the fractional part of
; that product, as an extra-precise pair (x,xx), is then multiplied by pi/2
; in extra precision to give the reduced argument (r,rr).
;
; In:    xmm0 (low qword) = |x|.  The exponent is extracted without masking
;        the sign, so the sign bit must be clear.
; Out:   xmm0 = r
;        xmm1 = rr
;        rax  = region (0..3)
; Uses:  rax, rcx, rdx, r8-r11, xmm0-xmm5, flags, and the 8-byte stack slot
;        at [region+rsp].  rdx, r8 and r9 are NOT used as pointers; all
;        results come back in registers.
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
        StackAllocate stack_size
        .ENDPROLOG

        lea     r9,L__2_by_pi_bits      ; r9 = base of the 2/pi byte table

        ; xexp = (x >> 52) - 1023
        movd    r11,xmm0                ; r11 = bit pattern of x
        mov     rcx,r11                 ; rcx = copy, turned into the mantissa below
        shr     r11,52
        sub     r11,1023                ; r11 = xexp = unbiased exponent of x

        ; byte offset at which the multiplication starts:
        ; last = 134 - (xexp >> 3)
        mov     r10,r11
        shr     r10,3
        sub     r10,134                 ; r10 = -last
        neg     r10                     ; r10 = last

        mov     rax,[r9 + r10]          ; least significant 64 bits of the window

        ; mantissa of x = ((x << 12) >> 12) | implied bit
        shl     rcx,12
        shr     rcx,12                  ; rcx = stored mantissa bits
        bts     rcx,52                  ; add the implied leading one

        add     r10,8                   ; advance to the next 8 bytes of 2/pi
        movdqu  xmm0,[r9 + r10]         ; upper 128 bits of the window

        ; three 64x64 multiplies by the mantissa, propagating carries upward
        mul     rcx
        mov     r8,rax                  ; r8  = res1[2], least significant word
        mov     r10,rdx                 ; r10 = carry
        movd    rax,xmm0
        mul     rcx
        and     r11,7                   ; r11 = resexp = xexp & 7
        psrldq  xmm0,8                  ; bring the top 64 bits of the window down
        add     rax,r10                 ; add the previous carry
        adc     rdx,0
        mov     r9,rax                  ; r9  = res1[1]
        mov     r10,rdx                 ; r10 = carry
        movd    rax,xmm0
        mul     rcx
        add     r10,rax                 ; r10 = res1[0], most significant word

        ; find the region
        ;   ltb        = most significant bits >> (54 - resexp)
        ;   point_five = last bit shifted out (left in CF by the shr)
        ;   region     = (ltb + point_five) & 3
        mov     ecx,54
        mov     rax,r10
        sub     rcx,r11                 ; cl = 54 - resexp (never zero, so shr sets CF)
        xor     edx,edx                 ; rdx = sign to apply to the result (0 = positive)
        shr     rax,cl                  ; CF = "fraction >= 0.5" bit
        jnc     L__no_point_five
        ; Fraction >= 0.5: negate the product by inverting every bit and make
        ; the result negative.  Neither NOT nor MOV touches the flags, so CF
        ; is still set when the adc below executes.
        not     r10
        not     r9
        not     r8
        mov     rdx,08000000000000000h

        ALIGN 16
L__no_point_five:
        adc     rax,0                   ; round the quadrant count up when CF was set
        and     rax,3                   ; rax = region
        mov     QWORD PTR [region+rsp],rax

        ; calculate the number of integer bits and zero them out
        mov     rcx,r11
        add     rcx,10                  ; rcx = number of integer bits
        shl     r10,cl
        shr     r10,cl                  ; r10 = fraction bits only
        sub     rcx,64                  ; start forming the exponent
        mov     r11,rcx

        ; find the highest set bit
        bsr     rcx,r10
        jnz     L__form_mantissa
        ; top word is all zero: shift the whole value up by one 64-bit word
        mov     r10,r9
        mov     r9,r8
        xor     r8d,r8d                 ; flags are rewritten by the bsr below
        bsr     rcx,r10                 ; rcx = hsb
        sub     r11,64

        ALIGN 16
L__form_mantissa:
        add     r11,rcx                 ; r11 = unbiased exponent of the new x
        sub     rcx,52                  ; rcx = distance of hsb from bit 52
        test    rcx,rcx
        jl      L__hsb_below_52
        je      L__form_numbers
        ; hsb above 52: shift right, moving the bits that fall out of r10
        ; into the top of r9
        mov     r8,r10                  ; previous contents of r8 not required
        shr     r10,cl                  ; r10 = mantissa of x with hsb at bit 52
        shr     r9,cl                   ; make space for the bits from r10
        sub     rcx,64
        neg     rcx                     ; rcx = 64 - shift
        shl     r8,cl
        or      r9,r8                   ; r9 = mantissa bits of xx
        jmp     L__form_numbers

        ALIGN 16
L__hsb_below_52:
        ; hsb below 52: shift left, pulling bits up from r9 into r10 and from
        ; r8 into r9
        neg     rcx                     ; rcx = left shift count
        mov     rax,r9
        shl     r10,cl
        shl     r9,cl
        sub     rcx,64
        neg     rcx                     ; rcx = 64 - shift
        shr     rax,cl
        or      r10,rax
        shr     r8,cl
        or      r9,r8

        ALIGN 16
L__form_numbers:
        ; here r11 = unbiased exponent, r10 = mantissa with the implicit bit
        ; at bit 52, rdx = sign bit
        add     r11,1023                ; r11 = biased exponent
        btr     r10,52                  ; remove the implicit bit
        mov     rcx,r11
        or      r10,rdx                 ; put the sign
        shl     rcx,52
        or      r10,rcx                 ; r10 = bit pattern of x

        movd    xmm0,r10                ; xmm0 = x
        movdqa  xmm1,xmm0
        psrlq   xmm1,27
        psllq   xmm1,27                 ; xmm1 = hx (x with the low 27 bits cleared)
        movdqa  xmm2,xmm0
        subsd   xmm2,xmm1               ; xmm2 = tx = x - hx
        movlhps xmm0,xmm0               ; xmm0 = x,x
        movlhps xmm2,xmm1               ; xmm2 = hx,tx (high,low)

        movdqa  xmm1,XMMWORD PTR L__piby2_part3_piby2_lead
        movdqa  xmm3,XMMWORD PTR L__piby2_part1
        movdqa  xmm4,XMMWORD PTR L__piby2_part2

        ; form xx from the remaining product bits in r9
        xor     ecx,ecx                 ; gives rcx a known value should r9 be zero
                                        ; (bsr does not define its result then)
        bsr     rcx,r9
        sub     rcx,64                  ; shift the implicit bit out as well
        neg     rcx
        shl     r9,cl
        shr     r9,12
        add     rcx,52
        sub     r11,rcx                 ; r11 = biased exponent of xx
        shl     r11,52
        or      r9,rdx                  ; same sign as x
        or      r9,r11
        movd    xmm5,r9                 ; xmm5 = xx

        mulpd   xmm0,xmm1               ; xmm0 = piby2_part3*x, piby2_lead*x (= c)
        mulpd   xmm5,xmm1               ; xmm5 low = piby2_lead*xx
        mulpd   xmm3,xmm2               ; xmm3 = piby2_part1*hx, piby2_part1*tx
        mulpd   xmm4,xmm2               ; xmm4 = piby2_part2*hx, piby2_part2*tx

        ; cc = (piby2_part1*hx - c) + (piby2_part1*tx) +
        ;      (piby2_part2*hx) + (piby2_part2*tx) +
        ;      (piby2_lead*xx + piby2_part3*x)
        movhlps xmm1,xmm3               ; xmm1 = piby2_part1*hx
        movhlps xmm2,xmm4               ; xmm2 = piby2_part2*hx
        subsd   xmm1,xmm0               ; xmm1 = piby2_part1*hx - c
        addsd   xmm1,xmm3               ;      + piby2_part1*tx
        movhlps xmm3,xmm0               ; xmm3 = piby2_part3*x
        addsd   xmm1,xmm2               ;      + piby2_part2*hx
        addsd   xmm3,xmm5               ; xmm3 = piby2_lead*xx + piby2_part3*x
        addsd   xmm1,xmm4               ;      + piby2_part2*tx
        addsd   xmm1,xmm3               ; xmm1 = cc

        ; r  = c + cc
        ; rr = (c - r) + cc
        movdqa  xmm2,xmm0               ; xmm2 = copy of c
        addsd   xmm0,xmm1               ; xmm0 = r
        subsd   xmm2,xmm0               ; xmm2 = c - r
        addsd   xmm1,xmm2               ; xmm1 = rr
        mov     rax,QWORD PTR [region+rsp] ; rax = region

        StackDeallocate stack_size
        ret

fname endp
|
||||
|
||||
; NOTE: If this is not going to be used, should probably remove it. - WAT
ALIGN 16
;-----------------------------------------------------------------------
; fsname = __remainder_piby2_cw_forAsm
;
; Reduction of moderate |x| by subtracting npi2 pieces of pi/2 (the "CW"
; constants above), using up to three successively finer pieces when the
; result loses many bits to cancellation.
;
; In:    xmm0 (low qword) = |x|
;        r9               = the same |x| as an integer bit pattern
;        ASSUMPTION: if we call this function, |x| > pi/4
; Out:   xmm0 = r
;        xmm1 = rr   (r + rr is the reduced argument)
;        rax  = region = npi2 & 3  (xmm2 low qword holds the same value)
; Uses:  rax, r8, xmm0-xmm5, flags
;
; Changes relative to the imported code:
;   * region is masked with 3 on the staged (|x| <= 9pi/4) path too; it used
;     to come back as 4 for 7pi/4 < |x| <= 9pi/4, unlike the full path.
;   * the exponent-difference tests are integer compares.  The old comisd
;     compared integer bit patterns as denormal doubles, which all compare
;     equal when MXCSR.DAZ is set (and raises the denormal flag otherwise).
;-----------------------------------------------------------------------
PUBLIC fsname
fsname PROC FRAME
        StackAllocate sstack_size
        .ENDPROLOG

        ; Small arguments: pick npi2 in 1..4 by comparing the bit pattern of
        ; |x| against 3pi/4, 5pi/4, 7pi/4 and 9pi/4.
        xor     r8d,r8d
        cmp     r9,QWORD PTR L_5pio4
        ja      Lax_gt_5pio4
        cmp     r9,QWORD PTR L_3pio4
        seta    r8b
        inc     r8d                     ; r8d = 1 or 2
        jmp     Lstage_npi2
Lax_gt_5pio4:
        cmp     r9,QWORD PTR L_9pio4
        ja      Lnpi2_full_computation
        cmp     r9,QWORD PTR L_7pio4
        seta    r8b
        add     r8d,3                   ; r8d = 3 or 4
Lstage_npi2:
        movd    xmm2,r8d
        cvtdq2pd xmm4,xmm2              ; xmm4 = (double)npi2
        andpd   xmm2,L_int_three        ; xmm2 = region = npi2 & 3
        jmp     Lnpi2_known

Lnpi2_full_computation:
        ; npi2 = (int)(|x| * 2/pi + 0.5)
        movapd  xmm5,xmm0
        mulsd   xmm5,L_twobypi
        addsd   xmm5,L_point_five       ; xmm5 = |x|*2/pi + .5

        cvttpd2dq xmm5,xmm5             ; xmm5 <-- npi2 = integer part
        movapd  xmm2,xmm5
        andpd   xmm2,L_int_three        ; xmm2 = region = npi2 & 3
        cvtdq2pd xmm4,xmm5              ; xmm4 = (double)npi2

Lnpi2_known:
        movapd  xmm5,xmm4
        mulsd   xmm5,QWORD PTR L_piby2_1 ; xmm5 = npi2*piby2_1
        xorpd   xmm5,L_signbit          ; xmm5 = -npi2*piby2_1
        addpd   xmm5,xmm0               ; xmm5 = rhead = x - npi2*piby2_1
        movapd  xmm3,xmm4
        mulsd   xmm3,QWORD PTR L_piby2_1tail ; xmm3 = rtail = npi2*piby2_1tail

        ; If x is nearly a multiple of pi/2, rhead will be small compared to
        ; |x|.  We check this by checking the exponent difference.
        ; Note that both the unbiased exponents are positive, and that of
        ; rhead must be <= that of |x|.
        movapd  xmm1,xmm5               ; xmm1 = rhead
        subpd   xmm1,xmm3               ; xmm1 = r = rhead - rtail
        andpd   xmm1,L_inf_mask_64      ; keep only the exponent field of r
        psubq   xmm0,xmm1               ; integer subtract of the bit patterns
        psrlq   xmm0,52                 ; xmm0 = expdiff
        movd    rax,xmm0                ; rax = expdiff; rax is otherwise unused
                                        ; until the final region move
        cmp     rax,QWORD PTR L_int_15
        jbe     Lcw_get_r_rr

        ; here expdiff > 15, so x is nearly a multiple of pi/2 and things are
        ; hard; we use another piece of pi/2 in the reduction
        movapd  xmm1,xmm5               ; xmm1 = t = rhead
        movapd  xmm3,xmm4
        mulsd   xmm3,QWORD PTR L_piby2_2 ; xmm3 = rtail = npi2*piby2_2
        subsd   xmm5,xmm3               ; xmm5 = rhead = t - rtail

        ; now rtail = npi2*piby2_2tail - ((t - rhead) - rtail)
        subsd   xmm1,xmm5
        subsd   xmm1,xmm3
        movapd  xmm3,xmm4
        mulsd   xmm3,QWORD PTR L_piby2_2tail
        subsd   xmm3,xmm1               ; xmm3 = rtail

        cmp     rax,QWORD PTR L_int_48
        jbe     Lcw_get_r_rr

        ; here expdiff > 48, so x is REALLY close to a multiple of pi/2
        ; and we use yet another piece of pi/2 in the reduction
        movapd  xmm0,xmm5               ; xmm0 = t = rhead
        movapd  xmm3,xmm4
        mulsd   xmm3,QWORD PTR L_piby2_3 ; xmm3 = rtail = npi2*piby2_3
        movapd  xmm5,xmm0
        subsd   xmm5,xmm3               ; xmm5 = rhead = t - rtail

        ; now rtail = npi2*piby2_3tail - ((t - rhead) - rtail)
        movapd  xmm1,xmm0
        subsd   xmm1,xmm5
        subsd   xmm1,xmm3
        movapd  xmm3,xmm4
        mulsd   xmm3,QWORD PTR L_piby2_3tail
        subsd   xmm3,xmm1               ; xmm3 = rtail

Lcw_get_r_rr:
        ; We have a satisfactory rhead in xmm5 and rtail in xmm3.
        ; We now produce r in xmm0 and rr in xmm1, where the actual reduced
        ; argument is the sum of r and rr, and rr is insignificant with
        ; respect to r under addition (i.e., r + rr == r).
        movapd  xmm0,xmm5               ; xmm0 = rhead
        subsd   xmm0,xmm3               ; xmm0 = r = rhead - rtail
        movapd  xmm1,xmm5               ; xmm1 = rhead
        subsd   xmm1,xmm0               ; xmm1 = rhead - r
        subsd   xmm1,xmm3               ; xmm1 = rr = (rhead - r) - rtail
        movd    rax,xmm2                ; rax = region
        StackDeallocate sstack_size
        ret
fsname endp
|
||||
|
||||
END
|
283
sdk/lib/crt/math/libm_sse2/remainder_piby2_forFMA3.asm
Normal file
283
sdk/lib/crt/math/libm_sse2/remainder_piby2_forFMA3.asm
Normal file
|
@ -0,0 +1,283 @@
|
|||
;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the remainder by pi/2 function using fma3
|
||||
; This is a service routine for use by trig functions coded in asm that use fma3
|
||||
;
|
||||
; On input,
|
||||
; xmm0 = x;
|
||||
; On output
|
||||
; xmm0 = r
|
||||
; xmm1 = rr
|
||||
; rax = region
|
||||
|
||||
; Read-only data for __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl.
; Every constant is duplicated into both 64-bit lanes.
.const
|
||||
ALIGN 16
|
||||
; pi/2 split as lead + (part1, part2, part3), plus the mask that clears the
; low 27 bits of a double to form its head hx.
L_piby2_lead DQ 03ff921fb54442d18h, 03ff921fb54442d18h
|
||||
L_fff800 DQ 0fffffffff8000000h, 0fffffffff8000000h
|
||||
L_piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h
|
||||
L_piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h
|
||||
L_piby2_part3 DQ 03c91a62633145c06h, 03c91a62633145c06h
|
||||
; The constants from L_piby2_1 through L_signbit are not referenced by the
; two routines defined in this file.
L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h
|
||||
L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h
|
||||
L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h
|
||||
L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h
|
||||
L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h
|
||||
L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h
|
||||
L_sign_mask DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh
|
||||
L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h
|
||||
L_point_five DQ 03FE0000000000000h, 03FE0000000000000h
|
||||
L_int_three DQ 00000000000000003h, 00000000000000003h
|
||||
L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h
|
||||
L_signbit DQ 08000000000000000h, 08000000000000000h
|
||||
;; constants for BDL reduction
|
||||
L_r DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h ; 2/pi
|
||||
L_xc1 DQ 03FF921FB54442D18H, 03FF921FB54442D18h ; pi/2 (L_piby2_lead)
|
||||
L_xc2 DQ 03C91A62633145C00H, 03C91A62633145C00h ; pi/2 part 2
|
||||
L_xc3 DQ 0397B839A252049C0H, 0397B839A252049C0h ; pi/2 part 3
|
||||
; sigma is 3*2^(p-n-2) where n is 0 and p is 53.
|
||||
; Adding and then subtracting it leaves arg*r rounded to an integer (z).
L_sigma DQ 04338000000000000h, 04338000000000000h ; 6755399441055744.
|
||||
|
||||
; The 2/pi bit table is defined in another module and imported here.
EXTRN __L_2_by_pi_bits:BYTE
|
||||
|
||||
; region: offset from rsp of the 8-byte slot where fname parks the region
; value; stack_size: bytes reserved by fname's StackAllocate.
region EQU 020h
|
||||
stack_size EQU 038h
|
||||
|
||||
; fm.inc supplies the StackAllocate / StackDeallocate macros used below.
include fm.inc
|
||||
|
||||
; Exported names of the two procedures defined in this file.
fname TEXTEQU <__remainder_piby2_fma3>
|
||||
fbname TEXTEQU <__remainder_piby2_fma3_bdl>
|
||||
|
||||
.code
|
||||
|
||||
;-----------------------------------------------------------------------
; fname = __remainder_piby2_fma3
;
; Remainder of x modulo pi/2 for callers that use FMA3.  The 53-bit mantissa
; of x is multiplied by a 192-bit window of 2/pi taken from
; __L_2_by_pi_bits; the fractional part of that product, as an extra-precise
; pair (x,xx), is then multiplied by pi/2 in extra precision.
;
; In:    xmm0 (low qword) = x.  The exponent is extracted without masking
;        the sign, so the sign bit must be clear.
; Out:   xmm0 = r
;        xmm1 = rr
;        rax  = region (0..3)
; Uses:  rax, rcx, rdx, r8-r11, xmm0-xmm5, flags, and the 8-byte stack slot
;        at [region+rsp].  rdx, r8 and r9 are NOT used as pointers; all
;        results come back in registers.
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
        StackAllocate stack_size
        .ENDPROLOG

        lea     r9,__L_2_by_pi_bits     ; r9 = base of the 2/pi byte table

        ; xexp = (x >> 52) - 1023
        vmovq   r11,xmm0                ; r11 = bit pattern of x
        mov     rcx,r11                 ; rcx = copy, turned into the mantissa below
        shr     r11,52
        sub     r11,1023                ; r11 = xexp = unbiased exponent of x

        ; byte offset at which the multiplication starts:
        ; last = 134 - (xexp >> 3)
        mov     r10,r11
        shr     r10,3
        sub     r10,134                 ; r10 = -last
        neg     r10                     ; r10 = last

        mov     rax,[r9 + r10]          ; least significant 64 bits of the window

        ; mantissa of x = ((x << 12) >> 12) | implied bit
        shl     rcx,12
        shr     rcx,12                  ; rcx = stored mantissa bits
        bts     rcx,52                  ; add the implied leading one

        add     r10,8                   ; advance to the next 8 bytes of 2/pi
        vmovdqu xmm0,XMMWORD PTR [r9 + r10] ; upper 128 bits of the window

        ; three 64x64 multiplies by the mantissa, propagating carries upward
        mul     rcx
        mov     r8,rax                  ; r8  = res1[2], least significant word
        mov     r10,rdx                 ; r10 = carry
        vmovq   rax,xmm0
        mul     rcx
        and     r11,7                   ; r11 = resexp = last 3 bits of xexp
        vpsrldq xmm0,xmm0,8             ; bring the top 64 bits of the window down
        add     rax,r10                 ; add the previous carry
        adc     rdx,0
        mov     r9,rax                  ; r9  = res1[1]
        mov     r10,rdx                 ; r10 = carry
        vmovq   rax,xmm0
        mul     rcx
        add     r10,rax                 ; r10 = res1[0], most significant word

        ; find the region
        ;   ltb        = most significant bits >> (54 - resexp)
        ;   point_five = last bit shifted out (left in CF by the shr)
        ;   region     = (ltb + point_five) & 3
        mov     ecx,54
        mov     rax,r10
        sub     rcx,r11                 ; cl = 54 - resexp (never zero, so shr sets CF)
        xor     edx,edx                 ; rdx = sign to apply to the result (0 = positive)
        shr     rax,cl                  ; CF = "fraction >= 0.5" bit
        jnc     L__no_point_five
        ; Fraction >= 0.5: negate the product by inverting every bit and make
        ; the result negative.  Neither NOT nor MOV touches the flags, so CF
        ; is still set when the adc below executes.
        not     r10
        not     r9
        not     r8
        mov     rdx,08000000000000000h

        ALIGN 16
L__no_point_five:
        adc     rax,0                   ; round the quadrant count up when CF was set
        and     rax,3                   ; rax = region
        mov     QWORD PTR [region+rsp],rax

        ; calculate the number of integer bits and zero them out
        mov     rcx,r11
        add     rcx,10                  ; rcx = number of integer bits
        shl     r10,cl
        shr     r10,cl                  ; r10 = fraction bits only
        sub     rcx,64                  ; start forming the exponent
        mov     r11,rcx

        ; find the highest set bit
        bsr     rcx,r10
        jnz     L__form_mantissa
        ; top word is all zero: shift the whole value up by one 64-bit word
        mov     r10,r9
        mov     r9,r8
        xor     r8d,r8d                 ; flags are rewritten by the bsr below
        bsr     rcx,r10                 ; rcx = hsb
        sub     r11,64

        ALIGN 16
L__form_mantissa:
        add     r11,rcx                 ; r11 = unbiased exponent of the new x
        sub     rcx,52                  ; rcx = distance of hsb from bit 52
        test    rcx,rcx
        jl      L__hsb_below_52
        je      L__form_numbers
        ; hsb above 52: shift right, moving the bits that fall out of r10
        ; into the top of r9
        mov     r8,r10                  ; previous r8 not required
        shr     r10,cl                  ; r10 = mantissa of x with hsb at bit 52
        shr     r9,cl                   ; make space for the bits from r10
        sub     rcx,64
        neg     rcx                     ; rcx = 64 - shift
        shl     r8,cl
        or      r9,r8                   ; r9 = mantissa bits of xx
        jmp     L__form_numbers

        ALIGN 16
L__hsb_below_52:
        ; rcx has the shift count (< 0): shift left, pulling bits up from r9
        ; into r10 and from r8 into r9
        neg     rcx                     ; rcx = left shift count
        mov     rax,r9
        shl     r10,cl
        shl     r9,cl
        sub     rcx,64
        neg     rcx                     ; rcx = 64 - shift
        shr     rax,cl
        or      r10,rax
        shr     r8,cl
        or      r9,r8

        ALIGN 16
        ; Here r11 has the unbiased exponent,
        ;      r10 has the mantissa, with the implicit bit possibly set,
        ;      rdx has the sign bit
L__form_numbers:
        add     r11,1023                ; r11 = biased exponent
        btr     r10,52                  ; remove the implicit bit
        mov     rcx,r11                 ; rcx = copy of the biased exponent
        or      r10,rdx                 ; put the sign
        shl     rcx,52                  ; shift the biased exponent into place
        or      r10,rcx                 ; r10 = bit pattern of x
        vmovq   xmm2,r10                ; xmm2 = x

        ; form xx from the remaining product bits in r9
        ; NOTE(review): bsr does not define rcx when r9 is zero; the non-FMA
        ; sibling routine zeroes rcx first.  Left exactly as imported here --
        ; confirm whether r9 == 0 needs explicit handling.
        bsr     rcx,r9                  ; scan for the high bit of the xx mantissa
        sub     rcx,64                  ; shift the implied bit out as well
        neg     rcx
        shl     r9,cl
        shr     r9,12
        add     rcx,52
        sub     r11,rcx                 ; r11 = biased exponent of xx
        shl     r11,52
        or      r9,rdx                  ; same sign as x
        or      r9,r11
        vmovq   xmm1,r9                 ; xmm1 = xx

        ; c  = piby2_lead * x
        ; cc = ((((piby2_part1*hx - c) + piby2_part1*tx) + piby2_part2*hx)
        ;        + piby2_part2*tx) + (piby2_lead*xx + piby2_part3*x)
        vandpd  xmm4,xmm2,L_fff800      ; xmm4 = hx
        vsubsd  xmm0,xmm2,xmm4          ; xmm0 = tx = x - hx
        vmulsd  xmm5,xmm2,L_piby2_lead  ; xmm5 = c
        vmulsd  xmm3,xmm4,L_piby2_part1 ; xmm3 = piby2_part1*hx
        vsubsd  xmm3,xmm3,xmm5          ;      - c
        vfmadd231sd xmm3,xmm0,L_piby2_part1 ;  + piby2_part1*tx
        vfmadd231sd xmm3,xmm4,L_piby2_part2 ;  + piby2_part2*hx
        vfmadd231sd xmm3,xmm0,L_piby2_part2 ;  + piby2_part2*tx
        vmulsd  xmm4,xmm1,L_piby2_lead  ; xmm4 = piby2_lead*xx
        vfmadd231sd xmm4,xmm2,L_piby2_part3 ;  + piby2_part3*x
        vaddsd  xmm3,xmm3,xmm4          ; xmm3 = cc
        vaddsd  xmm0,xmm5,xmm3          ; xmm0 = r = c + cc
        vsubsd  xmm1,xmm5,xmm0          ; xmm1 = c - r
        vaddsd  xmm1,xmm1,xmm3          ; xmm1 = rr = (c - r) + cc
        mov     rax,QWORD PTR [region+rsp] ; rax = region

        StackDeallocate stack_size
        ret
fname endp
|
||||
|
||||
ALIGN 16
|
||||
;-----------------------------------------------------------------------
; fbname = __remainder_piby2_fma3_bdl
;
; Argument reduction by pi/2 with fused multiply-adds, after
;   Boldo, Daumas, and Li, "Formally Verified Argument Reduction With a
;   Fused Multiply-Add," IEEE Trans. Comp., vol. 58, #8, Aug. 2009.
; Coefficients are from their table 1, mutatis mutandis; z is obtained from
; sigma with their formula 3.1 and the reduction itself is their
; algorithm 5.1 (and its extended version).
;
; In:    xmm0 = arg
; Out:   xmm0 = arghead
;        xmm1 = argtail
;        rax  = region = z & 3
; Uses:  rax, xmm0-xmm5, flags; no stack
;-----------------------------------------------------------------------
PUBLIC fbname
fbname PROC FRAME
        .ENDPROLOG

        ; z = round(arg * r): add sigma inside the fma, then take it off again
        vmovapd xmm2,L_sigma            ; xmm2 = sigma
        vmovapd xmm1,xmm0               ; xmm1 = arg
        vfmadd132sd xmm1,xmm2,L_r       ; xmm1 = arg*r + sigma
        vsubsd  xmm1,xmm1,xmm2          ; xmm1 = z

        ; region = (integer value of z) & 3
        vcvttpd2dq xmm5,xmm1
        vmovq   rax,xmm5
        and     rax,3                   ; rax = region

        ; p1 = z*xc2 and u = arg - z*xc1
        vmovapd xmm4,L_xc2              ; xmm4 = xc2
        vmulsd  xmm3,xmm1,xmm4          ; xmm3 = p1
        vmovapd xmm2,xmm1
        vfnmadd132sd xmm2,xmm0,L_xc1    ; xmm2 = u

        ; t1 = u - p1, and p2 = z*xc2 - p1 (the rounding error of p1)
        vsubsd  xmm5,xmm2,xmm3          ; xmm5 = t1
        vmovapd xmm0,xmm1               ; xmm0 = copy of z
        vfmsub213sd xmm0,xmm4,xmm3      ; xmm0 = p2

        ; xc2 is not needed in a register past this point, so xmm4 is
        ; reused as a temporary rather than spilling
        vsubsd  xmm4,xmm2,xmm5          ; xmm4 = temp = u - t1
        vsubsd  xmm4,xmm4,xmm3          ; xmm4 = t2 = temp - p1

        ; v1 = u - xc2*z, with xc2 taken from memory now that xmm4 is busy
        vfnmadd231sd xmm2,xmm1,L_xc2    ; xmm2 = v1

        ; v2 = ((t1 - v1) + t2) - p2
        vsubsd  xmm5,xmm5,xmm2
        vaddsd  xmm5,xmm5,xmm4
        vsubsd  xmm5,xmm5,xmm0          ; xmm5 = v2

        vmovapd xmm0,xmm2               ; xmm0 = arghead = v1
        vfnmadd132sd xmm1,xmm5,L_xc3    ; xmm1 = argtail = v2 - xc3*z
        ret
fbname endp
|
||||
END
|
173
sdk/lib/crt/math/libm_sse2/remainder_piby2f.c
Normal file
173
sdk/lib/crt/math/libm_sse2/remainder_piby2f.c
Normal file
|
@ -0,0 +1,173 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
|
||||
/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using
|
||||
extra precision, and return the result in r.
|
||||
Return value "region" tells how many lots of pi/2 were subtracted
|
||||
from x to put it in the range [-pi/4,pi/4], mod 4. */
|
||||
void __remainder_piby2f(unsigned long ux, double *r, int *region)
|
||||
{
|
||||
|
||||
|
||||
/* This method simulates multi-precision floating-point
|
||||
arithmetic and is accurate for all 1 <= x < infinity */
|
||||
#define bitsper 36
|
||||
unsigned long res[10];
|
||||
unsigned long u, carry, mask, mant, nextbits;
|
||||
int first, last, i, rexp, xexp, resexp, ltb, determ, bc;
|
||||
double dx;
|
||||
static const double
|
||||
piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
|
||||
static unsigned long pibits[] =
|
||||
{
|
||||
0LL,
|
||||
5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
|
||||
34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
|
||||
19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
|
||||
};
|
||||
|
||||
|
||||
xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
|
||||
ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;
|
||||
|
||||
|
||||
/* Now ux is the mantissa bit pattern of x as a long integer */
|
||||
mask = 1;
|
||||
mask = (mask << bitsper) - 1;
|
||||
|
||||
/* Set first and last to the positions of the first
|
||||
and last chunks of 2/pi that we need */
|
||||
first = xexp / bitsper;
|
||||
resexp = xexp - first * bitsper;
|
||||
/* 120 is the theoretical maximum number of bits (actually
|
||||
115 for IEEE single precision) that we need to extract
|
||||
from the middle of 2/pi to compute the reduced argument
|
||||
accurately enough for our purposes */
|
||||
last = first + 120 / bitsper;
|
||||
|
||||
|
||||
/* Do a long multiplication of the bits of 2/pi by the
|
||||
integer mantissa */
|
||||
#if 0
|
||||
for (i = last; i >= first; i--)
|
||||
{
|
||||
u = pibits[i] * ux + carry;
|
||||
res[i - first] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
}
|
||||
res[last - first + 1] = 0;
|
||||
#else
|
||||
/* Unroll the loop. This is only correct because we know
|
||||
that bitsper is fixed as 36. */
|
||||
res[4] = 0;
|
||||
u = pibits[last] * ux;
|
||||
res[3] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last - 1] * ux + carry;
|
||||
res[2] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[last - 2] * ux + carry;
|
||||
res[1] = u & mask;
|
||||
carry = u >> bitsper;
|
||||
u = pibits[first] * ux + carry;
|
||||
res[0] = u & mask;
|
||||
#endif
|
||||
|
||||
|
||||
/* Reconstruct the result */
|
||||
ltb = (int)((((res[0] << bitsper) | res[1])
|
||||
>> (bitsper - 1 - resexp)) & 7);
|
||||
|
||||
/* determ says whether the fractional part is >= 0.5 */
|
||||
determ = ltb & 1;
|
||||
|
||||
i = 1;
|
||||
if (determ)
|
||||
{
|
||||
/* The mantissa is >= 0.5. We want to subtract it
|
||||
from 1.0 by negating all the bits */
|
||||
*region = ((ltb >> 1) + 1) & 3;
|
||||
mant = 1;
|
||||
mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
|
||||
while (mant < 0x0000000000010000)
|
||||
{
|
||||
i++;
|
||||
mant = (mant << bitsper) | (~(res[i]) & mask);
|
||||
}
|
||||
nextbits = (~(res[i+1]) & mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
*region = (ltb >> 1);
|
||||
mant = 1;
|
||||
mant = res[1] & ((mant << (bitsper - resexp)) - 1);
|
||||
while (mant < 0x0000000000010000)
|
||||
{
|
||||
i++;
|
||||
mant = (mant << bitsper) | res[i];
|
||||
}
|
||||
nextbits = res[i+1];
|
||||
}
|
||||
|
||||
|
||||
/* Normalize the mantissa. The shift value 6 here, determined by
|
||||
trial and error, seems to give optimal speed. */
|
||||
bc = 0;
|
||||
while (mant < 0x0000400000000000)
|
||||
{
|
||||
bc += 6;
|
||||
mant <<= 6;
|
||||
}
|
||||
while (mant < 0x0010000000000000)
|
||||
{
|
||||
bc++;
|
||||
mant <<= 1;
|
||||
}
|
||||
mant |= nextbits >> (bitsper - bc);
|
||||
|
||||
rexp = 52 + resexp - bc - i * bitsper;
|
||||
|
||||
|
||||
/* Put the result exponent rexp onto the mantissa pattern */
|
||||
u = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
|
||||
ux = (mant & MANTBITS_DP64) | u;
|
||||
if (determ)
|
||||
/* If we negated the mantissa we negate x too */
|
||||
ux |= SIGNBIT_DP64;
|
||||
PUT_BITS_DP64(ux, dx);
|
||||
|
||||
|
||||
/* x is a double precision version of the fractional part of
|
||||
x * 2 / pi. Multiply x by pi/2 in double precision
|
||||
to get the reduced argument r. */
|
||||
*r = dx * piby2;
|
||||
return;
|
||||
|
||||
}
|
180
sdk/lib/crt/math/libm_sse2/remainder_piby2f_forAsm.asm
Normal file
180
sdk/lib/crt/math/libm_sse2/remainder_piby2f_forAsm.asm
Normal file
|
@ -0,0 +1,180 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the remainder by pi/2 function
|
||||
; This is a service routine for use by trig functions coded in asm
|
||||
;
|
||||
; On input,
|
||||
; xmm0 = x; Note that we assume x >= pi/4
|
||||
; On output
|
||||
; xmm0 = r
|
||||
; eax = region
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
L__piby2 DQ 03ff921fb54442d18h
|
||||
EXTRN __L_2_by_pi_bits:BYTE
|
||||
|
||||
|
||||
fname TEXTEQU <__remainder_piby2d2f_forAsm>
|
||||
|
||||
stack_size EQU 000h
|
||||
include fm.inc
|
||||
|
||||
.code
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
    ;-------------------------------------------------------------------
    ; Remainder of x by pi/2, for callers written in assembly.
    ; In:    xmm0 = x (double bit pattern, positive)
    ; Out:   xmm0 = r       reduced argument
    ;        rax  = region  (number of pi/2 multiples removed, mod 4)
    ; Uses:  rcx, rdx, r8, r9, r10, r11, xmm1, flags
    ;
    ; NOTE(review): xexp is shifted with a logical SHR below, so a
    ; negative unbiased exponent (x < 1.0) turns into a huge table
    ; offset. The routine therefore appears to need x >= 1.0 rather
    ; than just x >= pi/4 - confirm against the callers.
    ;-------------------------------------------------------------------
    StackAllocate stack_size
    .ENDPROLOG

    lea     r9, __L_2_by_pi_bits        ; r9 = base of the 2/pi byte table

    ; Split x into unbiased exponent and raw bits.
    ; x is positive, so xexp = (x >> 52) - 1023 needs no sign masking.
    movd    r11, xmm0
    mov     rcx, r11                    ; rcx = raw bits, mantissa taken below
    shr     r11, 52
    sub     r11, 1023                   ; r11 = xexp

    ; Byte offset at which the multiplication starts:
    ;   last = 134 - (xexp >> 3)
    mov     r10, r11
    shr     r10, 3
    sub     r10, 134                    ; r10 = -last
    neg     r10                         ; r10 = last
    and     r11, 7                      ; r11 = resexp = xexp & 7

    ; Least significant 64 bits of the 2/pi window
    mov     rax, [r9 + r10]

    ; mantissa of x = ((x << 12) >> 12) | implied bit
    shl     rcx, 12
    shr     rcx, 12
    bts     rcx, 52                     ; rcx = 53-bit integer mantissa

    ; Remaining 128 bits of the window (table is not 16-byte aligned
    ; at this offset, hence the unaligned load)
    add     r10, 8
    movdqu  xmm0, [r9 + r10]

    ; Three 64x64 multiplications; r10:r9:r8 receives the low 192 bits
    ; of window * mantissa.
    mul     rcx
    mov     r8, rax                     ; r8  = res1[2] (least significant)
    mov     r10, rdx                    ; r10 = carry into the next limb
    movd    rax, xmm0
    mul     rcx
    psrldq  xmm0, 8                     ; bring the top limb of the window down
    add     rax, r10
    adc     rdx, 0
    mov     r9, rax                     ; r9  = res1[1]
    mov     r10, rdx                    ; r10 = carry into the top limb
    movd    rax, xmm0
    mul     rcx
    add     r10, rax                    ; r10 = res1[0] (most significant)

    ; Region:
    ;   ltb        = res1[0] >> (54 - resexp)
    ;   point_five = bit shifted out last (left in CF by SHR)
    ;   region     = (ltb + point_five) & 3
    xor     edx, edx                    ; rdx = sign of the result, + so far
    mov     ecx, 54
    sub     rcx, r11
    mov     rax, r10
    shr     rax, cl                     ; CF = first fraction bit
    jnc     L__store_region
    ; Fraction >= 0.5: negate the product bits and make the result
    ; negative. NOT and MOV leave CF untouched for the ADC below.
    not     r10
    not     r9
    not     r8
    mov     rdx, 08000000000000000h

ALIGN 16
L__store_region:
    adc     rax, 0                      ; + point_five
    and     eax, 3
    ; The region is parked in xmm1 until the end of the routine.
    movd    xmm1, rax

    ; Clear the integer bits at the top of res1[0]
    mov     rcx, r11
    add     rcx, 10                     ; rcx = number of integer bits
    shl     r10, cl
    shr     r10, cl                     ; r10 = fraction bits only
    sub     rcx, 64                     ; start forming the exponent
    mov     r11, rcx

    ; Locate the highest set bit; if the top limb is empty, move one
    ; limb down and account for it in the exponent.
    bsr     rcx, r10
    jnz     L__normalize
    mov     r10, r9
    mov     r9, r8
    bsr     rcx, r10                    ; rcx = hsb
    sub     r11, 64

ALIGN 16
L__normalize:
    add     r11, rcx                    ; r11 = unbiased exponent of result
    sub     rcx, 52                     ; rcx = distance of hsb from bit 52
    test    rcx, rcx
    jl      L__shift_left
    je      L__assemble
    ; hsb above bit 52: shift right
    mov     r8, r10                     ; previous contents of r8 not required
    shr     r10, cl
    jmp     L__assemble

ALIGN 16
L__shift_left:
    ; hsb below bit 52: shift left by n = -rcx and pull the top n bits
    ; of the next limb in underneath.
    neg     rcx
    mov     rax, r9
    shl     r10, cl
    shl     r9, cl
    sub     rcx, 64
    neg     rcx                         ; rcx = 64 - n
    shr     rax, cl
    or      r10, rax

ALIGN 16
L__assemble:
    add     r11, 1023                   ; bias the exponent
    btr     r10, 52                     ; remove the implied bit
    mov     rcx, r11
    or      r10, rdx                    ; sign
    shl     rcx, 52
    or      r10, rcx                    ; r10 = fractional part of x * 2/pi
    movd    xmm0, r10
    movd    rax, xmm1                   ; rax = region

    ; xmm0 holds a double precision version of the fractional part of
    ; x * 2/pi; multiplying by pi/2 gives the reduced argument r.
    mulsd   xmm0, L__piby2
    StackDeallocate stack_size
    ret

fname endp
|
||||
END
|
341
sdk/lib/crt/math/libm_sse2/remainder_piby2f_forC.asm
Normal file
341
sdk/lib/crt/math/libm_sse2/remainder_piby2f_forC.asm
Normal file
|
@ -0,0 +1,341 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the remainder by pi/2 function
|
||||
; This is a service routine for use by trig functions coded in C
|
||||
;
|
||||
|
||||
fname TEXTEQU <__remainder_piby2d2f_forC>
|
||||
|
||||
save_rdi EQU 20h
|
||||
save_rsi EQU 30h
|
||||
stack_size EQU 088h
|
||||
include fm.inc
|
||||
|
||||
.code
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
    ;-------------------------------------------------------------------
    ; Remainder of x by pi/2, for callers written in C.
    ; In:    rcx = bit pattern of x (double, positive)
    ;        rdx = address that receives r      (written as a QWORD)
    ;        r8  = address that receives region (written as a DWORD)
    ; Uses:  rax, rcx, rdx, r8, r9, r10, r11, xmm0, flags
    ;        rdi and rsi are saved in the frame and restored on exit.
    ;
    ; NOTE(review): xexp is shifted with a logical SHR below, so a
    ; negative unbiased exponent (x < 1.0) turns into a huge table
    ; offset - the routine appears to need x >= 1.0; confirm against
    ; the callers.
    ;-------------------------------------------------------------------
    StackAllocate stack_size
    SaveReg rdi, save_rdi
    SaveReg rsi, save_rsi
    .ENDPROLOG

    mov     rdi, rcx                    ; rdi = bits of x
    mov     rsi, rdx                    ; rsi = address of r
    mov     rdx, r8                     ; rdx = address of region

    ; Split x into unbiased exponent and raw bits
    movd    xmm0, rdi
    lea     r9, L__2_by_pi_bits         ; r9 = base of the 2/pi byte table

    ; xexp = (x >> 52) - 1023
    movd    r11, xmm0
    mov     rcx, r11                    ; rcx = raw bits, mantissa taken below
    shr     r11, 52
    sub     r11, 1023                   ; r11 = xexp

    ; Byte offset at which the multiplication starts:
    ;   last = 134 - (xexp >> 3)
    mov     r10, r11
    shr     r10, 3
    sub     r10, 134                    ; r10 = -last
    neg     r10                         ; r10 = last
    and     r11, 7                      ; r11 = resexp = xexp & 7

    ; Least significant 64 bits of the 2/pi window
    mov     rax, [r9 + r10]
    mov     rdi, rdx                    ; keep the region address: MUL writes rdx

    ; mantissa of x = ((x << 12) >> 12) | implied bit
    shl     rcx, 12
    shr     rcx, 12
    bts     rcx, 52                     ; rcx = 53-bit integer mantissa

    ; Remaining 128 bits of the window
    add     r10, 8
    movdqu  xmm0, [r9 + r10]

    ; Three 64x64 multiplications; r10:r9:r8 receives the low 192 bits
    ; of window * mantissa.
    mul     rcx
    mov     r8, rax                     ; r8  = res1[2] (least significant)
    mov     r10, rdx                    ; r10 = carry into the next limb
    movd    rax, xmm0
    mul     rcx
    psrldq  xmm0, 8                     ; bring the top limb of the window down
    add     rax, r10
    adc     rdx, 0
    mov     r9, rax                     ; r9  = res1[1]
    mov     r10, rdx                    ; r10 = carry into the top limb
    movd    rax, xmm0
    mul     rcx
    add     r10, rax                    ; r10 = res1[0] (most significant)

    ; Region:
    ;   ltb        = res1[0] >> (54 - resexp)
    ;   point_five = bit shifted out last (left in CF by SHR)
    ;   region     = (ltb + point_five) & 3
    xor     edx, edx                    ; rdx = sign of the result, + so far
    mov     ecx, 54
    sub     rcx, r11
    mov     rax, r10
    shr     rax, cl                     ; CF = first fraction bit
    jnc     L__store_region
    ; Fraction >= 0.5: negate the product bits and make the result
    ; negative. NOT and MOV leave CF untouched for the ADC below.
    not     r10
    not     r9
    not     r8
    mov     rdx, 08000000000000000h

ALIGN 16
L__store_region:
    adc     rax, 0                      ; + point_five
    and     eax, 3
    mov     DWORD PTR [rdi], eax        ; *region = eax

    ; Clear the integer bits at the top of res1[0]
    mov     rcx, r11
    add     rcx, 10                     ; rcx = number of integer bits
    shl     r10, cl
    shr     r10, cl                     ; r10 = fraction bits only
    sub     rcx, 64                     ; start forming the exponent
    mov     r11, rcx

    ; Locate the highest set bit; if the top limb is empty, move one
    ; limb down and account for it in the exponent.
    bsr     rcx, r10
    jnz     L__normalize
    mov     r10, r9
    mov     r9, r8
    bsr     rcx, r10                    ; rcx = hsb
    sub     r11, 64

ALIGN 16
L__normalize:
    add     r11, rcx                    ; r11 = unbiased exponent of result
    sub     rcx, 52                     ; rcx = distance of hsb from bit 52
    test    rcx, rcx
    jl      L__shift_left
    je      L__assemble
    ; hsb above bit 52: shift right
    mov     r8, r10                     ; previous contents of r8 not required
    shr     r10, cl
    jmp     L__assemble

ALIGN 16
L__shift_left:
    ; hsb below bit 52: shift left by n = -rcx and pull the top n bits
    ; of the next limb in underneath.
    neg     rcx
    mov     rax, r9
    shl     r10, cl
    shl     r9, cl
    sub     rcx, 64
    neg     rcx                         ; rcx = 64 - n
    shr     rax, cl
    or      r10, rax

ALIGN 16
L__assemble:
    add     r11, 1023                   ; bias the exponent
    btr     r10, 52                     ; remove the implied bit
    mov     rcx, r11
    or      r10, rdx                    ; sign
    shl     rcx, 52
    or      r10, rcx                    ; r10 = fractional part of x * 2/pi

    ; Multiply the fractional part by pi/2 and hand the result back
    movd    xmm0, r10
    mulsd   xmm0, L__piby2
    movsd   QWORD PTR [rsi], xmm0       ; *r = reduced argument
    RestoreReg rsi, save_rsi
    RestoreReg rdi, save_rdi
    StackDeallocate stack_size
    ret

fname endp
|
||||
|
||||
.const

; pi/2 rounded to double precision
ALIGN 16
L__piby2        DQ 03ff921fb54442d18h

; Bits of 2/pi as a little-endian byte string (158 bytes, least
; significant byte first). The routine above indexes it with
; last = 134 - (xexp >> 3) and reads 24 bytes starting there.
ALIGN 16
L__2_by_pi_bits DB 224, 241,  27, 193,  12,  88,  33, 116
                DB  53, 126, 196, 126, 237, 175, 169,  75
                DB  74,  41, 222, 231,  28, 244, 236, 197
                DB 151, 175,  31, 235, 158, 212, 181, 168
                DB 127, 121, 154, 253,  24,  61, 221,  38
                DB  44, 159,  60, 251, 217, 180, 125, 180
                DB  41, 104,  45,  70, 188, 188,  63,  96
                DB  22, 120, 255,  95, 226, 127, 236, 160
                DB 228, 247,  46, 126,  17, 114, 210, 231
                DB  76,  13, 230,  88,  71, 230,   4, 249
                DB 125, 209, 154, 192, 113, 166,  19,  18
                DB 237, 186, 212, 215,   8, 162, 251, 156
                DB 166, 196, 114, 172, 119, 248, 115,  72
                DB  70,  39, 168, 187,  36,  25, 128,  75
                DB  55,   9, 233, 184, 145, 220, 134,  21
                DB 239, 122, 175, 142,  69, 249,   7,  65
                DB  14, 241, 100,  86, 138, 109,   3, 119
                DB 211, 212,  71,  95, 157, 240, 167,  84
                DB  16,  57, 185,  13, 230, 139,   2,   0
                DB   0,   0,   0,   0,   0,   0
|
||||
|
||||
END
|
||||
|
247
sdk/lib/crt/math/libm_sse2/remainderf.c
Normal file
247
sdk/lib/crt/math/libm_sse2/remainderf.c
Normal file
|
@ -0,0 +1,247 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_NANF_WITH_FLAGS
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_GET_FPSW_INLINE
|
||||
#define USE_SET_FPSW_INLINE
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_NANF_WITH_FLAGS
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_GET_FPSW_INLINE
|
||||
#undef USE_SET_FPSW_INLINE
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#if !defined(_CRTBLD_C9X)
|
||||
#define _CRTBLD_C9X
|
||||
#endif
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(remainderf,fmodf)
|
||||
|
||||
|
||||
#undef _FUNCNAME
|
||||
#if defined(COMPILING_FMOD)
|
||||
float fmodf(float x, float y)
|
||||
#define _FUNCNAME "fmodf"
|
||||
#define _OPERATION OP_FMOD
|
||||
#else
|
||||
float remainderf(float x, float y)
|
||||
#define _FUNCNAME "remainderf"
|
||||
#define _OPERATION OP_REM
|
||||
#endif
|
||||
{
|
||||
double dx, dy, scale, w, t;
|
||||
int i, ntimes, xexp, yexp;
|
||||
unsigned long ux, uy, ax, ay;
|
||||
|
||||
unsigned int sw;
|
||||
|
||||
dx = x;
|
||||
dy = y;
|
||||
|
||||
|
||||
GET_BITS_DP64(dx, ux);
|
||||
GET_BITS_DP64(dy, uy);
|
||||
ax = ux & ~SIGNBIT_DP64;
|
||||
ay = uy & ~SIGNBIT_DP64;
|
||||
xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
|
||||
if (xexp < 1 || xexp > BIASEDEMAX_DP64 ||
|
||||
yexp < 1 || yexp > BIASEDEMAX_DP64)
|
||||
{
|
||||
/* x or y is zero, NaN or infinity (neither x nor y can be
|
||||
denormalized because we promoted from float to double) */
|
||||
if (xexp > BIASEDEMAX_DP64)
|
||||
{
|
||||
/* x is NaN or infinity */
|
||||
if (ux & MANTBITS_DP64)
|
||||
{
|
||||
/* x is NaN */
|
||||
unsigned int ufx;
|
||||
GET_BITS_SP32(x, ufx);
|
||||
return _handle_errorf(_FUNCNAME, _OPERATION, ufx|0x00400000, _DOMAIN, 0,
|
||||
EDOM, x, y, 2);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is infinity; result is NaN */
|
||||
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
}
|
||||
}
|
||||
else if (yexp > BIASEDEMAX_DP64)
|
||||
{
|
||||
/* y is NaN or infinity */
|
||||
if (uy & MANTBITS_DP64)
|
||||
{
|
||||
/* y is NaN */
|
||||
unsigned int ufy;
|
||||
GET_BITS_SP32(y, ufy);
|
||||
return _handle_errorf(_FUNCNAME, _OPERATION, ufy|0x00400000, _DOMAIN, 0,
|
||||
EDOM, x, y, 2);
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef _CRTBLD_C9X
|
||||
/* C99 return for y = +-inf is x */
|
||||
return x;
|
||||
#else
|
||||
/* y is infinity; result is indefinite */
|
||||
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if (xexp < 1)
|
||||
{
|
||||
/* x must be zero (cannot be denormalized) */
|
||||
if (yexp < 1)
|
||||
{
|
||||
/* y must be zero (cannot be denormalized) */
|
||||
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
}
|
||||
else
|
||||
/* C99 return for x = 0 must preserve sign */
|
||||
return x;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* y must be zero */
|
||||
return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN,
|
||||
AMD_F_INVALID, EDOM, x, y, 2);
|
||||
}
|
||||
}
|
||||
else if (ax == ay)
|
||||
{
|
||||
/* abs(x) == abs(y); return zero with the sign of x */
|
||||
PUT_BITS_DP64(ux & SIGNBIT_DP64, dx);
|
||||
return (float)dx;
|
||||
}
|
||||
|
||||
/* Set dx = abs(x), dy = abs(y) */
|
||||
PUT_BITS_DP64(ax, dx);
|
||||
PUT_BITS_DP64(ay, dy);
|
||||
|
||||
if (ax < ay)
|
||||
{
|
||||
/* abs(x) < abs(y) */
|
||||
#if !defined(COMPILING_FMOD)
|
||||
if (dx > 0.5*dy)
|
||||
dx -= dy;
|
||||
#endif
|
||||
return (float)(x < 0.0? -dx : dx);
|
||||
}
|
||||
|
||||
/* Save the current floating-point status word. We need
|
||||
to do this because the remainder function is always
|
||||
exact for finite arguments, but our algorithm causes
|
||||
the inexact flag to be raised. We therefore need to
|
||||
restore the entry status before exiting. */
|
||||
sw = get_fpsw_inline();
|
||||
|
||||
/* Set ntimes to the number of times we need to do a
|
||||
partial remainder. If the exponent of x is an exact multiple
|
||||
of 24 larger than the exponent of y, and the mantissa of x is
|
||||
less than the mantissa of y, ntimes will be one too large
|
||||
but it doesn't matter - it just means that we'll go round
|
||||
the loop below one extra time. */
|
||||
if (xexp <= yexp)
|
||||
{
|
||||
ntimes = 0;
|
||||
w = dy;
|
||||
scale = 1.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
ntimes = (xexp - yexp) / 24;
|
||||
|
||||
/* Set w = y * 2^(24*ntimes) */
|
||||
PUT_BITS_DP64((unsigned long)(ntimes * 24 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64,
|
||||
scale);
|
||||
w = scale * dy;
|
||||
/* Set scale = 2^(-24) */
|
||||
PUT_BITS_DP64((unsigned long)(-24 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64,
|
||||
scale);
|
||||
}
|
||||
|
||||
|
||||
/* Each time round the loop we compute a partial remainder.
|
||||
This is done by subtracting a large multiple of w
|
||||
from x each time, where w is a scaled up version of y.
|
||||
The subtraction can be performed exactly when performed
|
||||
in double precision, and the result at each stage can
|
||||
fit exactly in a single precision number. */
|
||||
for (i = 0; i < ntimes; i++)
|
||||
{
|
||||
/* t is the integer multiple of w that we will subtract.
|
||||
We use a truncated value for t. */
|
||||
t = (double)((int)(dx / w));
|
||||
dx -= w * t;
|
||||
/* Scale w down by 2^(-24) for the next iteration */
|
||||
w *= scale;
|
||||
}
|
||||
|
||||
/* One more time */
|
||||
#if defined(COMPILING_FMOD)
|
||||
t = (double)((int)(dx / w));
|
||||
dx -= w * t;
|
||||
#else
|
||||
{
|
||||
unsigned int todd;
|
||||
/* Variable todd says whether the integer t is odd or not */
|
||||
t = (double)((int)(dx / w));
|
||||
todd = ((int)(dx / w)) & 1;
|
||||
dx -= w * t;
|
||||
|
||||
/* At this point, dx lies in the range [0,dy) */
|
||||
/* For the remainder function, we need to adjust dx
|
||||
so that it lies in the range (-y/2, y/2] by carefully
|
||||
subtracting w (== dy == y) if necessary. */
|
||||
if (dx > 0.5 * w || ((dx == 0.5 * w) && todd))
|
||||
dx -= w;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* **** N.B. for some reason this breaks the 32 bit version
|
||||
of remainder when compiling with optimization. */
|
||||
/* Restore the entry status flags */
|
||||
set_fpsw_inline(sw);
|
||||
|
||||
/* Set the result sign according to input argument x */
|
||||
return (float)(x < 0.0? -dx : dx);
|
||||
|
||||
}
|
369
sdk/lib/crt/math/libm_sse2/simd.h
Normal file
369
sdk/lib/crt/math/libm_sse2/simd.h
Normal file
|
@ -0,0 +1,369 @@
|
|||
/***********************************************************************************/
|
||||
/** MIT License **/
|
||||
/** ----------- **/
|
||||
/** **/
|
||||
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
|
||||
/** **/
|
||||
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
|
||||
/** of this Software and associated documentation files (the "Software"), to deal **/
|
||||
/** in the Software without restriction, including without limitation the rights **/
|
||||
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
|
||||
/** copies of the Software, and to permit persons to whom the Software is **/
|
||||
/** furnished to do so, subject to the following conditions: **/
|
||||
/** **/
|
||||
/** The above copyright notice and this permission notice shall be included in **/
|
||||
/** all copies or substantial portions of the Software. **/
|
||||
/** **/
|
||||
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
|
||||
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
|
||||
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
|
||||
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
|
||||
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
|
||||
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
|
||||
/** THE SOFTWARE. **/
|
||||
/***********************************************************************************/
|
||||
|
||||
/*
|
||||
******************************************************************************
|
||||
* Source File : simd.h
|
||||
* Archive File : $Archive: $
|
||||
* Date : 6/04/01
|
||||
* Description : The include file for the AMD SIMD exception filter routine
|
||||
* for Microsoft Structured Exception Handling
|
||||
*
|
||||
*
|
||||
$Revision:$
|
||||
$Name:$
|
||||
$Date:$
|
||||
$Author:$
|
||||
$History: simd.h $
|
||||
*
|
||||
*/
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
// simd.h
|
||||
// This file contains structure definitions to provide
|
||||
// convenient access to SIMD and MMX data as unsigned
|
||||
// integer data.
|
||||
|
||||
// change the following define to a 1 to print terse output
|
||||
#define DO_PRINT 0
|
||||
|
||||
// can't use the 3DNOW SDK as written with 64 bit tools
|
||||
#if !defined (_AMD64_)
|
||||
#define USE_3DNOW_SDK 1
|
||||
#define SUPPORTS_FTZ 1
|
||||
#endif
|
||||
|
||||
|
||||
/*****************************************************************/
|
||||
|
||||
// Basic type definitions
|
||||
|
||||
typedef UINT_PTR AWORD; // x86-64 safe
|
||||
|
||||
/* A single precision value overlaid with an unsigned integer so that
   its bits can be accessed as integer data.
   NOTE(review): the overlay is exact only where unsigned long is
   32 bits wide. */
typedef union
{
    float           f;  /* floating point view */
    unsigned long   l;  /* integer view */
} LFLOAT;
|
||||
|
||||
//typedef struct
|
||||
//{
|
||||
// DWORD dw[2];
|
||||
//}
|
||||
typedef unsigned _int64 QWORD;
|
||||
|
||||
/* A double precision value overlaid with two unsigned integers so that
   its bits can be accessed as integer data.
   NOTE(review): the two halves cover the double exactly only where
   unsigned long is 32 bits wide. */
typedef union
{
    double          f;      /* floating point view */
    unsigned long   l[2];   /* integer view */
} LDOUBLE;
|
||||
|
||||
/* Four LFLOAT elements in one 16-byte aligned structure */
typedef __declspec(align(16)) struct
{
    LFLOAT f0;
    LFLOAT f1;
    LFLOAT f2;
    LFLOAT f3;
} SSESINGLE;
|
||||
|
||||
/* Two LDOUBLE elements in one 16-byte aligned structure */
typedef __declspec(align(16)) struct
{
    LDOUBLE d0;
    LDOUBLE d1;
} SSEDOUBLE;
|
||||
|
||||
|
||||
// this is the key data structure type used by the filter
|
||||
// and the test program. It will be aligned, since
|
||||
// the __m128 types are all aligned. It allows the
|
||||
// use of one variable to carry all the needed data
|
||||
// types.
|
||||
typedef union
|
||||
{
|
||||
__m128 m;
|
||||
__m128d md;
|
||||
__m128i mi;
|
||||
__m64 m64[2];
|
||||
DWORD l[4];
|
||||
int i[4];
|
||||
LFLOAT f[4];
|
||||
QWORD q[2];
|
||||
LDOUBLE d[2];
|
||||
} ML128;
|
||||
|
||||
// MMX80 is defined to provide an MMX type for the FXSTOR structure.
// The same storage is viewed either as an MMX register or as a
// floating point register.
typedef union
{
    unsigned short mmx[4];  // mmx regs are 64 bits
    unsigned short fp[5];   // floating point regs are 80 bits
} MMX80;
|
||||
|
||||
/*****************************************************************/
|
||||
|
||||
// define constants used by SIMD
|
||||
|
||||
// define MXCSR rounding control bits.
|
||||
#define SDIMCW_RC 0x6000
|
||||
#define SDIRC_NEAR 0x0000
|
||||
#define SDIRC_DOWN 0x2000
|
||||
#define SDIRC_UP 0x4000
|
||||
#define SDIRC_CHOP 0x6000
|
||||
|
||||
// define other MXCSR control bits
|
||||
#define SDDAZ 0x0040
|
||||
#define SDFTZ 0x8000
|
||||
|
||||
#define opADD 0x58
|
||||
#define opAND 0x54
|
||||
#define opANDN 0x55
|
||||
#define opCMP 0xC2
|
||||
#define opCOMISS 0x2F
|
||||
#define opCVTPI2PS 0x2A
|
||||
#define opCVTTPS2PI 0x2C
|
||||
#define opCVTPS2PI 0x2D
|
||||
#define opCVTPS2PD 0x5A
|
||||
#define opCVTDQ2PS 0x5B
|
||||
#define opCVTTPD2DQ 0xE6
|
||||
#define opDIV 0x5E
|
||||
#define opMAX 0x5F
|
||||
#define opMIN 0x5D
|
||||
#define opMUL 0x59
|
||||
#define opSQRT 0x51
|
||||
#define opSUB 0x5C
|
||||
#define opUCOMISS 0x2E
|
||||
|
||||
// define EFlags bits
|
||||
#define ZF (1 << 6)
|
||||
#define PF (1 << 2)
|
||||
#define CF (1 << 0)
|
||||
|
||||
// define the REX prefix bits
|
||||
#define REX_PREFIX 0x40
|
||||
#define REX_W 0x8
|
||||
#define REX_R 0x4
|
||||
#define REX_X 0x2
|
||||
#define REX_B 0x1
|
||||
|
||||
|
||||
// define the exception information record
|
||||
|
||||
// constants for the status bits
|
||||
#define IEM_INEXACT 0x20
|
||||
#define IEM_UNDERFLOW 0x10
|
||||
#define IEM_OVERFLOW 0x08
|
||||
#define IEM_ZERODIVIDE 0x04
|
||||
#define IEM_DENORMAL 0x02
|
||||
#define IEM_INVALID 0x01
|
||||
#define IEM_MASK 0x3F
|
||||
|
||||
#define IMM_INEXACT 0x1000
|
||||
#define IMM_UNDERFLOW 0x0800
|
||||
#define IMM_OVERFLOW 0x0400
|
||||
#define IMM_ZERODIVIDE 0x0200
|
||||
#define IMM_DENORMAL 0x0100
|
||||
#define IMM_INVALID 0x0080
|
||||
#define IMM_MASK 0x1F80
|
||||
|
||||
/*****************************************************************/
|
||||
|
||||
// Instruction forms
|
||||
|
||||
// Type enumerations
|
||||
//
|
||||
|
||||
typedef enum
|
||||
{
|
||||
fGdWsd,
|
||||
fGdWss,
|
||||
fQqWpd,
|
||||
fQqWps,
|
||||
fVpdQq,
|
||||
fVpdWpd,
|
||||
fVpdWpdIb,
|
||||
fVpdWpdi,
|
||||
fVpdWps,
|
||||
fVpdiWpd,
|
||||
fVpdiWps,
|
||||
fVpsQq,
|
||||
fVpsWpd,
|
||||
fVpsWpdi,
|
||||
fVpsWps,
|
||||
fVpsWpsIb,
|
||||
fVsdEd,
|
||||
fVsdWsd,
|
||||
fVsdWsdIb,
|
||||
fVsdWss,
|
||||
fVssEd,
|
||||
fVssWsd,
|
||||
fVssWss,
|
||||
fVssWssIb
|
||||
} InstType;
|
||||
|
||||
// operand types
|
||||
typedef enum
|
||||
{
|
||||
oEd, //General register dword mod R/M
|
||||
oGd, //General register dword
|
||||
oQq, // MMX quadword mod R/M
|
||||
oVpd, // XMM register
|
||||
oVpdi,
|
||||
oVps,
|
||||
oVsd,
|
||||
oVss,
|
||||
oWpd, // XMM mod R/M
|
||||
oWpdi,
|
||||
oWps,
|
||||
oWsd,
|
||||
oWss
|
||||
} OpType;
|
||||
|
||||
// operand class
|
||||
typedef enum
|
||||
{
|
||||
oXMMreg,
|
||||
oXMMmrm,
|
||||
oMMXreg,
|
||||
oMMXmrm,
|
||||
oGENreg,
|
||||
oGENmrm,
|
||||
} OpClass;
|
||||
|
||||
// data types
|
||||
typedef enum
|
||||
{
|
||||
dDW, // integer DWORD
|
||||
dPD, // packed double precision
|
||||
dPDI, // packed integer DWORD
|
||||
dPS, // packed single precision
|
||||
dQ, // integer quadword
|
||||
dSD, // scalar double precision
|
||||
dSS // scalar single precision
|
||||
} DataType;
|
||||
|
||||
/*****************************************************************/
|
||||
|
||||
// Structure definitions
|
||||
//
|
||||
|
||||
|
||||
// define the format of the data used by
|
||||
// the FXSAVE and FXRSTOR commands
|
||||
typedef struct
|
||||
{
|
||||
MMX80 mmx; // the mmx/fp register
|
||||
unsigned short reserved[3]; // floating point regs are 80 bits
|
||||
} FPMMX;
|
||||
|
||||
#if defined (_AMD64_)
|
||||
// x86-64 version
|
||||
typedef struct _FXMM_SAVE_AREA {
|
||||
WORD ControlWord;
|
||||
WORD StatusWord;
|
||||
WORD TagWord;
|
||||
WORD OpCode;
|
||||
QWORD ErrorOffset;
|
||||
QWORD DataOffset;
|
||||
DWORD Mxcsr;
|
||||
DWORD reserved3;
|
||||
FPMMX FMMXreg[8];
|
||||
ML128 XMMreg[16];
|
||||
} FXMM_SAVE_AREA;
|
||||
#else
|
||||
// 32 bit x86 version
|
||||
typedef struct _FXMM_SAVE_AREA {
|
||||
WORD ControlWord;
|
||||
WORD StatusWord;
|
||||
WORD TagWord;
|
||||
WORD OpCode;
|
||||
DWORD ErrorOffset;
|
||||
WORD ErrorSelector;
|
||||
WORD reserved1;
|
||||
DWORD DataOffset;
|
||||
WORD DataSelector;
|
||||
WORD reserved2;
|
||||
DWORD Mxcsr;
|
||||
DWORD reserved3;
|
||||
FPMMX FMMXreg[8];
|
||||
ML128 XMMreg[8];
|
||||
} FXMM_SAVE_AREA;
|
||||
#endif
|
||||
typedef FXMM_SAVE_AREA *PFXMM_SAVE_AREA;
|
||||
|
||||
/* This structure is used to access the excepting opcode */
|
||||
typedef struct {
|
||||
unsigned char opcode;
|
||||
unsigned char rmbyte;
|
||||
union {
|
||||
unsigned long offset; // this will need work for x86-64
|
||||
unsigned char imm8;
|
||||
} data;
|
||||
|
||||
} SIMD_OP, *PSIMD_OP;
|
||||
|
||||
// Define a SIMD exception flag type.
|
||||
// This is just like the _FPIEEE_EXCEPTION_FLAGS
|
||||
// except that it adds the denormal field.
|
||||
typedef struct {
|
||||
unsigned int Inexact : 1;
|
||||
unsigned int Underflow : 1;
|
||||
unsigned int Overflow : 1;
|
||||
unsigned int ZeroDivide : 1;
|
||||
unsigned int InvalidOperation : 1;
|
||||
unsigned int Denormal : 1;
|
||||
} _SIMD_EXCEPTION_FLAGS;
|
||||
|
||||
|
||||
/* define the local simd record structures */
|
||||
/* Local SIMD record: instruction decode fields (prefix, opcode, mod R/M,
   imm8, REX), operand class and data-format codes, the MXCSR value and
   the two operand values. */
typedef struct {
|
||||
unsigned int RoundingMode;
|
||||
_SIMD_EXCEPTION_FLAGS Cause;
|
||||
_SIMD_EXCEPTION_FLAGS Enable;
|
||||
_SIMD_EXCEPTION_FLAGS Status;
|
||||
PSIMD_OP opaddress; // points to 0F xx opcode
|
||||
int curAddr; // used when parsing mod R/M byte
|
||||
unsigned char prefix;
|
||||
unsigned char opcode;
|
||||
unsigned char rmbyte;
|
||||
unsigned char immediate8;
|
||||
// add a rex field for x86-64
|
||||
unsigned char rex;
|
||||
int eopcode; // encoded opcode (index for tables)
|
||||
int op_form; // NOTE(review): presumably an InstType value - confirm
|
||||
int op1_class; // XMM, MMX, or gen register
|
||||
int op1_type; // data format
|
||||
int op2_class; // NOTE(review): presumably same encoding as op1_class - confirm
|
||||
int op2_type; // NOTE(review): presumably same encoding as op1_type - confirm
|
||||
int is_commiss;
|
||||
int commiss_val;
|
||||
unsigned int mxcsr; // value of mxcsr from context record.
|
||||
ML128 op1_value;
|
||||
ML128 op2_value;
|
||||
ML128 *op2_ptr;
|
||||
|
||||
} _SIMD_RECORD, *_PSIMD_RECORD;
|
||||
|
||||
/* define a record for the operand form table */
|
||||
typedef struct {
|
||||
int op1; // form of operand 1
|
||||
int op2; // form of operand 2
|
||||
} _OPERAND_RECORD;
|
||||
|
511
sdk/lib/crt/math/libm_sse2/sin.asm
Normal file
511
sdk/lib/crt/math/libm_sse2/sin.asm
Normal file
|
@ -0,0 +1,511 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentaon files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;
|
||||
; An implementation of the sin function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; double sin(double x);
|
||||
;
|
||||
; Computes sin(x).
|
||||
; It will provide proper C99 return values,
|
||||
; but may not raise floating point status bits properly.
|
||||
; Based on the NAG C implementation.
|
||||
;
|
||||
; If FMA3 hardware is available, an FMA3 implementation of sin will be used.
|
||||
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1
|
||||
DQ 0
|
||||
L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail
|
||||
DQ 0
|
||||
L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2
|
||||
DQ 0
|
||||
L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail
|
||||
DQ 0
|
||||
ALIGN 16
|
||||
|
||||
L_one DQ 03FF0000000000000h, 03FF0000000000000h
|
||||
L_signbit DQ 08000000000000000h, 00000000000000000h
|
||||
L_int_one DQ 00000000000000001h, 00000000000000000h
|
||||
L_int_two DQ 00000000000000002h, 00000000000000000h
|
||||
L_int_three DQ 00000000000000003h, 00000000000000000h
|
||||
|
||||
L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi
|
||||
L_one_half DQ 03FE0000000000000h ; .5
|
||||
L_one_sixth DQ 03FC5555555555555h ; .1666...
|
||||
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
|
||||
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
|
||||
L_piby4 DQ 03FE921FB54442D18h ; pi/4
|
||||
L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW
|
||||
L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
|
||||
|
||||
L__inf_mask_64 DQ 07FF0000000000000h ; +Inf
|
||||
|
||||
EXTRN __Lcosarray:QWORD
|
||||
EXTRN __Lsinarray:QWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
|
||||
; define local variable storage offsets
|
||||
p_temp EQU 030h
|
||||
p_temp1 EQU 040h
|
||||
save_r10 EQU 050h
|
||||
dummy_space EQU 060h
|
||||
stack_size EQU 078h
|
||||
|
||||
include fm.inc
|
||||
|
||||
fname TEXTEQU <sin>
|
||||
fname_special TEXTEQU <_sin_special>
|
||||
|
||||
;Define name and any external functions being called
|
||||
EXTERN __remainder_piby2_forAsm : PROC
|
||||
EXTERN __remainder_piby2_fma3 : PROC
|
||||
EXTERN __remainder_piby2_fma3_bdl : PROC
|
||||
EXTERN fname_special : PROC
|
||||
|
||||
.code
|
||||
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
.ENDPROLOG
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Lsin_fma3
|
||||
|
||||
Lsin_sse2:
|
||||
movd rdx, xmm0
|
||||
xorpd xmm2, xmm2 ; zeroed out for later use
|
||||
|
||||
mov r10,rdx
|
||||
mov r8d, 1 ; for determining region later on
|
||||
btr r10,63 ; r10 <-- |x|
|
||||
cmp r10,L_piby4
|
||||
jb Lsin_sse2_absx_lt_piby4
|
||||
|
||||
Lsin_sse2_absx_nlt_piby4: ; common case
|
||||
mov r11,rdx
|
||||
shr r11,63
|
||||
movd xmm0,r10 ; xmm0 <-- |x|
|
||||
cmp r10, QWORD PTR L_small_arg_cw
|
||||
jae Lsin_reduce_precise ; Note NaN/Inf will branch
|
||||
|
||||
; At this point we have |x| < L_small_arg_cw, which is currently 500000.
|
||||
; Note that if |x| were too large, conversion of npi2 to integer would fail.
|
||||
; We reduce the argument to be in a range from -pi/4 to +pi/4
|
||||
; by subtracting multiples of pi/2
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, L_2_by_pi
|
||||
movapd xmm4, xmm0
|
||||
|
||||
; xexp = ax >> EXPSHIFTBITS_DP64;
|
||||
mov r9, r10
|
||||
shr r9, 52 ; >>EXPSHIFTBITS_DP64
|
||||
|
||||
; How many pi/2 is |x| a multiple of?
|
||||
; npi2 = (int)(x * twobypi + 0.5);
|
||||
addsd xmm2, L_one_half ; npi2
|
||||
|
||||
movsd xmm3, L_real_piby2_1
|
||||
cvttpd2dq xmm0, xmm2 ; convert npi2 to integer
|
||||
movsd xmm1, L_real_piby2_1tail
|
||||
cvtdq2pd xmm2, xmm0 ; npi2 back to double
|
||||
|
||||
; Subtract the multiple from x to get an extra-precision remainder
|
||||
; rhead = x - npi2 * piby2_1;
|
||||
mulsd xmm3, xmm2
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_1tail;
|
||||
mulsd xmm1, xmm2 ; rtail
|
||||
movd eax, xmm0 ; eax <-- npi2
|
||||
|
||||
; GET_BITS_DP64(rhead-rtail, uy);
|
||||
; originally only rhead
|
||||
movapd xmm0, xmm4
|
||||
subsd xmm0, xmm1
|
||||
|
||||
movsd xmm3, L_real_piby2_2
|
||||
movd rcx, xmm0 ; rcx <-- rhead - rtail
|
||||
movsd xmm5, L_real_piby2_2tail ; piby2_2tail
|
||||
|
||||
; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc,
|
||||
; xmm4=rhead, xmm5= temp for calc
|
||||
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
; expdiff measures how close rhead - rtail is to |x|
|
||||
; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer)
|
||||
shl rcx, 1 ; strip any sign bit
|
||||
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
|
||||
sub r9, rcx ; expdiff
|
||||
|
||||
;; if (expdiff > 15)
|
||||
cmp r9, 15
|
||||
jle Lsin_sse2_cw_reduction_done
|
||||
|
||||
; Here the remainder is pretty small compared with x, which
|
||||
; implies that x is a near multiple of pi/2
|
||||
; (x matches the multiple to at least 15 bits)
|
||||
; So we do another stage of argument reduction.
|
||||
|
||||
; t = rhead;
|
||||
movapd xmm1, xmm4
|
||||
|
||||
; rtail = npi2 * piby2_2;
|
||||
mulsd xmm3, xmm2
|
||||
|
||||
; rhead = t - rtail;
|
||||
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
subsd xmm1, xmm4 ; t - rhead
|
||||
subsd xmm1, xmm3 ; -rtail
|
||||
subsd xmm5, xmm1 ; rtail
|
||||
|
||||
; r = rhead - rtail;
|
||||
movapd xmm0, xmm4
|
||||
|
||||
;HARSHA
|
||||
;xmm1=rtail
|
||||
movapd xmm1, xmm5 ; xmm1 <-- copy of rtail
|
||||
subsd xmm0, xmm5
|
||||
|
||||
|
||||
; xmm0=r, xmm4=rhead, xmm1=rtail
|
||||
Lsin_sse2_cw_reduction_done:
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; if the input was close to a pi/2 multiple
|
||||
; The original NAG code missed this trick.
|
||||
; If the input is very close to n*pi/2 after reduction, so r < 2^-27,
|
||||
; then the sin is either ~ 1.0 or ~r, to within 53 bits.
|
||||
|
||||
; Note: Unfortunately this introduces two jcc instructions close to each
|
||||
; other and to other branches. As r < 2^-13 should be rather uncommon, it
|
||||
; almost certainly costs more than it saves. - WAT
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region = npi2 & 3;
|
||||
|
||||
subsd xmm4, xmm0 ; rhead-r
|
||||
subsd xmm4, xmm1 ; rr = (rhead-r) - rtail
|
||||
|
||||
Lsin_piby4:
|
||||
; perform taylor series to calc sinx, cosx for |x| <= pi/4
|
||||
; x2 = r * r;
|
||||
|
||||
;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path
|
||||
;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
|
||||
movapd xmm3, xmm0
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ;x2
|
||||
|
||||
bt eax,0
|
||||
jc Lsin_sse2_calc_cos
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region 0 or 2 do a sin calculation
|
||||
movsd xmm3, __Lsinarray+50h ; s6
|
||||
mulsd xmm3, xmm2 ; x2s6
|
||||
movsd xmm5, __Lsinarray+20h ; s3
|
||||
movsd QWORD PTR p_temp[rsp], xmm4 ; store xx
|
||||
movapd xmm1, xmm2 ; move for x4
|
||||
mulsd xmm1, xmm2 ; x4
|
||||
movsd QWORD PTR p_temp1[rsp], xmm0 ; store x
|
||||
mulsd xmm5, xmm2 ; x2s3
|
||||
movapd xmm4, xmm0 ; move for x3
|
||||
addsd xmm3, __Lsinarray+40h ; s5+x2s6
|
||||
mulsd xmm1, xmm2 ; x6
|
||||
mulsd xmm3, xmm2 ; x2(s5+x2s6)
|
||||
mulsd xmm4, xmm2 ; x3
|
||||
addsd xmm5, __Lsinarray+10h ; s2+x2s3
|
||||
mulsd xmm5, xmm2 ; x2(s2+x2s3)
|
||||
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
|
||||
mulsd xmm2, L_one_half ; 0.5 *x2
|
||||
movsd xmm0, QWORD PTR p_temp[rsp] ; load xx
|
||||
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
|
||||
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
|
||||
mulsd xmm2, xmm0 ; 0.5 * x2 *xx
|
||||
addsd xmm3, xmm5 ; zs
|
||||
mulsd xmm4, xmm3 ; *x3
|
||||
subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx
|
||||
addsd xmm0, xmm4 ; +xx
|
||||
addsd xmm0, QWORD PTR p_temp1[rsp] ; +x
|
||||
|
||||
jmp Lsin_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lsin_sse2_calc_cos:
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region 1 or 3 - do a cos calculation
; On entry: xmm0 = x (reduced argument r), xmm2 = x2 = x*x, xmm4 = xx (rr).
; On exit:  xmm0 = t + (x4*zc + (((1 - t) - r) - x*xx)), where r = 0.5*x2 and
;           t = 1 - r; execution falls through to Lsin_sse2_adjust_region.
; Uses p_temp[rsp] as scratch for x*xx; overwrites xmm1-xmm5.
|
||||
; zc = c1 + x2 * (c2 + x2 * c3) + x6 * (c4 + x2 * (c5 + x2 * c6));
; (c1..c6 are read from __Lcosarray at offsets 0, 10h, 20h, 30h, 40h, 50h)
|
||||
mulsd xmm4, xmm0 ; x*xx
|
||||
movsd xmm5, L_one_half
|
||||
movsd xmm1, __Lcosarray+50h ; c6
|
||||
movsd xmm0, __Lcosarray+20h ; c3
|
||||
mulsd xmm5, xmm2 ; r = 0.5 *x2
|
||||
movapd xmm3, xmm2 ; copy of x2
|
||||
movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx
|
||||
mulsd xmm1, xmm2 ; c6*x2
|
||||
mulsd xmm0, xmm2 ; c3*x2
|
||||
subsd xmm5, L_one ; -t=r-1.0, trash r
|
||||
mulsd xmm3, xmm2 ; x4
|
||||
addsd xmm1, __Lcosarray+40h ; c5+x2c6
|
||||
addsd xmm0, __Lcosarray+10h ; c2+x2C3
|
||||
addsd xmm5, L_one ; 1 + (-t) = 1 - t, trash -t
|
||||
mulsd xmm3, xmm2 ; x6
|
||||
mulsd xmm1, xmm2 ; x2(c5+x2c6)
|
||||
mulsd xmm0, xmm2 ; x2(c2+x2C3)
|
||||
movapd xmm4, xmm2 ; copy of x2
|
||||
mulsd xmm4, L_one_half ; r recalculate
|
||||
addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6)
|
||||
addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3)
|
||||
mulsd xmm2, xmm2 ; x4 recalculate
|
||||
subsd xmm5, xmm4 ; (1 + (-t)) - r
|
||||
mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6))
|
||||
addsd xmm0, xmm1 ; zc
|
||||
subsd xmm4, L_one ; -t recalculated (r - 1.0)
|
||||
subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx
|
||||
mulsd xmm0, xmm2 ; x4 * zc
|
||||
addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx)
|
||||
subsd xmm0, xmm4 ; result - (-t), i.e. add t back in last
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
Lsin_sse2_adjust_region:
|
||||
; positive or negative
; Decide the sign of the result from bit 1 of the region (eax) and the
; sign of the original argument (bit 0 of r11d, set by shr r11,63).
; The sequence below computes XNOR(region bit 1, sign): when the two bits
; are equal the value in xmm0 is returned as is, otherwise it is negated.
|
||||
; switch (region)
|
||||
shr eax, 1 ; bit 0 of eax <-- bit 1 of region
|
||||
mov ecx, eax
|
||||
and eax, r11d ; eax <-- a & s
|
||||
|
||||
not ecx
|
||||
not r11d
|
||||
and ecx, r11d ; ecx <-- ~a & ~s
|
||||
|
||||
or eax, ecx ; eax <-- (a & s) | (~a & ~s)
|
||||
and eax, 1 ; keep only bit 0
|
||||
jnz Lsin_sse2_cleanup ; bits equal: no sign change needed
|
||||
|
||||
;; if the original region 0, 1 and arg is negative, then we negate the result.
|
||||
;; if the original region 2, 3 and arg is positive, then we negate the result.
|
||||
movapd xmm2, xmm0
|
||||
xorpd xmm0, xmm0
|
||||
subsd xmm0, xmm2 ; xmm0 <-- 0.0 - result
|
||||
|
||||
ALIGN 16
|
||||
Lsin_sse2_cleanup:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lsin_sse2_absx_lt_piby4:
|
||||
; sin = sin_piby4(x, 0.0);
|
||||
|
||||
; x2 = r * r;
|
||||
movapd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; x2
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region 0 - do a sin calculation
|
||||
; zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6))));
|
||||
movsd xmm3, __Lsinarray+50h ; s6
|
||||
mulsd xmm3, xmm2 ; x2s6
|
||||
movsd xmm5, __Lsinarray+20h ; s3
|
||||
movapd xmm1, xmm2 ; move for x4
|
||||
mulsd xmm1, xmm2 ; x4
|
||||
mulsd xmm5, xmm2 ; x2s3
|
||||
movapd xmm4, xmm0 ; move for x3
|
||||
addsd xmm3, __Lsinarray+40h ; s5+x2s6
|
||||
mulsd xmm1, xmm2 ; x6
|
||||
mulsd xmm3, xmm2 ; x2(s5+x2s6)
|
||||
mulsd xmm4, xmm2 ; x3
|
||||
addsd xmm5, __Lsinarray+10h ; s2+x2s3
|
||||
mulsd xmm5, xmm2 ; x2(s2+x2s3)
|
||||
addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6)
|
||||
mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6))
|
||||
addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3)
|
||||
addsd xmm3, xmm5 ; zs
|
||||
mulsd xmm4, xmm3 ; *x3
|
||||
addsd xmm0, xmm4 ; +x
|
||||
jmp Lsin_sse2_cleanup
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
ALIGN 16
|
||||
Lsin_reduce_precise:
|
||||
; Reduce x into range [-pi/4, pi/4]
; Reached when |x| >= L_small_arg_cw. r10 holds the bit pattern of |x|,
; r11 holds the sign bit of x (0 or 1).
; NOTE(review): xmm0 was overwritten with |x| before the branch here, so
; the call at Lsin_x_naninf receives the sign-stripped argument on the
; SSE2 path, while the FMA3 path still has the original x - confirm that
; this difference is intended.
|
||||
cmp r10,L__inf_mask_64
|
||||
jae Lsin_x_naninf ; unsigned compare: Inf and NaN bit patterns branch
|
||||
mov QWORD PTR p_temp[rsp], r11 ; r11 is volatile in the Microsoft x64 ABI; keep the sign across the call
|
||||
call __remainder_piby2_forAsm
|
||||
mov r11, QWORD PTR p_temp[rsp]
|
||||
|
||||
; At this point xmm0 has r, xmm1 has rr, rax has region
|
||||
|
||||
movapd xmm4, xmm1 ; xmm4 <-- rr
|
||||
jmp Lsin_piby4
|
||||
|
||||
; xmm0 = x, xmm4 = xx, eax= region
|
||||
|
||||
|
||||
ALIGN 16
|
||||
Lsin_x_naninf:
|
||||
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; From this point we assume that FMA3 and AVX hardware are present.
|
||||
|
||||
ALIGN 16
|
||||
Lsin_fma3:
|
||||
vmovq r9,xmm0
|
||||
mov r10,r9 ; save x to get sign later
|
||||
btr r9,63 ; r9 <-- |x|
|
||||
cmp r9,L_piby4
|
||||
jae Lsin_fma3_absx_nlt_piby4 ; Note that NaN will branch
|
||||
cmp r9,L_two_to_neg_13
|
||||
jae Lsin_fma3_calc_sin_for_absx_lt_piby4
|
||||
cmp r9,L_two_to_neg_27
|
||||
jae Lsin_fma3_compute_x_xxx_0_1666
|
||||
StackDeallocate stack_size
|
||||
ret ; sin x ~= x for |x| < 2^-27
|
||||
|
||||
ALIGN 16
|
||||
Lsin_fma3_compute_x_xxx_0_1666: ; |x| in [2^-27,2^-13]
|
||||
vmulsd xmm1,xmm0,xmm0 ; xmm1l <-- x*x
|
||||
vmulsd xmm1,xmm1,xmm0 ; xmm1l <-- x*x*x
|
||||
vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6)
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4]
|
||||
vmovsd xmm5,__Lsinarray+050h
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3l <-- x^2
|
||||
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+040h
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+030h
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+020h
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+010h
|
||||
|
||||
vmulsd xmm4,xmm0,xmm3 ; xmm4l <-- x^3
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray
|
||||
vfmadd231sd xmm0,xmm4,xmm5 ; xmm0l <-- x + x^3 p(x^2)
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lsin_fma3_absx_nlt_piby4: ; !(|x| < pi/4)
|
||||
; here r9 has |x|
|
||||
cmp r9,L__inf_mask_64
|
||||
jae Lsin_x_naninf
|
||||
;Lrange_reduce: ;; unused label
|
||||
|
||||
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||||
cmp r9,L_small_arg_bdl
|
||||
jae Lsin_fma3_do_general_arg_reduction
|
||||
|
||||
; Note that __remainder_piby2_fma3 conventions are
|
||||
; on input
|
||||
; |x| is in xmm0
|
||||
; on output
|
||||
; r is in xmm0
|
||||
; rr is in xmm1
|
||||
; region of |x| is in rax
|
||||
|
||||
; Boldo-Daumas-Li reduction for reasonably small |x|
|
||||
call __remainder_piby2_fma3_bdl
|
||||
Lsin_fma3_exit_s:
|
||||
bt rax,0
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x
|
||||
jc Lsin_fma3_calc_cos
|
||||
|
||||
Lsin_fma3_calc_sin: ;; unused label
|
||||
; region 0 or 2
|
||||
; compute the sine of r+rr, where this sum is in [-pi/4,pi/4]
; On entry: xmm0 = x (reduced r), xmm1 = xx (rr), xmm3 = x2 = x*x.
; Computes  sin = x - ((x2*(0.5*xx - x3*r) - xx) - x3*s1)  with
;           r = s2 + x2*(s3 + x2*(s4 + x2*(s5 + x2*s6)))
; where s1..s6 are read from __Lsinarray at offsets 0, 10h, ..., 50h.
|
||||
vmovsd xmm5,__Lsinarray+050h ; xmm5 <-- s6
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+040h ; xmm5 <-- xmm5*x2 + s5
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+030h ; xmm5 <-- xmm5*x2 + s4
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+020h ; xmm5 <-- xmm5*x2 + s3
|
||||
vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r
|
||||
|
||||
vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x
|
||||
vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r
|
||||
vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*xx
|
||||
vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*xx - x*x*x*r
|
||||
vmulsd xmm2,xmm3,xmm2 ; xmm2 <-- x2*(.5*xx - x3*r)
|
||||
vsubsd xmm2,xmm2,xmm1 ; xmm2 <-- x2*(.5*xx - x3*r) - xx
|
||||
vfnmadd231sd xmm2, xmm4,__Lsinarray ; xmm2 <-- xmm2 - x3*s1
|
||||
vsubsd xmm0,xmm0,xmm2 ; xmm0 <-- x - xmm2 = sin(x + xx)
|
||||
jmp Lsin_fma3_exit_s_1
|
||||
|
||||
ALIGN 16
|
||||
Lsin_fma3_calc_cos:
|
||||
; region 1 or 3
|
||||
; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4]
|
||||
vmovapd xmm2,L_one
|
||||
vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r
|
||||
vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5
|
||||
vsubsd xmm2,xmm2,xmm4 ; 1-t
|
||||
vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r
|
||||
vmovsd xmm5,__Lcosarray+050h
|
||||
vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2
|
||||
vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray+040h
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray+030h
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray+020h
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray+010h
|
||||
vfmadd213sd xmm5,xmm3,__Lcosarray
|
||||
vfmadd213sd xmm5,xmm1,xmm2
|
||||
vaddsd xmm0,xmm5,xmm4
|
||||
|
||||
Lsin_fma3_exit_s_1:
|
||||
xor r8,r8 ; prepare r8 for cmov
|
||||
and r10,L_signbit ; isolate original sign of x
|
||||
bt eax,1
|
||||
cmovc r8,L_signbit
|
||||
xor r8,r10
|
||||
vmovq xmm3,r8
|
||||
vxorpd xmm0,xmm0,xmm3
|
||||
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lsin_fma3_do_general_arg_reduction:
|
||||
; argument reduction for general x
|
||||
|
||||
; NOTE: the BDL argument reduction routine does not touch r10,
|
||||
; but the general-purpose reduction does.
|
||||
mov QWORD PTR [save_r10+rsp], r10
|
||||
call __remainder_piby2_fma3
|
||||
mov r10, QWORD PTR [save_r10+rsp]
|
||||
jmp Lsin_fma3_exit_s
|
||||
|
||||
fname endp
|
||||
END
|
||||
|
130
sdk/lib/crt/math/libm_sse2/sincos_special.c
Normal file
130
sdk/lib/crt/math/libm_sse2/sincos_special.c
Normal file
|
@ -0,0 +1,130 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentaon files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <fpieee.h>
|
||||
#include <excpt.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "libm_new.h"
|
||||
|
||||
double _sincos_special(double x, char *name, unsigned int operation)
|
||||
{
|
||||
UT64 xu;
|
||||
unsigned int is_snan;
|
||||
|
||||
xu.f64 = x;
|
||||
|
||||
if((xu.u64 & INF_POS_64) == INF_POS_64)
|
||||
{
|
||||
// x is Inf or NaN
|
||||
if((xu.u64 & MANTISSA_MASK_64) == 0x0)
|
||||
{
|
||||
// x is Inf
|
||||
xu.u64 = IND_64;
|
||||
_handle_error(name, operation, xu.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// x is NaN
|
||||
is_snan = (((xu.u64 & QNAN_MASK_64) == QNAN_MASK_64) ? 0 : 1);
|
||||
if(is_snan)
|
||||
{
|
||||
xu.u64 |= QNAN_MASK_64;
|
||||
}
|
||||
_handle_error(name, operation, xu.u64, _DOMAIN, 0, EDOM, x, 0, 1);
|
||||
}
|
||||
}
|
||||
|
||||
return xu.f64;
|
||||
}
|
||||
|
||||
float _sincosf_special(float x, char *name, unsigned int operation)
|
||||
{
|
||||
UT64 xu;
|
||||
unsigned int is_snan;
|
||||
|
||||
xu.u64 = 0;
|
||||
xu.f32[0] = x;
|
||||
|
||||
if((xu.u32[0] & INF_POS_32) == INF_POS_32)
|
||||
{
|
||||
// x is Inf or NaN
|
||||
if((xu.u32[0] & MANTISSA_MASK_32) == 0x0)
|
||||
{
|
||||
// x is Inf
|
||||
xu.u32[0] = IND_32;
|
||||
_handle_errorf(name, operation, xu.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// x is NaN
|
||||
is_snan = (((xu.u32[0] & QNAN_MASK_32) == QNAN_MASK_32) ? 0 : 1);
|
||||
if(is_snan)
|
||||
{
|
||||
xu.u32[0] |= QNAN_SET_32;
|
||||
_handle_errorf(name, operation, xu.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
_handle_errorf(name, operation, xu.u64, _DOMAIN, 0, EDOM, x, 0, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return xu.f32[0];
|
||||
}
|
||||
|
||||
/* Special-input handler for sinf: forwards x to _sincosf_special with the
   name "sinf" and operation code _FpCodeSin, and returns its result. */
float _sinf_special(float x)
|
||||
{
|
||||
return _sincosf_special(x, "sinf", _FpCodeSin);
|
||||
}
|
||||
|
||||
/* Special-input handler for sin: forwards x to _sincos_special with the
   name "sin" and operation code _FpCodeSin, and returns its result. */
double _sin_special(double x)
|
||||
{
|
||||
return _sincos_special(x, "sin", _FpCodeSin);
|
||||
}
|
||||
|
||||
/* Special-input handler for cosf: forwards x to _sincosf_special with the
   name "cosf" and operation code _FpCodeCos, and returns its result. */
float _cosf_special(float x)
|
||||
{
|
||||
return _sincosf_special(x, "cosf", _FpCodeCos);
|
||||
}
|
||||
|
||||
/* Special-input handler for cos: forwards x to _sincos_special with the
   name "cos" and operation code _FpCodeCos, and returns its result. */
double _cos_special(double x)
|
||||
{
|
||||
return _sincos_special(x, "cos", _FpCodeCos);
|
||||
}
|
||||
|
||||
/* Special-input handler for tan: forwards x to _sincos_special with the
   name "tan" and operation code _FpCodeTan, and returns its result. */
double _tan_special(double x)
|
||||
{
|
||||
return _sincos_special(x, "tan",_FpCodeTan);
|
||||
}
|
||||
|
||||
/* Special-input handler for tanf: forwards x to _sincosf_special with the
   name "tanf" and operation code _FpCodeTan, and returns its result. */
float _tanf_special(float x)
|
||||
{
|
||||
return _sincosf_special(x, "tanf",_FpCodeTan);
|
||||
}
|
664
sdk/lib/crt/math/libm_sse2/sinf.asm
Normal file
664
sdk/lib/crt/math/libm_sse2/sinf.asm
Normal file
|
@ -0,0 +1,664 @@
|
|||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;
|
||||
; An implementation of the sinf function.
|
||||
;
|
||||
; Prototype
|
||||
;
|
||||
; float sinf(float x);
|
||||
;
|
||||
; Computes sinf(x).
|
||||
; It will provide proper C99 return values,
|
||||
; but may not raise floating point status bits properly.
|
||||
; Based on the NAG C implementation.
|
||||
;
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
L_signbit DQ 08000000000000000h
|
||||
DQ 08000000000000000h
|
||||
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
|
||||
DQ 07FFFFFFFFFFFFFFFh
|
||||
L_one DQ 03FF0000000000000h
|
||||
DQ 03FF0000000000000h
|
||||
L_int_three DQ 00000000000000003h
|
||||
DQ 00000000000000003h
|
||||
L_one_half DQ 03FE0000000000000h
|
||||
DQ 03FE0000000000000h
|
||||
L_twobypi DQ 03FE45F306DC9C883h
|
||||
DQ 03FE45F306DC9C883h
|
||||
L_piby2_1 DQ 03FF921FB54400000h
|
||||
DQ 03FF921FB54400000h
|
||||
L_one_sixth DQ 03FC5555555555555h
|
||||
DQ 03FC5555555555555h
|
||||
L_piby2_1tail DQ 03DD0B4611A626331h
|
||||
DQ 03DD0B4611A626331h
|
||||
L_piby2_2 DQ 03dd0b4611a600000h
|
||||
DQ 03dd0b4611a600000h
|
||||
L_piby2_2tail DQ 03ba3198a2e037073h
|
||||
DQ 03ba3198a2e037073h
|
||||
L_inf_mask_32 DD 07F800000h
|
||||
DD 07F800000h
|
||||
DQ 07F8000007F800000h
|
||||
L_int_two DQ 00000000000000002h
|
||||
DQ 00000000000000002h
|
||||
L_piby2_lead DQ 03ff921fb54442d18h
|
||||
DQ 03ff921fb54442d18h
|
||||
L_piby4 DQ 03fe921fb54442d18h
|
||||
DQ 03fe921fb54442d18h
|
||||
L_mask_3f2 DQ 03f20000000000000h
|
||||
DQ 03f20000000000000h
|
||||
L_mask_3f8 DQ 03f80000000000000h
|
||||
DQ 03f80000000000000h
|
||||
|
||||
; Do these really need to be different?
|
||||
L_large_x_fma3 DQ 04170008AC0000000h ; 16779436
|
||||
L_large_x_sse2 DQ 0416E848000000000h ; 16000000
|
||||
|
||||
EXTRN __Lcosfarray:QWORD
|
||||
EXTRN __Lsinfarray:QWORD
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
EXTRN __L_2_by_pi_bits:BYTE
|
||||
|
||||
; define local variable storage offsets (relative to rsp after StackAllocate)
;
; Offsets 000h-01Fh are deliberately left unused: under the Microsoft x64
; calling convention the 32 bytes at [rsp+0 .. rsp+1Fh] at a call site are
; the callee's home (shadow) space and may be overwritten by any function
; this routine calls. Anything that has to survive a call (p_temp holds the
; saved r11 across the call to __remainder_piby2d2f_forC) must therefore
; live at 020h or above.
region      EQU     020h    ; region output written by remainder_piby2
r           EQU     028h    ; r (reduced argument) written by remainder_piby2
p_temp      EQU     030h    ; temporary for get/put bits operation
p_temp1     EQU     038h    ; temporary for get/put bits operation
dummy_space EQU     040h

stack_size  EQU     058h    ; 058h + 8 (return address) keeps rsp 16-byte aligned
|
||||
|
||||
include fm.inc
|
||||
|
||||
fname TEXTEQU <sinf>
|
||||
fname_special TEXTEQU <_sinf_special>
|
||||
|
||||
;Define name and any external functions being called
|
||||
EXTRN __remainder_piby2d2f_forC : PROC ; NEAR
|
||||
EXTERN fname_special : PROC
|
||||
|
||||
.code
|
||||
ALIGN 16
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
.ENDPROLOG
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Lsinf_fma3
|
||||
|
||||
Lsinf_sse2:
|
||||
|
||||
xorpd xmm2, xmm2 ; zeroed out for later use
|
||||
|
||||
;; if NaN or inf
|
||||
movd edx, xmm0
|
||||
mov eax, 07f800000h
|
||||
mov r10d, eax
|
||||
and r10d, edx
|
||||
cmp r10d, eax
|
||||
jz Lsinf_sse2_naninf
|
||||
|
||||
; GET_BITS_DP64(x, ux);
|
||||
; get the input value to an integer register.
|
||||
cvtss2sd xmm0, xmm0 ; convert input to double.
|
||||
movd rdx, xmm0 ; rdx is ux
|
||||
|
||||
; ax = (ux & ~SIGNBIT_DP64);
|
||||
mov r10, rdx
|
||||
btr r10, 63 ; r10 is ax
|
||||
mov r8d, 1 ; for determining region later on
|
||||
|
||||
;; if (ax <= 0x3fe921fb54442d18) abs(x) <= pi/4
|
||||
mov rax, 03fe921fb54442d18h
|
||||
cmp r10, rax
|
||||
jg Lsinf_absx_gt_piby4
|
||||
|
||||
;; if (ax < 0x3f80000000000000) abs(x) < 2.0^(-7)
|
||||
mov rax, 3f80000000000000h
|
||||
cmp r10, rax
|
||||
jge Lsinf_sse2_small
|
||||
|
||||
;; if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13)
|
||||
mov rax, 3f20000000000000h
|
||||
cmp r10, rax
|
||||
jge Lsinf_sse2_smaller
|
||||
|
||||
; sinf = x;
|
||||
jmp Lsinf_sse2_cleanup
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_sse2_smaller:
|
||||
; sinf = x - x^3 * 0.1666666666666666666;
|
||||
movsd xmm2, xmm0
|
||||
movsd xmm4, QWORD PTR L_one_sixth ; 0.1666666666666666666
|
||||
mulsd xmm2, xmm2 ; x^2
|
||||
mulsd xmm2, xmm0 ; x^3
|
||||
mulsd xmm2, xmm4 ; x^3 * 0.1666666666666666666
|
||||
subsd xmm0, xmm2 ; x - x^3 * 0.1666666666666666666
|
||||
jmp Lsinf_sse2_cleanup
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_sse2_small:
|
||||
movsd xmm2, xmm0 ; x2 = r * r;
|
||||
mulsd xmm2, xmm0 ; x2
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; region 0 or 2 - do a sinf calculation
|
||||
; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
|
||||
movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4
|
||||
mulsd xmm1, xmm2 ; s4x2
|
||||
movsd xmm4, xmm2 ; move for x4
|
||||
movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm3, xmm0 ; move for x3
|
||||
mulsd xmm5, xmm2 ; s2x2
|
||||
mulsd xmm3, xmm2 ; x3
|
||||
addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
|
||||
mulsd xmm1, xmm4 ; s3x4+s4x6
|
||||
addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
|
||||
addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6
|
||||
mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)
|
||||
addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)
|
||||
jmp Lsinf_sse2_cleanup
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
ALIGN 16
|
||||
Lsinf_absx_gt_piby4:
|
||||
; xneg = (ax != ux);
|
||||
cmp rdx, r10
|
||||
mov r11d, 0
|
||||
;; if (xneg) x = -x;
|
||||
jz Lsinf_sse2_reduce_moderate
|
||||
|
||||
mov r11d, 1
|
||||
subsd xmm2, xmm0
|
||||
movsd xmm0, xmm2
|
||||
|
||||
Lsinf_sse2_reduce_moderate:
|
||||
;; if (|x| < 1.6e7)  -- threshold is L_large_x_sse2 (16000000.0); larger values go to the helper
|
||||
cmp r10, QWORD PTR L_large_x_sse2
|
||||
jae Lsinf_sse2_reduce_large
|
||||
|
||||
; reduce the argument to be in a range from -pi/4 to +pi/4
|
||||
; by subtracting multiples of pi/2
|
||||
movsd xmm2, xmm0
|
||||
movsd xmm3, QWORD PTR L_twobypi
|
||||
movsd xmm4, xmm0
|
||||
movsd xmm5, QWORD PTR L_one_half ; .5
|
||||
mulsd xmm2, xmm3
|
||||
|
||||
;/* How many pi/2 is x a multiple of? */
|
||||
; xexp = ax >> EXPSHIFTBITS_DP64;
|
||||
mov r9, r10
|
||||
shr r9, 52 ; >>EXPSHIFTBITS_DP64
|
||||
|
||||
; npi2 = (int)(x * twobypi + 0.5);
|
||||
addsd xmm2, xmm5 ; npi2
|
||||
|
||||
movsd xmm3, QWORD PTR L_piby2_1
|
||||
cvttpd2dq xmm0, xmm2 ; convert to integer
|
||||
movsd xmm1, QWORD PTR L_piby2_1tail
|
||||
cvtdq2pd xmm2, xmm0 ; and back to double.
|
||||
|
||||
; /* Subtract the multiple from x to get an extra-precision remainder */
|
||||
; rhead = x - npi2 * piby2_1;
|
||||
mulsd xmm3, xmm2
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_1tail;
|
||||
mulsd xmm1, xmm2
|
||||
movd eax, xmm0
|
||||
|
||||
; GET_BITS_DP64(rhead-rtail, uy);
|
||||
; originally only rhead
|
||||
movsd xmm0, xmm4
|
||||
subsd xmm0, xmm1
|
||||
|
||||
movsd xmm3, QWORD PTR L_piby2_2
|
||||
movd rcx, xmm0
|
||||
movsd xmm5, QWORD PTR L_piby2_2tail
|
||||
|
||||
; xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
|
||||
; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
shl rcx, 1 ; strip any sign bit
|
||||
shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1
|
||||
sub r9, rcx ; expdiff
|
||||
|
||||
;; if (expdiff > 15)
|
||||
cmp r9, 15
|
||||
jle Lsinf_sse2_expdiff_le_15
|
||||
|
||||
; The remainder is pretty small compared with x, which
|
||||
; implies that x is a near multiple of pi/2
|
||||
; (x matches the multiple to at least 15 bits)
|
||||
; t = rhead;
|
||||
movsd xmm1, xmm4
|
||||
|
||||
; rtail = npi2 * piby2_2;
|
||||
mulsd xmm3, xmm2
|
||||
|
||||
; rhead = t - rtail;
|
||||
mulsd xmm5, xmm2 ; npi2 * piby2_2tail
|
||||
subsd xmm4, xmm3 ; rhead
|
||||
|
||||
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
subsd xmm1, xmm4 ; t - rhead
|
||||
subsd xmm1, xmm3 ; -rtail
|
||||
subsd xmm5, xmm1 ; rtail
|
||||
|
||||
; r = rhead - rtail;
|
||||
movsd xmm0, xmm4
|
||||
|
||||
;HARSHA
|
||||
;xmm1=rtail
|
||||
movsd xmm1, xmm5
|
||||
subsd xmm0, xmm5
|
||||
|
||||
; xmm0=r, xmm4=rhead, xmm1=rtail
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
Lsinf_sse2_expdiff_le_15:
|
||||
cmp rcx, 03f2h ; is r < 2^-13 ?
|
||||
jge Lsinf_sse2_calc_sincosf_piby4 ; use taylor series if not
|
||||
cmp rcx, 03deh ; if r really small.
|
||||
jle Lsinf_sse2_r_very_small ; then sinf(r) ~ r or 1
|
||||
|
||||
movsd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; xmm2 <-- r^2
|
||||
|
||||
;; if region is 0 or 2 do a sinf calc.
|
||||
and r8d, eax
|
||||
jnz Lsinf_sse2_small_calc_sin
|
||||
|
||||
; region 0 or 2 do a sinf calculation
|
||||
; use simply polynomial
|
||||
; x - x*x*x*0.166666666666666666;
|
||||
movsd xmm3, QWORD PTR L_one_sixth
|
||||
mulsd xmm3, xmm0 ; * x
|
||||
mulsd xmm3, xmm2 ; * x^2
|
||||
subsd xmm0, xmm3 ; xs
|
||||
jmp Lsinf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_sse2_small_calc_sin:
|
||||
; region 1 or 3 do a cosf calculation
|
||||
; use simply polynomial
|
||||
; 1.0 - x*x*0.5;
|
||||
movsd xmm0, QWORD PTR L_one ; 1.0
|
||||
mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2
|
||||
subsd xmm0, xmm2 ; xc
|
||||
jmp Lsinf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_sse2_r_very_small:
|
||||
;; if region is 0 or 2 do a sinf calc. (sinf ~ x)
|
||||
and r8d, eax
|
||||
jz Lsinf_sse2_adjust_region
|
||||
|
||||
movsd xmm0, QWORD PTR L_one ; cosf(r) is a 1
|
||||
jmp Lsinf_sse2_adjust_region
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
ALIGN 16
|
||||
Lsinf_sse2_reduce_large:
    ; |x| is at or above L_large_x_sse2: let the helper do the reduction.
    ; Reduce x into range [-pi/4, pi/4]
    ; __remainder_piby2d2f_forC(x, &r, &region);
    ;   rcx = bit pattern of |x| (already made non-negative above, as a double)
    ;   rdx = &r       (helper stores the reduced argument here)
    ;   r8  = &region  (helper stores the region here)

    mov      QWORD PTR p_temp[rsp], r11     ; r11 = xneg; volatile, so save it across the call
    lea      rdx, QWORD PTR r[rsp]
    lea      r8, QWORD PTR region[rsp]
    movd     rcx, xmm0
    call     __remainder_piby2d2f_forC
    mov      r11, QWORD PTR p_temp[rsp]     ; restore xneg
    mov      r8d, 1                         ; for determining region later on

    ; The polynomial code that follows takes the reduced argument in xmm0.
    ; xmm0 is volatile across the call, so it must be reloaded from r here.
    ; (It was previously loaded into xmm1, which the polynomial code
    ; overwrites before ever reading it.)
    movsd    xmm0, QWORD PTR r[rsp]         ; xmm0 = r
    mov      eax, DWORD PTR region[rsp]     ; eax  = region

    ; xmm0 = r, r8d = 1, eax = region
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
; perform taylor series to calc sinfx, cosfx
|
||||
Lsinf_sse2_calc_sincosf_piby4:
|
||||
; x2 = r * r;
|
||||
movsd xmm2, xmm0
|
||||
mulsd xmm2, xmm0 ; x2
|
||||
|
||||
;; if region is 1 or 3, do a cosf calc.
|
||||
and r8d, eax
|
||||
jnz Lsinf_sse2_do_cosf_calc
|
||||
|
||||
; region is 0 or 2: do a sinf calc.
|
||||
; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
|
||||
Lsinf_sse2_do_sinf_calc:
|
||||
movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4
|
||||
mulsd xmm1, xmm2 ; s4x2
|
||||
movsd xmm4, xmm2 ; move for x4
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2
|
||||
mulsd xmm5, xmm2 ; s2x2
|
||||
movsd xmm3, xmm0 ; move for x3
|
||||
mulsd xmm3, xmm2 ; x3
|
||||
addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2
|
||||
mulsd xmm1, xmm4 ; s3x4+s4x6
|
||||
addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2
|
||||
addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6
|
||||
mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6)
|
||||
addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6)
|
||||
jmp Lsinf_sse2_adjust_region
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_sse2_do_cosf_calc:
|
||||
|
||||
; region 1 or 3 - do a cosf calculation
|
||||
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
|
||||
; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
|
||||
movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4
|
||||
movsd xmm4, xmm2 ; move for x4
|
||||
mulsd xmm1, xmm2 ; c4x2
|
||||
movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2
|
||||
mulsd xmm4, xmm2 ; x4
|
||||
movsd xmm0, QWORD PTR __Lcosfarray ; c0
|
||||
mulsd xmm3, xmm2 ; c2x2
|
||||
mulsd xmm0, xmm2 ; c0x2 (=-0.5x2)
|
||||
addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2
|
||||
mulsd xmm1, xmm4 ; c3x4 + c4x6
|
||||
addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2
|
||||
addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6
|
||||
mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10
|
||||
addsd xmm0, QWORD PTR L_one ; 1 - 0.5x2
|
||||
addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
Lsinf_sse2_adjust_region:
|
||||
; positive or negative
|
||||
; switch (region)
|
||||
shr eax, 1
|
||||
mov ecx, eax
|
||||
and eax, r11d
|
||||
|
||||
not ecx
|
||||
not r11d
|
||||
and ecx, r11d
|
||||
|
||||
or eax, ecx
|
||||
and eax, 1
|
||||
jnz Lsinf_sse2_cleanup
|
||||
|
||||
;; if the original region 0, 1 and arg is negative, then we negate the result.
|
||||
;; if the original region 2, 3 and arg is positive, then we negate the result.
|
||||
movsd xmm2, xmm0
|
||||
xorpd xmm0, xmm0
|
||||
subsd xmm0, xmm2
|
||||
|
||||
|
||||
Lsinf_sse2_cleanup:
|
||||
cvtsd2ss xmm0, xmm0
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_sse2_naninf:
|
||||
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3:
|
||||
vmovd eax,xmm0
|
||||
mov r8d,L_inf_mask_32
|
||||
and eax,r8d
|
||||
cmp eax, r8d
|
||||
jz Lsinf_fma3_naninf
|
||||
|
||||
vcvtss2sd xmm5,xmm0,xmm0
|
||||
vmovq r9,xmm5
|
||||
btr r9,63 ; r9 <-- |x|
|
||||
cmp r9,L_piby4
|
||||
jg Lsinf_fma3_range_reduce
|
||||
|
||||
cmp r9,L_mask_3f8
|
||||
jge Lsinf_fma3_compute_sinf_piby_4
|
||||
|
||||
cmp r9,L_mask_3f2
|
||||
jge Lsinf_fma3_compute_x_xxx_0_1666
|
||||
|
||||
; Here |x| < 2^-13; just return sin x ~ x
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3_compute_x_xxx_0_1666:
|
||||
; Here 2^-13 <= |x| < 2^-7; return sin x ~ x - 1/6 x^3
|
||||
vmulsd xmm1,xmm5,xmm5
|
||||
vmulsd xmm0,xmm1,xmm5 ; xmm0 <-- x^3 (xmm1 holds x^2, xmm5 holds x)
|
||||
vfnmadd132sd xmm0,xmm5,L_one_sixth ; x - x*x*x*0.166666666666666666
|
||||
jmp Lsinf_fma3_return_sinf_s
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3_compute_sinf_piby_4:
|
||||
vmovapd xmm0,xmm5
|
||||
vmovsd xmm1,__Lsinfarray+010h
|
||||
vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2
|
||||
vfmadd231sd xmm1,xmm3,__Lsinfarray+018h
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray+08h
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray
|
||||
vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3
|
||||
vfmadd231sd xmm0,xmm1,xmm3
|
||||
jmp Lsinf_fma3_return_sinf_s
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3_range_reduce:
|
||||
vmovq xmm0,r9 ; xmm0 <-- |x|
|
||||
cmp r9,L_large_x_fma3
|
||||
jge Lsinf_fma3_reduce_large
|
||||
|
||||
Lsinf_fma3_sinf_reduce_moderate:
|
||||
vandpd xmm1,xmm0,L_sign_mask ; xmm1 <-- |x| mov should suffice WAT
|
||||
vmovapd xmm2,L_twobypi
|
||||
vfmadd213sd xmm2,xmm1,L_one_half
|
||||
vcvttpd2dq xmm2,xmm2
|
||||
vpmovsxdq xmm1,xmm2
|
||||
vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region
|
||||
vshufps xmm1 ,xmm1,xmm1,8
|
||||
vcvtdq2pd xmm1,xmm1
|
||||
vmovdqa xmm2,xmm0
|
||||
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 <-- rhead
|
||||
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 <-- rtail
|
||||
vsubsd xmm0,xmm2,xmm3 ; xmm0 <-- r_1
|
||||
vsubsd xmm2,xmm2,xmm0
|
||||
vsubsd xmm1,xmm2,xmm3 ; xmm1 <-- rr_1 = (rhead - r_1) - rtail
|
||||
jmp Lsinf_fma3_exit_s
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3_reduce_large:
|
||||
lea r9,__L_2_by_pi_bits
|
||||
;xexp = (x >> 52) - 1023
|
||||
vmovq r11,xmm0
|
||||
mov rcx,r11
|
||||
shr r11,52
|
||||
sub r11,1023 ; r11 <-- xexp = exponent of input x
|
||||
;calculate the last byte from which to start multiplication
|
||||
;last = 134 - (xexp >> 3)
|
||||
mov r10,r11
|
||||
shr r10,3
|
||||
sub r10,134 ;r10 = (xexp >> 3) - 134 = -last
|
||||
neg r10 ;r10 = last = 134 - (xexp >> 3)
|
||||
;load 64 bits of 2_by_pi
|
||||
mov rax,[r9+r10]
|
||||
;mantissa of x = ((x << 12) >> 12) | implied bit
|
||||
shl rcx,12
|
||||
shr rcx,12 ;rcx = mantissa part of input x
|
||||
bts rcx,52 ;add the implied bit as well
|
||||
;load next 128 bits of 2_by_pi
|
||||
add r10,8 ;increment to next 8 bytes of 2_by_pi
|
||||
vmovdqu xmm0,XMMWORD PTR[r9+r10]
|
||||
;do three 64bit multiplications with mant of x
|
||||
mul rcx
|
||||
mov r8,rax ; r8 <-- last 64 bits of mul = res1[2]
|
||||
mov r10,rdx ; r10 <-- carry
|
||||
vmovq rax,xmm0
|
||||
mul rcx
|
||||
;resexp = xexp & 7
|
||||
and r11,7 ; r11 <-- resexp = last 3 bits
|
||||
vpsrldq xmm0,xmm0,8 ; bring the upper 64 bits of 2/pi down; VEX form keeps the FMA3 path free of legacy-SSE encodings
|
||||
add rax,r10 ; add the previous carry
|
||||
adc rdx,0
|
||||
mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]
|
||||
mov r10,rdx ; r10 <-- carry
|
||||
vmovq rax,xmm0
|
||||
mul rcx
|
||||
add r10,rax ; r10 = most sig 64 bits = res1[0]
|
||||
;find the region
|
||||
;last three bits ltb = most sig bits >> (54 - resexp))
|
||||
; decimal point in last 18 bits == 8 lsb's in first 64 bits
|
||||
; and 8 msb's in next 64 bits
|
||||
;point_five = ltb & 01h;
|
||||
;region = ((ltb >> 1) + point_five) & 3;
|
||||
mov rcx,54
|
||||
mov rax,r10
|
||||
sub rcx,r11
|
||||
xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi)
|
||||
shr rax,cl
|
||||
jnc Lsinf_fma3_no_point_five_f
|
||||
;;if there is carry.. then negate the result of multiplication
|
||||
not r10
|
||||
not r9
|
||||
not r8
|
||||
mov rdx,08000000000000000h
|
||||
|
||||
Lsinf_fma3_no_point_five_f:
|
||||
adc rax,0
|
||||
and rax,3
|
||||
vmovd xmm4,eax ;store region to xmm4
|
||||
;calculate the number of integer bits and zero them out
|
||||
mov rcx,r11
|
||||
add rcx,10 ; rcx <-- no. of integer bits
|
||||
shl r10,cl
|
||||
shr r10,cl ; r10 contains only mant bits
|
||||
sub rcx,64 ; form the exponent
|
||||
mov r11,rcx
|
||||
;find the highest set bit
|
||||
bsr rcx,r10
|
||||
jnz Lsinf_fma3_form_mantissa_f
|
||||
mov r10,r9
|
||||
mov r9,r8
|
||||
mov r8,0
|
||||
bsr rcx,r10 ; rcx <-- hsb
|
||||
sub r11,64
|
||||
|
||||
Lsinf_fma3_form_mantissa_f:
|
||||
add r11,rcx ;for exp of x
|
||||
sub rcx,52 ;rcx = no. of bits to shift in r10
|
||||
cmp rcx,0
|
||||
jl Lsinf_fma3_hsb_below_52_f
|
||||
je Lsinf_fma3_form_numbers_f
|
||||
;hsb above 52
|
||||
mov r8,r10 ; previous contents of r8 not required
|
||||
shr r10,cl ; r10 = mantissa of x with hsb at 52
|
||||
shr r9,cl ; make space for bits from r10
|
||||
sub rcx,64
|
||||
neg rcx
|
||||
shl r8,cl
|
||||
or r9,r8 ; r9 = mantissa bits of xx
|
||||
jmp Lsinf_fma3_form_numbers_f
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3_hsb_below_52_f:
|
||||
neg rcx
|
||||
mov rax,r9
|
||||
shl r10,cl
|
||||
shl r9,cl
|
||||
sub rcx,64
|
||||
neg rcx
|
||||
shr rax,cl
|
||||
or r10,rax
|
||||
shr r8,cl
|
||||
or r9,r8
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3_form_numbers_f:
|
||||
add r11,1023
|
||||
btr r10,52 ; remove the implied bit
|
||||
mov rcx,r11
|
||||
or r10,rdx ; put the sign
|
||||
shl rcx,52
|
||||
or r10,rcx ; r10 <-- x
|
||||
vmovq xmm0,r10 ; xmm0 <-- x
|
||||
vmulsd xmm0,xmm0,L_piby2_lead
|
||||
Lsinf_fma3_exit_s:
|
||||
vmovq rax,xmm4
|
||||
and rax,01h
|
||||
cmp rax,01h
|
||||
jz Lsinf_fma3_cos_piby4_compute
|
||||
|
||||
Lsinf_fma3_sin_piby4_compute:
|
||||
;; vmovapd xmm1,__Lsinfarray+010h
|
||||
vmovsd xmm1,__Lsinfarray+010h
|
||||
vmulsd xmm3,xmm0,xmm0
|
||||
vfmadd231sd xmm1,xmm3,__Lsinfarray+018h
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray+008h
|
||||
vfmadd213sd xmm1,xmm3,__Lsinfarray
|
||||
vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3
|
||||
vfmadd231sd xmm0,xmm1,xmm3
|
||||
jmp Lsinf_fma3_exit_s_1
|
||||
|
||||
ALIGN 16
|
||||
Lsinf_fma3_cos_piby4_compute:
|
||||
vmovapd xmm2,L_one
|
||||
vmulsd xmm3,xmm0,xmm0
|
||||
vfmadd231sd xmm2,xmm3,__Lcosfarray ; xmm2 <-- 1 + c0 x^2
|
||||
; would simple Horner's be slower?
|
||||
vmovsd xmm1,__Lcosfarray+018h ; xmm1 <-- c3
|
||||
vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; xmm1 <-- c4 x^2+ c3
|
||||
vfmadd213sd xmm1,xmm3,__Lcosfarray+010h ; xmm1 <-- (c4 x^2+ c3)x^2 + c2
|
||||
vfmadd213sd xmm1,xmm3,__Lcosfarray+008h ; xmm1 <-- ((c4 x^2+ c3)x^2 + c2)x^2 + c1
|
||||
vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4
|
||||
vmovdqa xmm0,xmm2
|
||||
vfmadd231sd xmm0,xmm1,xmm3
|
||||
Lsinf_fma3_exit_s_1:
|
||||
; assuming FMA3 ==> AVX ==> SSE4.1
|
||||
vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two
|
||||
vpcmpeqq xmm3,xmm4,XMMWORD PTR L_int_three
|
||||
vorpd xmm3,xmm2,xmm3
|
||||
vandnpd xmm3,xmm3,L_signbit
|
||||
vxorpd xmm0,xmm0,xmm3
|
||||
|
||||
vandnpd xmm1,xmm5,L_signbit
|
||||
vxorpd xmm0,xmm1,xmm0
|
||||
Lsinf_fma3_return_sinf_s:
|
||||
vcvtsd2ss xmm0,xmm0,xmm0
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Lsinf_fma3_naninf:
|
||||
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
fname endp
|
||||
END
|
340
sdk/lib/crt/math/libm_sse2/sinh.c
Normal file
340
sdk/lib/crt/math/libm_sse2/sinh.c
Normal file
|
@ -0,0 +1,340 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_SPLITEXP
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_SCALEDOUBLE_2
|
||||
#define USE_INFINITY_WITH_FLAGS
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_SPLITEXP
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_SCALEDOUBLE_2
|
||||
#undef USE_INFINITY_WITH_FLAGS
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
|
||||
#pragma function(sinh)
|
||||
double sinh(double x)
|
||||
{
|
||||
/*
|
||||
After dealing with special cases the computation is split into
|
||||
regions as follows:
|
||||
|
||||
abs(x) >= max_sinh_arg:
|
||||
sinh(x) = sign(x)*Inf
|
||||
|
||||
abs(x) >= small_threshold:
|
||||
sinh(x) = sign(x)*exp(abs(x))/2 computed using the
|
||||
splitexp and scaleDouble functions as for exp_amd().
|
||||
|
||||
abs(x) < small_threshold:
|
||||
compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
|
||||
sinh(x) is then sign(x)*z. */
|
||||
|
||||
static const double
|
||||
max_sinh_arg = 7.10475860073943977113e+02, /* 0x408633ce8fb9f87e */
|
||||
thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
|
||||
log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
|
||||
log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
|
||||
small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
|
||||
/* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */
|
||||
|
||||
/* Lead and tail tabulated values of sinh(i) and cosh(i)
|
||||
for i = 0,...,36. The lead part has 26 leading bits. */
|
||||
|
||||
static const double sinh_lead[37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
1.17520117759704589844e+00, /* 0x3ff2cd9fc0000000 */
|
||||
3.62686038017272949219e+00, /* 0x400d03cf60000000 */
|
||||
1.00178747177124023438e+01, /* 0x40240926e0000000 */
|
||||
2.72899169921875000000e+01, /* 0x403b4a3800000000 */
|
||||
7.42032089233398437500e+01, /* 0x40528d0160000000 */
|
||||
2.01713153839111328125e+02, /* 0x406936d228000000 */
|
||||
5.48316116333007812500e+02, /* 0x4081228768000000 */
|
||||
1.49047882080078125000e+03, /* 0x409749ea50000000 */
|
||||
4.05154187011718750000e+03, /* 0x40afa71570000000 */
|
||||
1.10132326660156250000e+04, /* 0x40c5829dc8000000 */
|
||||
2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
|
||||
8.13773945312500000000e+04, /* 0x40f3de1650000000 */
|
||||
2.21206695312500000000e+05, /* 0x410b00b590000000 */
|
||||
6.01302140625000000000e+05, /* 0x412259ac48000000 */
|
||||
1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
|
||||
4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
|
||||
1.20774762500000000000e+07, /* 0x4167093488000000 */
|
||||
3.28299845000000000000e+07, /* 0x417f4f2208000000 */
|
||||
8.92411500000000000000e+07, /* 0x419546d8f8000000 */
|
||||
2.42582596000000000000e+08, /* 0x41aceb0888000000 */
|
||||
6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
|
||||
1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
|
||||
4.87240166400000000000e+09, /* 0x41f226af30000000 */
|
||||
1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
|
||||
3.60024494080000000000e+10, /* 0x4220c3d390000000 */
|
||||
9.78648043520000000000e+10, /* 0x4236c93268000000 */
|
||||
2.66024116224000000000e+11, /* 0x424ef822f0000000 */
|
||||
7.23128516608000000000e+11, /* 0x42650bba30000000 */
|
||||
1.96566712320000000000e+12, /* 0x427c9aae40000000 */
|
||||
5.34323724288000000000e+12, /* 0x4293704708000000 */
|
||||
1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
|
||||
3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
|
||||
1.07321789251584000000e+14, /* 0x42d866f348000000 */
|
||||
2.91730863685632000000e+14, /* 0x42f0953e28000000 */
|
||||
7.93006722514944000000e+14, /* 0x430689e220000000 */
|
||||
2.15561576592179200000e+15}; /* 0x431ea215a0000000 */
|
||||
|
||||
static const double sinh_tail[37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
1.60467555584448807892e-08, /* 0x3e513ae6096a0092 */
|
||||
2.76742892754807136947e-08, /* 0x3e5db70cfb79a640 */
|
||||
2.09697499555224576530e-07, /* 0x3e8c2526b66dc067 */
|
||||
2.04940252448908240062e-07, /* 0x3e8b81b18647f380 */
|
||||
1.65444891522700935932e-06, /* 0x3ebbc1cdd1e1eb08 */
|
||||
3.53116789999998198721e-06, /* 0x3ecd9f201534fb09 */
|
||||
6.94023870987375490695e-06, /* 0x3edd1c064a4e9954 */
|
||||
4.98876893611587449271e-06, /* 0x3ed4eca65d06ea74 */
|
||||
3.19656024605152215752e-05, /* 0x3f00c259bcc0ecc5 */
|
||||
2.08687768377236501204e-04, /* 0x3f2b5a6647cf9016 */
|
||||
4.84668088325403796299e-05, /* 0x3f09691adefb0870 */
|
||||
1.17517985422733832468e-03, /* 0x3f53410fc29cde38 */
|
||||
6.90830086959560562415e-04, /* 0x3f46a31a50b6fb3c */
|
||||
1.45697262451506548420e-03, /* 0x3f57defc71805c40 */
|
||||
2.99859023684906737806e-02, /* 0x3f9eb49fd80e0bab */
|
||||
1.02538800507941396667e-02, /* 0x3f84fffc7bcd5920 */
|
||||
1.26787628407699110022e-01, /* 0x3fc03a93b6c63435 */
|
||||
6.86652479544033744752e-02, /* 0x3fb1940bb255fd1c */
|
||||
4.81593627621056619148e-01, /* 0x3fded26e14260b50 */
|
||||
1.70489513795397629181e+00, /* 0x3ffb47401fc9f2a2 */
|
||||
1.12416073482258713767e+01, /* 0x40267bb3f55634f1 */
|
||||
7.06579578070110514432e+00, /* 0x401c435ff8194ddc */
|
||||
5.91244512999659974639e+01, /* 0x404d8fee052ba63a */
|
||||
1.68921736147050694399e+02, /* 0x40651d7edccde3f6 */
|
||||
2.60692936262073658327e+02, /* 0x40704b1644557d1a */
|
||||
3.62419382134885609048e+02, /* 0x4076a6b5ca0a9dc4 */
|
||||
4.07689930834187271103e+03, /* 0x40afd9cc72249aba */
|
||||
1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
|
||||
2.53720210371943067003e+04, /* 0x40d8c70158ac6363 */
|
||||
4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
|
||||
1.81871712615542812273e+05, /* 0x4106337db36fc718 */
|
||||
5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
|
||||
6.41374032312148716301e+05, /* 0x412392bc108b37cc */
|
||||
7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
|
||||
3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
|
||||
7.63580561355670914054e+06}; /* 0x415d20d76744835c */
|
||||
|
||||
static const double cosh_lead[37] = {
|
||||
1.00000000000000000000e+00, /* 0x3ff0000000000000 */
|
||||
1.54308062791824340820e+00, /* 0x3ff8b07550000000 */
|
||||
3.76219564676284790039e+00, /* 0x400e18fa08000000 */
|
||||
1.00676617622375488281e+01, /* 0x402422a490000000 */
|
||||
2.73082327842712402344e+01, /* 0x403b4ee858000000 */
|
||||
7.42099475860595703125e+01, /* 0x40528d6fc8000000 */
|
||||
2.01715633392333984375e+02, /* 0x406936e678000000 */
|
||||
5.48317031860351562500e+02, /* 0x4081228948000000 */
|
||||
1.49047915649414062500e+03, /* 0x409749eaa8000000 */
|
||||
4.05154199218750000000e+03, /* 0x40afa71580000000 */
|
||||
1.10132329101562500000e+04, /* 0x40c5829dd0000000 */
|
||||
2.99370708007812500000e+04, /* 0x40dd3c4488000000 */
|
||||
8.13773945312500000000e+04, /* 0x40f3de1650000000 */
|
||||
2.21206695312500000000e+05, /* 0x410b00b590000000 */
|
||||
6.01302140625000000000e+05, /* 0x412259ac48000000 */
|
||||
1.63450865625000000000e+06, /* 0x4138f0cca8000000 */
|
||||
4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */
|
||||
1.20774762500000000000e+07, /* 0x4167093488000000 */
|
||||
3.28299845000000000000e+07, /* 0x417f4f2208000000 */
|
||||
8.92411500000000000000e+07, /* 0x419546d8f8000000 */
|
||||
2.42582596000000000000e+08, /* 0x41aceb0888000000 */
|
||||
6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */
|
||||
1.79245641600000000000e+09, /* 0x41dab5adb8000000 */
|
||||
4.87240166400000000000e+09, /* 0x41f226af30000000 */
|
||||
1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */
|
||||
3.60024494080000000000e+10, /* 0x4220c3d390000000 */
|
||||
9.78648043520000000000e+10, /* 0x4236c93268000000 */
|
||||
2.66024116224000000000e+11, /* 0x424ef822f0000000 */
|
||||
7.23128516608000000000e+11, /* 0x42650bba30000000 */
|
||||
1.96566712320000000000e+12, /* 0x427c9aae40000000 */
|
||||
5.34323724288000000000e+12, /* 0x4293704708000000 */
|
||||
1.45244246507520000000e+13, /* 0x42aa6b7658000000 */
|
||||
3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */
|
||||
1.07321789251584000000e+14, /* 0x42d866f348000000 */
|
||||
2.91730863685632000000e+14, /* 0x42f0953e28000000 */
|
||||
7.93006722514944000000e+14, /* 0x430689e220000000 */
|
||||
2.15561576592179200000e+15}; /* 0x431ea215a0000000 */
|
||||
|
||||
static const double cosh_tail[37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
6.89700037027478056904e-09, /* 0x3e3d9f5504c2bd28 */
|
||||
4.43207835591715833630e-08, /* 0x3e67cb66f0a4c9fd */
|
||||
2.33540217013828929694e-07, /* 0x3e8f58617928e588 */
|
||||
5.17452463948269748331e-08, /* 0x3e6bc7d000c38d48 */
|
||||
9.38728274131605919153e-07, /* 0x3eaf7f9d4e329998 */
|
||||
2.73012191010840495544e-06, /* 0x3ec6e6e464885269 */
|
||||
3.29486051438996307950e-06, /* 0x3ecba3a8b946c154 */
|
||||
4.75803746362771416375e-06, /* 0x3ed3f4e76110d5a4 */
|
||||
3.33050940471947692369e-05, /* 0x3f017622515a3e2b */
|
||||
9.94707313972136215365e-06, /* 0x3ee4dc4b528af3d0 */
|
||||
6.51685096227860253398e-05, /* 0x3f11156278615e10 */
|
||||
1.18132406658066663359e-03, /* 0x3f535ad50ed821f5 */
|
||||
6.93090416366541877541e-04, /* 0x3f46b61055f2935c */
|
||||
1.45780415323416845386e-03, /* 0x3f57e2794a601240 */
|
||||
2.99862082708111758744e-02, /* 0x3f9eb4b45f6aadd3 */
|
||||
1.02539925859688602072e-02, /* 0x3f85000b967b3698 */
|
||||
1.26787669807076286421e-01, /* 0x3fc03a940fadc092 */
|
||||
6.86652631843830962843e-02, /* 0x3fb1940bf3bf874c */
|
||||
4.81593633223853068159e-01, /* 0x3fded26e1a2a2110 */
|
||||
1.70489514001513020602e+00, /* 0x3ffb4740205796d6 */
|
||||
1.12416073489841270572e+01, /* 0x40267bb3f55cb85d */
|
||||
7.06579578098005001152e+00, /* 0x401c435ff81e18ac */
|
||||
5.91244513000686140458e+01, /* 0x404d8fee052bdea4 */
|
||||
1.68921736147088438429e+02, /* 0x40651d7edccde926 */
|
||||
2.60692936262087528121e+02, /* 0x40704b1644557e0e */
|
||||
3.62419382134890611269e+02, /* 0x4076a6b5ca0a9e1c */
|
||||
4.07689930834187453002e+03, /* 0x40afd9cc72249abe */
|
||||
1.55377375868385224749e+04, /* 0x40ce58de693edab5 */
|
||||
2.53720210371943103382e+04, /* 0x40d8c70158ac6364 */
|
||||
4.78822310734952334315e+04, /* 0x40e7614764f43e20 */
|
||||
1.81871712615542812273e+05, /* 0x4106337db36fc718 */
|
||||
5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */
|
||||
6.41374032312148716301e+05, /* 0x412392bc108b37cc */
|
||||
7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */
|
||||
3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */
|
||||
7.63580561355670914054e+06}; /* 0x415d20d76744835c */
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
double y, z, z1, z2;
|
||||
int m;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
if (aux < 0x3e30000000000000) /* |x| small enough that sinh(x) = x */
|
||||
{
|
||||
if (aux == 0)
|
||||
/* with no inexact */
|
||||
return x;
|
||||
else
|
||||
return val_with_flags(x, AMD_F_INEXACT);
|
||||
}
|
||||
else if (aux >= 0x7ff0000000000000) /* |x| is NaN or Inf */
|
||||
{
|
||||
if (aux > 0x7ff0000000000000)
|
||||
/* x is NaN */
|
||||
return _handle_error("sinh", OP_SINH, ux|0x0008000000000000, _DOMAIN,
|
||||
0, EDOM, x, 0.0, 1);
|
||||
else
|
||||
return x + x;
|
||||
}
|
||||
|
||||
|
||||
xneg = (aux != ux);
|
||||
|
||||
y = x;
|
||||
if (xneg) y = -x;
|
||||
|
||||
if (y >= max_sinh_arg)
|
||||
{
|
||||
if (xneg)
|
||||
return _handle_error("sinh", OP_SINH, NINFBITPATT_DP64, _OVERFLOW,
|
||||
AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
|
||||
else
|
||||
return _handle_error("sinh", OP_SINH, PINFBITPATT_DP64, _OVERFLOW,
|
||||
AMD_F_OVERFLOW, ERANGE, x, 0.0, 1);
|
||||
}
|
||||
else if (y >= small_threshold)
|
||||
{
|
||||
/* In this range y is large enough so that
|
||||
the negative exponential is negligible,
|
||||
so sinh(y) is approximated by sign(x)*exp(y)/2. The
|
||||
code below is an inlined version of that from
|
||||
exp() with two changes (it operates on
|
||||
y instead of x, and the division by 2 is
|
||||
done by reducing m by 1). */
|
||||
|
||||
splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
|
||||
log2_by_32_tail, &m, &z1, &z2);
|
||||
m -= 1;
|
||||
|
||||
if (m >= EMIN_DP64 && m <= EMAX_DP64)
|
||||
z = scaleDouble_1((z1+z2),m);
|
||||
else
|
||||
z = scaleDouble_2((z1+z2),m);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* In this range we find the integer part y0 of y
|
||||
and the increment dy = y - y0. We then compute
|
||||
|
||||
z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
|
||||
|
||||
where sinh(y0) and cosh(y0) are tabulated above. */
|
||||
|
||||
int ind;
|
||||
double dy, dy2, sdy, cdy, sdy1, sdy2;
|
||||
|
||||
ind = (int)y;
|
||||
dy = y - ind;
|
||||
|
||||
dy2 = dy*dy;
|
||||
sdy = dy*dy2*(0.166666666666666667013899e0 +
|
||||
(0.833333333333329931873097e-2 +
|
||||
(0.198412698413242405162014e-3 +
|
||||
(0.275573191913636406057211e-5 +
|
||||
(0.250521176994133472333666e-7 +
|
||||
(0.160576793121939886190847e-9 +
|
||||
0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
cdy = dy2*(0.500000000000000005911074e0 +
|
||||
(0.416666666666660876512776e-1 +
|
||||
(0.138888888889814854814536e-2 +
|
||||
(0.248015872460622433115785e-4 +
|
||||
(0.275573350756016588011357e-6 +
|
||||
(0.208744349831471353536305e-8 +
|
||||
0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
/* At this point sinh(dy) is approximated by dy + sdy.
|
||||
Shift some significant bits from dy to sdy. */
|
||||
|
||||
GET_BITS_DP64(dy, ux);
|
||||
ux &= 0xfffffffff8000000;
|
||||
PUT_BITS_DP64(ux, sdy1);
|
||||
sdy2 = sdy + (dy - sdy1);
|
||||
|
||||
z = ((((((cosh_tail[ind]*sdy2 + sinh_tail[ind]*cdy)
|
||||
+ cosh_tail[ind]*sdy1) + sinh_tail[ind])
|
||||
+ cosh_lead[ind]*sdy2) + sinh_lead[ind]*cdy)
|
||||
+ cosh_lead[ind]*sdy1) + sinh_lead[ind];
|
||||
}
|
||||
|
||||
if (xneg) z = - z;
|
||||
return z;
|
||||
}
|
256
sdk/lib/crt/math/libm_sse2/sinhf.c
Normal file
256
sdk/lib/crt/math/libm_sse2/sinhf.c
Normal file
|
@ -0,0 +1,256 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_SPLITEXP
|
||||
#define USE_SCALEDOUBLE_1
|
||||
#define USE_INFINITY_WITH_FLAGS
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_SPLITEXP
|
||||
#undef USE_SCALEDOUBLE_1
|
||||
#undef USE_INFINITY_WITH_FLAGS
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(sinhf)
|
||||
|
||||
|
||||
float sinhf(float fx)
|
||||
{
|
||||
/*
|
||||
After dealing with special cases the computation is split into
|
||||
regions as follows:
|
||||
|
||||
abs(x) >= max_sinh_arg:
|
||||
sinh(x) = sign(x)*Inf
|
||||
|
||||
abs(x) >= small_threshold:
|
||||
sinh(x) = sign(x)*exp(abs(x))/2 computed using the
|
||||
splitexp and scaleDouble functions as for exp_amd().
|
||||
|
||||
abs(x) < small_threshold:
|
||||
compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
|
||||
sinh(x) is then sign(x)*z. */
|
||||
|
||||
static const double
|
||||
/* The max argument of sinhf, but stored as a double */
|
||||
max_sinh_arg = 8.94159862922329438106e+01, /* 0x40565a9f84f82e63 */
|
||||
thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
|
||||
log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
|
||||
log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
|
||||
small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
|
||||
/* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */
|
||||
|
||||
/* Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. */
|
||||
|
||||
static const double sinh_lead[37] = {
|
||||
0.00000000000000000000e+00, /* 0x0000000000000000 */
|
||||
1.17520119364380137839e+00, /* 0x3ff2cd9fc44eb982 */
|
||||
3.62686040784701857476e+00, /* 0x400d03cf63b6e19f */
|
||||
1.00178749274099008204e+01, /* 0x40240926e70949ad */
|
||||
2.72899171971277496596e+01, /* 0x403b4a3803703630 */
|
||||
7.42032105777887522891e+01, /* 0x40528d0166f07374 */
|
||||
2.01713157370279219549e+02, /* 0x406936d22f67c805 */
|
||||
5.48316123273246489589e+02, /* 0x408122876ba380c9 */
|
||||
1.49047882578955000099e+03, /* 0x409749ea514eca65 */
|
||||
4.05154190208278987484e+03, /* 0x40afa7157430966f */
|
||||
1.10132328747033916443e+04, /* 0x40c5829dced69991 */
|
||||
2.99370708492480553105e+04, /* 0x40dd3c4488cb48d6 */
|
||||
8.13773957064298447222e+04, /* 0x40f3de1654d043f0 */
|
||||
2.21206696003330085659e+05, /* 0x410b00b5916a31a5 */
|
||||
6.01302142081972560845e+05, /* 0x412259ac48bef7e3 */
|
||||
1.63450868623590236530e+06, /* 0x4138f0ccafad27f6 */
|
||||
4.44305526025387924165e+06, /* 0x4150f2ebd0a7ffe3 */
|
||||
1.20774763767876271158e+07, /* 0x416709348c0ea4ed */
|
||||
3.28299845686652474105e+07, /* 0x417f4f22091940bb */
|
||||
8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
|
||||
2.42582597704895108938e+08, /* 0x41aceb088b68e803 */
|
||||
6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
|
||||
1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
|
||||
4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
|
||||
1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
|
||||
3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
|
||||
9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
|
||||
2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
|
||||
7.23128532145737548828e+11, /* 0x42650bba3796379a */
|
||||
1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
|
||||
5.34323729076223046875e+12, /* 0x429370470aec28ec */
|
||||
1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
|
||||
3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
|
||||
1.07321789892958031250e+14, /* 0x42d866f34a725782 */
|
||||
2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
|
||||
7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
|
||||
2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */
|
||||
|
||||
static const double cosh_lead[37] = {
|
||||
1.00000000000000000000e+00, /* 0x3ff0000000000000 */
|
||||
1.54308063481524371241e+00, /* 0x3ff8b07551d9f550 */
|
||||
3.76219569108363138810e+00, /* 0x400e18fa0df2d9bc */
|
||||
1.00676619957777653269e+01, /* 0x402422a497d6185e */
|
||||
2.73082328360164865444e+01, /* 0x403b4ee858de3e80 */
|
||||
7.42099485247878334349e+01, /* 0x40528d6fcbeff3a9 */
|
||||
2.01715636122455890700e+02, /* 0x406936e67db9b919 */
|
||||
5.48317035155212010977e+02, /* 0x4081228949ba3a8b */
|
||||
1.49047916125217807348e+03, /* 0x409749eaa93f4e76 */
|
||||
4.05154202549259389343e+03, /* 0x40afa715845d8894 */
|
||||
1.10132329201033226127e+04, /* 0x40c5829dd053712d */
|
||||
2.99370708659497577173e+04, /* 0x40dd3c4489115627 */
|
||||
8.13773957125740562333e+04, /* 0x40f3de1654d6b543 */
|
||||
2.21206696005590405548e+05, /* 0x410b00b5916b6105 */
|
||||
6.01302142082804115489e+05, /* 0x412259ac48bf13ca */
|
||||
1.63450868623620807193e+06, /* 0x4138f0ccafad2d17 */
|
||||
4.44305526025399193168e+06, /* 0x4150f2ebd0a8005c */
|
||||
1.20774763767876680940e+07, /* 0x416709348c0ea503 */
|
||||
3.28299845686652623117e+07, /* 0x417f4f22091940bf */
|
||||
8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */
|
||||
2.42582597704895138741e+08, /* 0x41aceb088b68e804 */
|
||||
6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */
|
||||
1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */
|
||||
4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */
|
||||
1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */
|
||||
3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */
|
||||
9.78648047144193725586e+10, /* 0x4236c932696a6b5c */
|
||||
2.66024120300899291992e+11, /* 0x424ef822f7f6731c */
|
||||
7.23128532145737548828e+11, /* 0x42650bba3796379a */
|
||||
1.96566714857202099609e+12, /* 0x427c9aae4631c056 */
|
||||
5.34323729076223046875e+12, /* 0x429370470aec28ec */
|
||||
1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */
|
||||
3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */
|
||||
1.07321789892958031250e+14, /* 0x42d866f34a725782 */
|
||||
2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */
|
||||
7.93006726156715250000e+14, /* 0x430689e221bc8d5a */
|
||||
2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
double x = fx, y, z, z1, z2;
|
||||
int m;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
if (aux < 0x3f10000000000000) /* |x| small enough that sinh(x) = x */
|
||||
{
|
||||
if (aux == 0)
|
||||
/* with no inexact */
|
||||
return fx;
|
||||
else
|
||||
return valf_with_flags(fx, AMD_F_INEXACT);
|
||||
}
|
||||
else if (aux >= 0x7ff0000000000000) /* |x| is NaN or Inf */
|
||||
{
|
||||
if (aux > 0x7ff0000000000000)
|
||||
{
|
||||
/* x is NaN */
|
||||
unsigned int uhx;
|
||||
GET_BITS_SP32(fx, uhx);
|
||||
return _handle_errorf("sinhf", OP_SINH, uhx|0x00400000, _DOMAIN,
|
||||
0, EDOM, fx, 0.0F, 1);
|
||||
}
|
||||
else
|
||||
return fx + fx;
|
||||
}
|
||||
|
||||
xneg = (aux != ux);
|
||||
|
||||
y = x;
|
||||
if (xneg) y = -x;
|
||||
|
||||
if (y >= max_sinh_arg)
|
||||
{
|
||||
/* Return infinity with overflow flag. */
|
||||
if (xneg)
|
||||
return _handle_errorf("sinhf", OP_SINH, NINFBITPATT_SP32, _OVERFLOW,
|
||||
AMD_F_OVERFLOW, ERANGE, fx, 0.0F, 1);
|
||||
else
|
||||
return _handle_errorf("sinhf", OP_SINH, PINFBITPATT_SP32, _OVERFLOW,
|
||||
AMD_F_OVERFLOW, ERANGE, fx, 0.0F, 1);
|
||||
}
|
||||
else if (y >= small_threshold)
|
||||
{
|
||||
/* In this range y is large enough so that
|
||||
the negative exponential is negligible,
|
||||
so sinh(y) is approximated by sign(x)*exp(y)/2. The
|
||||
code below is an inlined version of that from
|
||||
exp() with two changes (it operates on
|
||||
y instead of x, and the division by 2 is
|
||||
done by reducing m by 1). */
|
||||
|
||||
splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
|
||||
log2_by_32_tail, &m, &z1, &z2);
|
||||
m -= 1;
|
||||
/* scaleDouble_1 is always safe because the argument x was
|
||||
float, rather than double */
|
||||
z = scaleDouble_1((z1+z2),m);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* In this range we find the integer part y0 of y
|
||||
and the increment dy = y - y0. We then compute
|
||||
|
||||
z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
|
||||
|
||||
where sinh(y0) and cosh(y0) are tabulated above. */
|
||||
|
||||
int ind;
|
||||
double dy, dy2, sdy, cdy;
|
||||
|
||||
ind = (int)y;
|
||||
dy = y - ind;
|
||||
|
||||
dy2 = dy*dy;
|
||||
|
||||
sdy = dy + dy*dy2*(0.166666666666666667013899e0 +
|
||||
(0.833333333333329931873097e-2 +
|
||||
(0.198412698413242405162014e-3 +
|
||||
(0.275573191913636406057211e-5 +
|
||||
(0.250521176994133472333666e-7 +
|
||||
(0.160576793121939886190847e-9 +
|
||||
0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
cdy = 1 + dy2*(0.500000000000000005911074e0 +
|
||||
(0.416666666666660876512776e-1 +
|
||||
(0.138888888889814854814536e-2 +
|
||||
(0.248015872460622433115785e-4 +
|
||||
(0.275573350756016588011357e-6 +
|
||||
(0.208744349831471353536305e-8 +
|
||||
0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
|
||||
|
||||
z = sinh_lead[ind]*cdy + cosh_lead[ind]*sdy;
|
||||
}
|
||||
|
||||
if (xneg) z = - z;
|
||||
return (float)z;
|
||||
}
|
88
sdk/lib/crt/math/libm_sse2/sqrt.c
Normal file
88
sdk/lib/crt/math/libm_sse2/sqrt.c
Normal file
|
@ -0,0 +1,88 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#if USE_SOFTWARE_SQRT
|
||||
#define USE_SQRT_AMD_INLINE
|
||||
#endif
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#if USE_SOFTWARE_SQRT
|
||||
#undef USE_SQRT_AMD_INLINE
|
||||
#endif
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
#pragma function(sqrt)
|
||||
|
||||
double sqrt(double x)
|
||||
{
|
||||
#if USE_SOFTWARE_SQRT
|
||||
return sqrt_amd_inline(x);
|
||||
#else
|
||||
double r;
|
||||
unsigned long ux;
|
||||
GET_BITS_DP64(x, ux);
|
||||
|
||||
/* Check for special cases for Microsoft error handling */
|
||||
if ((ux & PINFBITPATT_DP64) == PINFBITPATT_DP64)
|
||||
{
|
||||
/* x is infinity, or NaN */
|
||||
if (ux & MANTBITS_DP64)
|
||||
{
|
||||
/* NaN of some sort */
|
||||
/* If it's a signaling NaN, convert to QNaN */
|
||||
return _handle_error("sqrt", OP_SQRT, ux|0x0008000000000000,
|
||||
_DOMAIN, 0,EDOM, x, 0.0, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* +/-infinity */
|
||||
if (ux & SIGNBIT_DP64)
|
||||
{
|
||||
/* - infinity */
|
||||
return _handle_error("sqrt", OP_SQRT, INDEFBITPATT_DP64,
|
||||
_DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
|
||||
}
|
||||
/* positive infinite is not a problem */
|
||||
}
|
||||
}
|
||||
if ((ux & SIGNBIT_DP64)&&(ux & ~SIGNBIT_DP64)) /* if x < zero */
|
||||
{
|
||||
return _handle_error("sqrt", OP_SQRT, INDEFBITPATT_DP64,
|
||||
_DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1);
|
||||
}
|
||||
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_sd(&r, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&x)));
|
||||
return r;
|
||||
#endif
|
||||
}
|
91
sdk/lib/crt/math/libm_sse2/sqrtf.c
Normal file
91
sdk/lib/crt/math/libm_sse2/sqrtf.c
Normal file
|
@ -0,0 +1,91 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#if USE_SOFTWARE_SQRT
|
||||
#define USE_SQRTF_AMD_INLINE
|
||||
#endif
|
||||
#define USE_NANF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#if USE_SOFTWARE_SQRT
|
||||
#undef USE_SQRTF_AMD_INLINE
|
||||
#endif
|
||||
#undef USE_NANF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(sqrtf)
|
||||
|
||||
|
||||
float sqrtf(float x)
|
||||
{
|
||||
#if USE_SOFTWARE_SQRT
|
||||
return sqrtf_amd_inline(x);
|
||||
#else
|
||||
float r;
|
||||
unsigned int ux;
|
||||
GET_BITS_SP32(x, ux);
|
||||
/* Check for special cases for Microsoft error handling */
|
||||
if ((ux & PINFBITPATT_SP32) == PINFBITPATT_SP32)
|
||||
{
|
||||
/* x is infinity, or NaN */
|
||||
if (ux & MANTBITS_SP32)
|
||||
{
|
||||
/* NaN of some sort */
|
||||
/* If it's a signaling NaN, convert to QNaN */
|
||||
return _handle_errorf("sqrtf", OP_SQRT, ux|0x00400000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0F, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* +/-infinity */
|
||||
if (ux & SIGNBIT_SP32)
|
||||
{
|
||||
/* - infinity */
|
||||
return _handle_errorf("sqrtf", OP_SQRT, INDEFBITPATT_SP32,
|
||||
_DOMAIN, AMD_F_INVALID, EDOM, x, 0.0F, 1);
|
||||
}
|
||||
/* positive infinite is not a problem */
|
||||
}
|
||||
}
|
||||
if ((ux & SIGNBIT_SP32)&&(ux & ~SIGNBIT_SP32)) /* if x < zero */
|
||||
{
|
||||
return _handle_errorf("sqrtf", OP_SQRT, INDEFBITPATT_SP32,
|
||||
_DOMAIN, AMD_F_INVALID, EDOM, x, 0.0F, 1);
|
||||
}
|
||||
|
||||
/* VC++ intrinsic call */
|
||||
_mm_store_ss(&r, _mm_sqrt_ss(_mm_load_ss(&x)));
|
||||
return r;
|
||||
#endif
|
||||
}
|
762
sdk/lib/crt/math/libm_sse2/tan.asm
Normal file
762
sdk/lib/crt/math/libm_sse2/tan.asm
Normal file
|
@ -0,0 +1,762 @@
|
|||
;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the tan function.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; double tan(double x);
|
||||
;
|
||||
; Computes tan(x).
|
||||
; It will provide proper C99 return values,
|
||||
; but may not raise floating point status bits properly.
|
||||
; Based on the NAG C implementation.
|
||||
;
|
||||
; If FMA3 hardware is present, it will be used for the calculation.
|
||||
;
|
||||
|
||||
.const
|
||||
ALIGN 16
|
||||
L_signbit DQ 08000000000000000h
|
||||
DQ 08000000000000000h ; duplicate for pd
|
||||
|
||||
L_sign_mask DQ 07FFFFFFFFFFFFFFFh
|
||||
DQ 07FFFFFFFFFFFFFFFh ; duplicate for pd
|
||||
|
||||
L_int_one DQ 00000000000000001h
|
||||
DQ 00000000000000001h ; duplicate for pd
|
||||
|
||||
L_twobypi DQ 03FE45F306DC9C883h
|
||||
DQ 03FE45F306DC9C883h ; duplicate for pd
|
||||
|
||||
L_point_333 DQ 03FD5555555555555h; 1/3
|
||||
DQ 03FD5555555555555h ; duplicate for pd
|
||||
|
||||
L_tan_p0 DQ 03FD7D50F6638564Ah ; 0.372379159759792203640806338901e0
|
||||
DQ 03FD7D50F6638564Ah ; duplicate for pd
|
||||
|
||||
L_tan_p2 DQ 0BF977C24C7569ABBh ; -0.229345080057565662883358588111e-1
|
||||
DQ 0BF977C24C7569ABBh ; duplicate for pd
|
||||
|
||||
L_tan_p4 DQ 03F2D5DAF289C385Ah ; 0.224044448537022097264602535574e-3
|
||||
DQ 03F2D5DAF289C385Ah ; duplicate for pd
|
||||
|
||||
L_tan_q0 DQ 03FF1DFCB8CAA40B8h ; 0.111713747927937668539901657944e1
|
||||
DQ 03FF1DFCB8CAA40B8h ; duplicate for pd
|
||||
|
||||
L_tan_q2 DQ 0BFE08046499EB90Fh ; -0.515658515729031149329237816945e0
|
||||
DQ 0BFE08046499EB90Fh ; duplicate for pd
|
||||
|
||||
L_tan_q4 DQ 03F9AB0F4F80A0ACFh ; 0.260656620398645407524064091208e-1
|
||||
DQ 03F9AB0F4F80A0ACFh ; duplicate for pd
|
||||
|
||||
L_tan_q6 DQ 0BF2E7517EF6D98F8h ; -0.232371494088563558304549252913e-3
|
||||
DQ 0BF2E7517EF6D98F8h ; duplicate for pd
|
||||
|
||||
L_half_mask DQ 0ffffffff00000000h
|
||||
DQ 0ffffffff00000000h ; duplicate for pd
|
||||
|
||||
L_piby4_lead DQ 03FE921FB54442D18h ; pi/4, high part
|
||||
DQ 03FE921FB54442D18h ; duplicate for pd
|
||||
|
||||
L_piby4_tail DQ 03C81A62633145C06h ; pi/4, low part
|
||||
DQ 03C81A62633145C06h ; duplicate for pd
|
||||
|
||||
; Different parts of argument reduction need different versions of pi/2
|
||||
|
||||
L_piby2_1 DQ 03FF921FB54400000h ; pi/2, high 33 bits
|
||||
L_piby2_1tail DQ 03DD0B4611A626331h ; pi/2, second 53 bits, overlaps...
|
||||
L_piby2_2 DQ 03DD0B4611A600000h ; pi/2, second 33 bits
|
||||
L_piby2_2tail DQ 03BA3198A2E037073h ; pi/2, third 53 bits, overlaps...
|
||||
L_piby2_3 DQ 03BA3198A2E000000h ; pi/2, third 33 bits
|
||||
L_piby2_3tail DQ 0397B839A252049C1h ; pi/2, fourth 53 bits
|
||||
|
||||
; end of pi/2 versions
|
||||
|
||||
L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27
|
||||
L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13
|
||||
|
||||
L_inf_mask_64 DQ 07FF0000000000000h
|
||||
L_point_five DQ 03FE0000000000000h
|
||||
L_point_68 DQ 03FE5C28F5C28F5C3h ; .68
|
||||
L_n_point_68 DQ 0BFE5C28F5C28F5C3h ; -.68
|
||||
|
||||
L_zero DQ -0000000000000000h ; 0.0
|
||||
L_one DQ 03FF0000000000000h ; 1.0
|
||||
L_n_one DQ 0BFF0000000000000h ; -1.0
|
||||
L_two DQ 04000000000000000h ; 2.0
|
||||
|
||||
L_moderate_arg_cw DQ 0411E848000000000h ; 5.e5
|
||||
L_moderate_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL
|
||||
|
||||
fname TEXTEQU <tan>
|
||||
fname_special TEXTEQU <_tan_special>
|
||||
|
||||
; local storage offsets
|
||||
save_xmm6 EQU 020h
|
||||
save_xmm7 EQU 030h
|
||||
store_input EQU 040h
|
||||
save_r10 EQU 050h
|
||||
dummy_space EQU 060h
|
||||
stack_size EQU 088h
|
||||
|
||||
include fm.inc
|
||||
|
||||
EXTERN __use_fma3_lib:DWORD
|
||||
EXTERN fname_special : PROC
|
||||
EXTERN __remainder_piby2_fma3 : PROC
|
||||
EXTERN __remainder_piby2_fma3_bdl : PROC
|
||||
EXTERN __remainder_piby2_forAsm : PROC
|
||||
EXTERN _set_statfp : PROC
|
||||
|
||||
.code
|
||||
ALIGN 16
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
SaveXmm xmm6, save_xmm6
|
||||
SaveXmm xmm7, save_xmm7
|
||||
.ENDPROLOG
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Ltan_fma3
|
||||
|
||||
Ltan_sse2:
|
||||
movd rdx, xmm0 ; really movq
|
||||
movaps xmm6, xmm0
|
||||
mov rcx, rdx
|
||||
btr rcx, 63 ; rcx <-- |x|
|
||||
|
||||
cmp rcx, L_piby4_lead
|
||||
ja Ltan_abs_x_nle_pio4 ; branch if > pi/4 or NaN
|
||||
|
||||
|
||||
cmp rcx, L_two_to_neg_13
|
||||
jae Ltan_abs_x_ge_two_to_neg_13
|
||||
|
||||
cmp rcx, L_two_to_neg_27
|
||||
jae Labs_x_ge_two_to_neg_27
|
||||
|
||||
; At this point tan(x) ~= x; if it's not exact, set the inexact flag
|
||||
|
||||
test rcx, rcx
|
||||
je Ltan_return
|
||||
|
||||
mov ecx, 20h ; ecx <-- AMD_F_INEXACT
|
||||
call _set_statfp
|
||||
movaps xmm0, xmm6 ; may be redundant, but xmm0 <-- x
|
||||
|
||||
RestoreXmm xmm7, save_xmm7
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret 0
|
||||
|
||||
Labs_x_ge_two_to_neg_27:
|
||||
|
||||
mulsd xmm0, xmm0
|
||||
mulsd xmm0, xmm6
|
||||
mulsd xmm0, QWORD PTR L_point_333
|
||||
|
||||
addsd xmm0, xmm6
|
||||
|
||||
RestoreXmm xmm7, save_xmm7
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret 0
|
||||
|
||||
Ltan_abs_x_ge_two_to_neg_13:
|
||||
xorps xmm1, xmm1 ; xmm1 <-- xx = 0
|
||||
xor r8d, r8d ; r8 <-- recip flag = 0
|
||||
call _tan_piby4
|
||||
|
||||
Ltan_return:
|
||||
RestoreXmm xmm7, save_xmm7
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret 0
|
||||
|
||||
Ltan_abs_x_nle_pio4:
|
||||
|
||||
cmp rcx, L_inf_mask_64 ; |x| uint >= +inf as uint ?
|
||||
jnae Ltan_x_is_finite
|
||||
|
||||
call fname_special
|
||||
RestoreXmm xmm7, save_xmm7
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Ltan_x_is_finite:
|
||||
xor r8d, r8d
|
||||
xor r10, r10
|
||||
cmp rcx, rdx
|
||||
setne r10b ; r10 <-- x was negative flag
|
||||
andpd xmm6, L_sign_mask
|
||||
|
||||
movsd xmm0, QWORD PTR L_moderate_arg_cw ; currently 5e5
|
||||
comisd xmm0, xmm6
|
||||
jbe Ltan_x_is_very_large
|
||||
|
||||
Ltan_x_is_moderate: ; unused label
|
||||
|
||||
; For these arguments we do a Cody-Waite reduction, subtracting the
|
||||
; appropriate multiple of pi/2, using extra precision where x is close
|
||||
; to an exact multiple of pi/2
|
||||
; We special-case region setting for |x| <= 9pi/4
|
||||
; It seems strange that this speeds things up, but it does
|
||||
|
||||
mov rdx, rcx
|
||||
|
||||
mov rax, 4616025215990052958 ; 400f6a7a2955385eH (5pi/4)
|
||||
shr rdx, 52 ; rdx <-- xexp
|
||||
cmp rcx, rax
|
||||
ja Labs_x_gt_5pio4
|
||||
|
||||
mov rax, 4612488097114038738 ; 4002d97c7f3321d2H (3pi/4)
|
||||
cmp rcx, rax
|
||||
seta r8b
|
||||
inc r8d ; r8d <-- region (1 or 2)
|
||||
jmp Lhave_region
|
||||
|
||||
Labs_x_gt_5pio4:
|
||||
mov rax, 4619644535898419899 ; 401c463abeccb2bbH (9pi/4)
|
||||
cmp rcx, rax
|
||||
ja Lneed_region_computation
|
||||
mov rax, 4617875976460412789 ; 4015fdbbe9bba775H (7pi/4)
|
||||
cmp rcx, rax
|
||||
seta r8b
|
||||
add r8d, 3 ; r8d <-- region (3 or 4)
|
||||
jmp Lhave_region
|
||||
|
||||
ALIGN 16
|
||||
Lneed_region_computation:
|
||||
movaps xmm0, xmm6
|
||||
mulsd xmm0, QWORD PTR L_twobypi
|
||||
addsd xmm0, QWORD PTR L_point_five
|
||||
cvttsd2si r8d, xmm0 ; r8d <-- region
|
||||
|
||||
Lhave_region:
|
||||
movd xmm3, r8d
|
||||
cvtdq2pd xmm3, xmm3
|
||||
|
||||
movaps xmm2, xmm3
|
||||
movaps xmm0, xmm3
|
||||
mulsd xmm0, QWORD PTR L_piby2_1
|
||||
mulsd xmm2, QWORD PTR L_piby2_1tail ; xmm2 <-- rtail = npi2 * piby2_1tail
|
||||
subsd xmm6, xmm0 ; xmm6 <-- rhead = x - npi2*piby2_1
|
||||
|
||||
; If x is not too close to multiple of pi/2,
|
||||
; we're essentially done with reduction
|
||||
; If the exponent of rhead is not close to that of x,
|
||||
; then most of x has been subtracted away in computing rhead;
|
||||
; i.e., x is close to a multiple of pi/2.
|
||||
|
||||
movd rax, xmm6
|
||||
shr rax, 52
|
||||
and eax, 2047
|
||||
sub rdx, rax ; rdx <-- exp diff of x vs rhead
|
||||
|
||||
cmp rdx, 15
|
||||
jbe Ltan_have_rhead_rtail
|
||||
|
||||
; Oops, x is almost a multiple of pi/2. Compute more bits of reduced x
|
||||
|
||||
; t = rhead;
|
||||
; rtail = npi2 * piby2_2;
|
||||
; rhead = t - rtail;
|
||||
; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
|
||||
movaps xmm1, xmm6
|
||||
movaps xmm0, xmm3
|
||||
|
||||
movaps xmm2, xmm3
|
||||
mulsd xmm0, QWORD PTR L_piby2_2
|
||||
mulsd xmm2, QWORD PTR L_piby2_2tail
|
||||
subsd xmm6, xmm0
|
||||
subsd xmm1, xmm6
|
||||
subsd xmm1, xmm0
|
||||
subsd xmm2, xmm1
|
||||
|
||||
cmp rdx, 48
|
||||
jbe Ltan_have_rhead_rtail ; We've done enough
|
||||
|
||||
; Wow, x is REALLY close to a multiple of pi/2. Compute more bits.
|
||||
|
||||
; t = rhead;
|
||||
; rtail = npi2 * piby2_3;
|
||||
; rhead = t - rtail;
|
||||
; rtail = npi2 * piby2_3tail - ((t - rhead) - rtail);
|
||||
|
||||
movaps xmm1, xmm6
|
||||
movaps xmm0, xmm3
|
||||
movaps xmm2, xmm3
|
||||
mulsd xmm0, QWORD PTR L_piby2_3
|
||||
mulsd xmm2, QWORD PTR L_piby2_3tail
|
||||
subsd xmm6, xmm0 ; xmm6 <-- rhead = t - rtail
|
||||
subsd xmm1, xmm6 ; xmm1 <-- t - rhead
|
||||
subsd xmm1, xmm0 ; xmm1 <-- ((t - rhead) - rtail)
|
||||
subsd xmm2, xmm1 ; xmm2 <-- final rtail
|
||||
|
||||
Ltan_have_rhead_rtail:
|
||||
|
||||
; At this point xmm6 has a suitable rhead, xmm2 a suitable rtail
|
||||
movaps xmm0, xmm6 ; xmm0 <-- copy of rhead
|
||||
|
||||
; r = rhead - rtail
|
||||
; rr = (rhead - r) - rtail;
|
||||
; region = npi2 & 3;
|
||||
|
||||
and r8d, 3 ; r8d <-- region
|
||||
subsd xmm0, xmm2 ; xmm0 <-- r = rhead - rtail
|
||||
subsd xmm6, xmm0 ; xmm6 <-- rhead - r
|
||||
subsd xmm6, xmm2 ; xmm6 <-- rr = (rhead - r) - rtail
|
||||
|
||||
Ltan_do_tan_computation:
|
||||
and r8d, 1 ; r8d <-- region & 1
|
||||
movaps xmm1, xmm6
|
||||
call _tan_piby4
|
||||
test r10d, r10d
|
||||
je Ltan_pos_return
|
||||
xorpd xmm0, QWORD PTR L_signbit
|
||||
Ltan_pos_return:
|
||||
RestoreXmm xmm7, save_xmm7
|
||||
RestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret 0
|
||||
|
||||
ALIGN 16
|
||||
Ltan_x_is_very_large:
|
||||
; Reduce x into range [-pi/4,pi/4] (general case)
|
||||
movaps xmm0, xmm6
|
||||
mov QWORD PTR [rsp+save_r10], r10
|
||||
call __remainder_piby2_forAsm ; this call clobbers r10
|
||||
mov r10, QWORD PTR [rsp+save_r10]
|
||||
movapd xmm6,xmm1 ; xmm6 <-- rr
|
||||
mov r8d,eax ; r8d <-- region
|
||||
jmp Ltan_do_tan_computation
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;
|
||||
; From here on, it is assumed that the hardware supports FMA3 (and AVX).
|
||||
|
||||
ALIGN 16
|
||||
Ltan_fma3:
|
||||
vmovq r9,xmm0
|
||||
mov rdx,r9 ; rdx <-- x
|
||||
btr r9,63 ; r9 <-- |x|
|
||||
cmp r9,L_piby4_lead
|
||||
jae Ltan_fma3_absx_gt_pio4 ; Note that NaN will branch
|
||||
|
||||
Ltan_fma3_absx_le_pio4:
|
||||
; no argument reduction is needed, so recip is 0, xx is 0.
|
||||
; Note that this routine is not special-casing very small |x|
|
||||
vmovsd xmm5,L_piby4_lead
|
||||
vmovsd xmm6,L_piby4_tail
|
||||
vxorpd xmm1,xmm1,xmm1 ; xx <-- 0.
|
||||
vxorpd xmm7,xmm7,xmm7 ; transform <-- 0
|
||||
comisd xmm0,L_point_68
|
||||
jbe Ltan_fma3_small_x_le_point_68
|
||||
Ltan_fma3_x_small_gt_point_68:
|
||||
vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0
|
||||
vsubsd xmm0,xmm5,xmm0 ; x = piby4_lead - x
|
||||
vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail
|
||||
jmp Ltan_fma3_compute_Remez_for_small_x
|
||||
ALIGN 16
|
||||
Ltan_fma3_small_x_le_point_68:
|
||||
comisd xmm0,L_n_point_68
|
||||
jae Ltan_fma3_compute_Remez_for_small_x
|
||||
Ltan_fma3_small_x_lt_neg_point_68:
|
||||
vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0
|
||||
vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x
|
||||
vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail
|
||||
Ltan_fma3_compute_Remez_for_small_x:
|
||||
; At this point xmm0 holds x, possibly transformed
|
||||
|
||||
; now do core Remez rational approximation for x in [0,0.68]
|
||||
vmovsd xmm4,L_tan_q6
|
||||
vmovsd xmm3,L_tan_p4
|
||||
vmulsd xmm2,xmm0,xmm0 ; xx is 0, so xmm2 <-- r = x*x
|
||||
vfmadd213sd xmm4,xmm2,L_tan_q4
|
||||
vfmadd213sd xmm3,xmm2,L_tan_p2
|
||||
vfmadd213sd xmm4,xmm2,L_tan_q2
|
||||
vfmadd213sd xmm3,xmm2,L_tan_p0 ; xmm3 <-- p2 (polynomial)
|
||||
vfmadd213sd xmm4,xmm2,L_tan_q0 ; xmm4 <-- q3 (polynomial)
|
||||
vdivsd xmm3,xmm3,xmm4 ; xmm3 <-- r3 = p2/q3
|
||||
vmulsd xmm3,xmm3,xmm2 ; xmm3 <-- r * r3
|
||||
vfmadd132sd xmm0,xmm0,xmm3 ; xx = 0, so xmm0 <-- t = x + x*(r*r3)
|
||||
comisd xmm7,L_zero ; did we transform x?
|
||||
; if x was transformed, we need to transform t to get answer;
|
||||
; if not, the answer is just t.
|
||||
je Ltan_fma3_ext_piby4_zero
|
||||
|
||||
; x was transformed, so answer is +- (1. - 2.*t/(1.+t))
|
||||
; (remember recip is 0 here)
|
||||
vmovsd xmm3,L_one
|
||||
vaddsd xmm4,xmm0,L_one ; xmm4 <-- 1. + t
|
||||
vdivsd xmm6,xmm0,xmm4 ; xmm6 <-- t / (1.+t)
|
||||
vfnmadd231sd xmm3,xmm6,L_two ; xmm3 <-- 1. - 2.*t/(1.+t)
|
||||
vmulsd xmm0,xmm3,xmm7 ; multiply by +- 1.
|
||||
|
||||
Ltan_fma3_ext_piby4_zero:
|
||||
; restore volatile registers
|
||||
AVXRestoreXmm xmm7, save_xmm7
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret 0
|
||||
|
||||
ALIGN 16
|
||||
Ltan_fma3_absx_gt_pio4: ;;; come here if |x| > pi/4
|
||||
cmp r9, L_inf_mask_64
|
||||
jae Ltan_fma3_naninf
|
||||
|
||||
;Ltan_fma3_range_reduce:
|
||||
vmovapd [store_input + rsp],xmm0 ; save copy of x
|
||||
vmovq xmm0,r9 ; xmm0l <-- |x|
|
||||
cmp r9,L_moderate_arg_bdl
|
||||
jge Ltan_fma3_remainder_piby2 ; go elsewhere if |x| > 500000.
|
||||
|
||||
; Note that __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl
|
||||
; have calling conventions that differ from the C routine
|
||||
; on input
|
||||
; |x| is in xmm0
|
||||
; on output
|
||||
; z is in xmm0
|
||||
; zz is in xmm1
|
||||
; where z + zz = arg reduced |x| and zz is small compared to z
|
||||
; region of |x| is in rax
|
||||
|
||||
Ltan_fma3_remainder_piby2_small:
|
||||
; Boldo-Daumas-Li reduction for reasonably small |x|
|
||||
call __remainder_piby2_fma3_bdl
|
||||
|
||||
|
||||
Ltan_fma3_full_computation:
|
||||
; we have done argument reduction; recip and xx may be nonzero
|
||||
; x is in xmm0, xx is in xmm1
|
||||
; recip is region & 1, and region is in rax.
|
||||
|
||||
vmovsd xmm5,L_piby4_lead
|
||||
vmovsd xmm6,L_piby4_tail
|
||||
|
||||
vxorpd xmm7,xmm7,xmm7 ; transform <-- 0
|
||||
vcomisd xmm0,L_point_68
|
||||
jbe Ltan_fma3_full_x_le_point_68
|
||||
Ltan_fma3_full_x_gt_point_68:
|
||||
vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0
|
||||
vsubsd xmm0,xmm5,xmm0 ; xmm0 <-- x = piby4_lead - x
|
||||
vsubsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = piby4_tail - xx
|
||||
vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl
|
||||
vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0
|
||||
jmp Ltan_fma3_compute_Remez
|
||||
ALIGN 16
|
||||
Ltan_fma3_full_x_le_point_68:
|
||||
vcomisd xmm0,L_n_point_68
|
||||
jae Ltan_fma3_compute_Remez
|
||||
Ltan_fma3_full_x_lt_neg_point_68:
|
||||
vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0
|
||||
vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x
|
||||
vaddsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = piby4_tail + xx
|
||||
vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl
|
||||
vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0
|
||||
|
||||
Ltan_fma3_compute_Remez:
|
||||
vmulsd xmm2,xmm0,xmm0 ; xmm2 <-- x*x
|
||||
vmulsd xmm5,xmm1,xmm0 ; xmm5 <-- x*xx
|
||||
vfmadd132sd xmm5,xmm2,L_two ; xmm5 <-- r = x*x + 2.*x*xx
|
||||
vmovsd xmm2,L_tan_p4
|
||||
vfmadd213sd xmm2,xmm5,L_tan_p2 ; xmm2 <-- p4*r+p2
|
||||
vfmadd213sd xmm2,xmm5,L_tan_p0 ; xmm2 <-- p = (p4*r+p2)*r+p0
|
||||
vmovsd xmm4,L_tan_q6
|
||||
vfmadd213sd xmm4,xmm5,L_tan_q4 ; xmm4 <-- q6*r+q4
|
||||
vfmadd213sd xmm4,xmm5,L_tan_q2 ; xmm4 <-- (q6*r+q4)*r+q2
|
||||
vfmadd213sd xmm4,xmm5,L_tan_q0 ; xmm4 <-- q = ((q6*r+q4)*r+q2)*r+q0
|
||||
vdivsd xmm2,xmm2,xmm4 ; xmm2 <-- p/q
|
||||
vmulsd xmm2,xmm2,xmm5 ; xmm2 <-- r*p/q
|
||||
vfmadd213sd xmm2,xmm0,xmm1 ; xmm2 <-- t2 = xx + x*r*(p/q)
|
||||
vaddsd xmm1,xmm0,xmm2 ; xmm1 <-- t = (t1=x) + t2
|
||||
|
||||
; If |x| > .68 we transformed, and t is an approximation of
|
||||
; tan(pi/4 +- (x+xx))
|
||||
; otherwise, t is just tan(x+xx)
|
||||
vxorpd xmm6,xmm6,xmm6
|
||||
vcomisd xmm7,xmm6 ; did we transform? (|x| > .68) ?
|
||||
jz Ltan_fma3_if_recip_set ; if not, go check recip
|
||||
|
||||
Ltan_fma3_if_transfor_set:
|
||||
; Because we transformed x+xx, we have to transform t before returning
|
||||
; let transform be 1 for x > .68, -1 for x < -.68, then we return
|
||||
; transform * (recip ? (2.*t/(t-1.) - 1.) : (1. - 2.*t/(1.+t)))
|
||||
vaddsd xmm6,xmm1,xmm1 ; xmm6 <-- 2.*t
|
||||
vmovsd xmm4,L_one
|
||||
vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1
|
||||
vsubsd xmm5,xmm1,xmm4 ; xmm5 <-- t-1
|
||||
bt rax,0
|
||||
jc Ltan_fma3_transform_and_recip_set
|
||||
; here recip is not set
|
||||
vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1
|
||||
vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t+1)
|
||||
vfnmadd132sd xmm2,xmm4,L_two ; xmm2 <-- 1 - 2*t/(t+1)
|
||||
vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(1 - 2*t/(t+1))
|
||||
jmp Ltan_fma3_exit_piby4
|
||||
ALIGN 16
|
||||
Ltan_fma3_transform_and_recip_set:
|
||||
; here recip is set
|
||||
vsubsd xmm2,xmm1,xmm4 ; xmm2 <-- t-1
|
||||
vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t-1)
|
||||
vfmsub132sd xmm2,xmm4,L_two ; xmm2 <-- 2*t/(t-1) - 1
|
||||
vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(2*t/(t-1) - 1)
|
||||
jmp Ltan_fma3_exit_piby4
|
||||
|
||||
ALIGN 16
|
||||
Ltan_fma3_if_recip_set:
|
||||
; Here we did not transform x and xx, but if we are in an odd quadrant
|
||||
; we will need to return -1./(t1+t2), computed accurately
|
||||
; (t = t1+t2 is in xmm1, t1 = x is in xmm0, t2 is in xmm2)
|
||||
bt rax,0
|
||||
jnc Ltan_fma3_exit_piby4
|
||||
|
||||
vandpd xmm7,xmm1,L_half_mask ; xmm7 <-- z1 = high bits of t
|
||||
vsubsd xmm4,xmm7,xmm0 ; xmm4 <-- z1 - t1
|
||||
vsubsd xmm4,xmm2,xmm4 ; xmm4 <-- z2 = t2 - (z1-t1)
|
||||
vmovsd xmm2,L_n_one
|
||||
vdivsd xmm2,xmm2,xmm1 ; xmm2 <-- trec = -1./t
|
||||
vandpd xmm5,xmm2,L_half_mask ; xmm5 <-- trec_top=high bits of trec
|
||||
vfmadd213sd xmm7,xmm5,L_one ; xmm7 <-- trec_top*z1 + 1.
|
||||
vfmadd231sd xmm7 ,xmm4,xmm5 ; xmm7 <-- z2*trec_top + (trec_top*z1 + 1.)
|
||||
vfmadd213sd xmm7,xmm2,xmm5 ; xmm7 <-- u = trec_top + trec*(z2*trec_top + (trec_top*z1+1.))
|
||||
vmovapd xmm1,xmm7 ; xmm1 <-- u
|
||||
|
||||
Ltan_fma3_exit_piby4:
|
||||
vmovapd xmm0,xmm1 ; xmm0 <-- t, u, or v, as needed
|
||||
|
||||
vmovapd xmm1,[store_input + rsp]
|
||||
vandpd xmm1,xmm1,L_signbit
|
||||
vxorpd xmm0,xmm0,xmm1 ; tan(-x) = -tan(x)
|
||||
|
||||
; restore volatile registers
|
||||
AVXRestoreXmm xmm7, save_xmm7
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Ltan_fma3_remainder_piby2:
|
||||
; argument reduction for general x
|
||||
|
||||
call __remainder_piby2_fma3
|
||||
jmp Ltan_fma3_full_computation
|
||||
|
||||
|
||||
Ltan_fma3_naninf: ; here argument is +-Inf or NaN. Special case.
|
||||
call fname_special
|
||||
AVXRestoreXmm xmm7, save_xmm7
|
||||
AVXRestoreXmm xmm6, save_xmm6
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
fname endp
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
.const
; Frame layout for _tan_piby4 (byte offsets from rsp after StackAllocate).
; 048h plus the 8-byte return address is a multiple of 16, and 030h is itself
; a multiple of 16, so the xmm6 save slot is 16-byte aligned whenever rsp was
; 16-byte aligned at the call site.
tan_piby4_save_xmm6     EQU     030h        ; save slot for xmm6
tan_piby4_stack_size    EQU     048h        ; total frame size
.code
ALIGN 16
;-----------------------------------------------------------------------------
; _tan_piby4 -- tangent kernel for a reduced argument (SSE2 path).
;
; C equivalent (see tan_piby4 in tan.c):
;     double tan_piby4(double x, double xx, int recip);
;
; Note: register usage is private to this file; it is NOT the standard
; Win64 argument assignment.
;
; In:    xmm0 = x   (head of the reduced argument, x+xx in [-pi/4,pi/4])
;        xmm1 = xx  (tail of the reduced argument)
;        r8d  = recip; nonzero selects -1/tan(x+xx), zero selects tan(x+xx)
; Out:   xmm0 = result
; Uses:  eax/rax (transform flag, then scratch), rcx (bit mask, recip-only
;        path), xmm0-xmm5, flags.  xmm6 is used as a work register and is
;        saved in the prologue and restored before every ret.
;
; Method:
;   If x >  .68, evaluate at pi/4 - (x+xx) and undo with
;        tan(pi/4-x) = (1-tan(x))/(1+tan(x))            (transform =  1)
;   If x < -.68, evaluate at pi/4 + (x+xx) and undo with
;        tan(x-pi/4) = (tan(x)-1)/(tan(x)+1)            (transform = -1)
;   Core: Remez [2,3] rational approximation in r = x*x + 2*x*xx,
;        t2 = xx + x*r*p(r)/q(r),   t = x + t2.
;-----------------------------------------------------------------------------
_tan_piby4 PROC PRIVATE FRAME
    StackAllocate tan_piby4_stack_size
    SaveXmm  xmm6, tan_piby4_save_xmm6
    .ENDPROLOG

    xor      eax, eax                       ; eax <-- transform = 0

    comisd   xmm0, QWORD PTR L_point_68
    movaps   xmm3, xmm1                     ; xmm3 <-- xx
    movaps   xmm6, xmm0                     ; xmm6 <-- x
    jbe      Ltan_piby4_x_le_point_68       ; uses comisd flags (movaps leaves flags untouched)

    ; Here x > .68, so we transform x using the identity
    ; tan(pi/4-x) = (1-tan(x))/(1+tan(x))

    movsd    xmm2, QWORD PTR L_piby4_lead
    mov      eax, 1                         ; eax <-- transform = 1
    subsd    xmm2, xmm0                     ; xmm2 <-- x = piby4_lead - x
    movsd    xmm0, QWORD PTR L_piby4_tail
    subsd    xmm0, xmm1                     ; xmm0 <-- xl = piby4_tail - xx
    movaps   xmm6, xmm2
    addsd    xmm6, xmm0                     ; xmm6 <-- x = x + xl
    xorps    xmm3,xmm3                      ; xmm3 <-- xx = 0.
    jmp      Ltan_piby4_do_remez

Ltan_piby4_x_le_point_68:
    ; else if (x < -0.68)

    movsd    xmm0, QWORD PTR L_n_point_68
    comisd   xmm0, xmm6
    jbe      Ltan_piby4_do_remez            ; jump if x >= -.68

    ; Here x < -.68, so we transform x using the identity
    ; tan(x-pi/4) = (tan(x)-1)/(tan(x)+1)

    addsd    xmm6, QWORD PTR L_piby4_lead   ; xmm6 <-- x = piby4_lead + x
    addsd    xmm3, QWORD PTR L_piby4_tail   ; xmm3 <-- xl = piby4_tail + xx
    or       eax, -1                        ; eax <-- transform = -1
    addsd    xmm6, xmm3                     ; xmm6 <-- x = x + xl
    xorps    xmm3, xmm3                     ; xmm3 <-- xx = 0

Ltan_piby4_do_remez:

    ; Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68].
    ; At this point xmm6 = x and xmm3 = xx (both possibly transformed).
    movaps   xmm0, xmm6
    movaps   xmm2, xmm6

    mulsd    xmm0, xmm6                     ; xmm0 <-- x*x
    addsd    xmm2, xmm2                     ; xmm2 <-- 2*x
    mulsd    xmm2, xmm3                     ; xmm2 <-- 2*x*xx
    addsd    xmm2, xmm0                     ; xmm2 <-- r = x*x + 2*x*xx

    ; Rational approximation: xmm5 accumulates the numerator p(r),
    ; xmm0 (after the x*r product is consumed) the denominator q(r).
    movaps   xmm0, xmm2
    movaps   xmm5, xmm2
    movaps   xmm1, xmm2
    mulsd    xmm5, QWORD PTR L_tan_p4       ; xmm5 <-- p4*r
    mulsd    xmm1, QWORD PTR L_tan_q6       ; xmm1 <-- q6*r
    mulsd    xmm0, xmm6                     ; xmm0 <-- x*r
    addsd    xmm5, QWORD PTR L_tan_p2       ; xmm5 <-- p4*r + p2
    mulsd    xmm5, xmm2
    addsd    xmm5, QWORD PTR L_tan_p0       ; xmm5 <-- p = (p4*r + p2)*r + p0
    mulsd    xmm5, xmm0                     ; xmm5 <-- x*r*p
    movsd    xmm0, QWORD PTR L_tan_q4
    addsd    xmm0, xmm1                     ; xmm0 <-- q6*r + q4
    mulsd    xmm0, xmm2
    addsd    xmm0, QWORD PTR L_tan_q2       ; xmm0 <-- (q6*r + q4)*r + q2
    mulsd    xmm0, xmm2
    addsd    xmm0, QWORD PTR L_tan_q0       ; xmm0 <-- q = ((q6*r + q4)*r + q2)*r + q0
    divsd    xmm5, xmm0                     ; xmm5 <-- x*r*p/q
    addsd    xmm5, xmm3                     ; xmm5 <-- t2 = xx + x*r*p/q

    test     eax, eax
    je       Ltan_piby4_transform_false

    addsd    xmm5, xmm6                     ; xmm5 <-- t = t1 + t2 = x + t2

    test     r8d, r8d
    je       Ltan_piby4_transform_true_recip_false

    ; Here transform and recip are both true.
    ; return transform*(2*t/(t-1) - 1.0);

    movaps   xmm0, xmm5
    subsd    xmm5, QWORD PTR L_one          ; xmm5 <-- t - 1
    movd     xmm1, eax                      ; xmm1 <-- transform (int32, +1 or -1)
    addsd    xmm0, xmm0                     ; xmm0 <-- 2*t
    divsd    xmm0, xmm5                     ; xmm0 <-- 2*t/(t-1)
    cvtdq2pd xmm1, xmm1                     ; xmm1 <-- (double)transform
    subsd    xmm0, QWORD PTR L_one          ; xmm0 <-- 2*t/(t-1) - 1
    mulsd    xmm0, xmm1
    RestoreXmm xmm6, tan_piby4_save_xmm6
    StackDeallocate tan_piby4_stack_size
    ret      0

Ltan_piby4_transform_true_recip_false:
    ; Here return transform*(1.0 - 2*t/(1+t));
    movsd    xmm0, QWORD PTR L_one
    movaps   xmm1, xmm5
    addsd    xmm5, xmm0                     ; xmm5 <-- 1 + t
    addsd    xmm1, xmm1                     ; xmm1 <-- 2*t
    divsd    xmm1, xmm5                     ; xmm1 <-- 2*t/(1+t)
    subsd    xmm0, xmm1                     ; xmm0 <-- 1 - 2*t/(1+t)
    movd     xmm1, eax                      ; xmm1 <-- transform (int32, +1 or -1)
    cvtdq2pd xmm1, xmm1                     ; xmm1 <-- (double)transform
    mulsd    xmm0, xmm1
    RestoreXmm xmm6, tan_piby4_save_xmm6
    StackDeallocate tan_piby4_stack_size
    ret      0

Ltan_piby4_transform_false:
    test     r8d, r8d
    je       Ltan_piby4_atransform_false_recip_false

    ; Here transform is false but recip is true
    ; We return an accurate computation of -1.0/(t1 + t2):
    ;   z1       = t with its low 32 bits cleared
    ;   z2       = t2 - (z1 - t1)
    ;   trec     = -1.0/t
    ;   trec_top = trec with its low 32 bits cleared

    movsd    xmm4, QWORD PTR L_n_one
    movaps   xmm0, xmm5
    mov      rcx, -4294967296               ; ffffffff00000000H
    addsd    xmm0, xmm6                     ; xmm0 <-- t = t1 + t2
    movd     rax, xmm0                      ; really movq
    divsd    xmm4, xmm0                     ; xmm4 <-- trec = -1.0/t
    and      rax, rcx
    movd     xmm3, rax                      ; really movq; xmm3 <-- z1
    movaps   xmm1, xmm3
    subsd    xmm1, xmm6                     ; xmm1 <-- z1 - t1

    movd     rax, xmm4                      ; really movq
    subsd    xmm5, xmm1                     ; xmm5 <-- z2 = t2 - (z1 - t1)

    and      rax, rcx
    movd     xmm2, rax                      ; really movq; xmm2 <-- trec_top

    ; return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2);

    movaps   xmm0, xmm2
    mulsd    xmm5, xmm2                     ; xmm5 <-- trec_top * z2
    mulsd    xmm0, xmm3                     ; xmm0 <-- trec_top * z1
    addsd    xmm0, QWORD PTR L_one
    addsd    xmm0, xmm5
    mulsd    xmm0, xmm4
    addsd    xmm0, xmm2

    RestoreXmm xmm6, tan_piby4_save_xmm6
    StackDeallocate tan_piby4_stack_size
    ret      0

Ltan_piby4_atransform_false_recip_false:
    ; Here both transform and recip are false; we just return t1 + t2
    addsd    xmm5, xmm6
    movaps   xmm0, xmm5
    RestoreXmm xmm6, tan_piby4_save_xmm6
    StackDeallocate tan_piby4_stack_size
    ret      0

_tan_piby4 endp
|
||||
END
|
242
sdk/lib/crt/math/libm_sse2/tan.c
Normal file
242
sdk/lib/crt/math/libm_sse2/tan.c
Normal file
|
@ -0,0 +1,242 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_NAN_WITH_FLAGS
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#define USE_HANDLE_ERROR
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_NAN_WITH_FLAGS
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
/* tan(x + xx) approximation valid on the interval [-pi/4,pi/4].
   If recip is true return -1/tan(x + xx) instead.

   x     - head of the reduced argument
   xx    - tail of the reduced argument (small correction to x)
   recip - nonzero to return -1/tan(x + xx), zero to return tan(x + xx)

   The bit patterns of doubles are held in unsigned long long: the masks
   below are 64 bits wide, and unsigned long is only 32 bits on LLP64
   targets such as Windows. */
static inline double tan_piby4(double x, double xx, int recip)
{
  double r, t1, t2, xl;
  int transform = 0;
  static const double
    piby4_lead = 7.85398163397448278999e-01, /* 0x3fe921fb54442d18 */
    piby4_tail = 3.06161699786838240164e-17; /* 0x3c81a62633145c06 */

  /* In order to maintain relative precision transform using the identity:
     tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4.
     Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. */

  if (x > 0.68)
    {
      transform = 1;
      x = piby4_lead - x;
      xl = piby4_tail - xx;
      x += xl;
      xx = 0.0;
    }
  else if (x < -0.68)
    {
      transform = -1;
      x = piby4_lead + x;
      xl = piby4_tail + xx;
      x += xl;
      xx = 0.0;
    }

  /* Core Remez [2,3] approximation to tan(x+xx) on the
     interval [0,0.68]. */

  r = x*x + 2.0 * x * xx;
  t1 = x;
  t2 = xx + x*r*
    (0.372379159759792203640806338901e0 +
     (-0.229345080057565662883358588111e-1 +
      0.224044448537022097264602535574e-3*r)*r)/
    (0.111713747927937668539901657944e1 +
     (-0.515658515729031149329237816945e0 +
      (0.260656620398645407524064091208e-1 -
       0.232371494088563558304549252913e-3*r)*r)*r);

  /* Reconstruct tan(x) in the transformed case. */

  if (transform)
    {
      double t;
      t = t1 + t2;
      if (recip)
        return transform*(2*t/(t-1) - 1.0);
      else
        return transform*(1.0 - 2*t/(1+t));
    }

  if (recip)
    {
      /* Compute -1.0/(t1 + t2) accurately: split t and trec into a head
         with the low 32 bits cleared (z1, trec_top) and apply one
         correction step using the remainder z2. */
      double trec, trec_top, z1, z2, t;
      unsigned long long u;   /* must be 64 bits wide, see header comment */
      t = t1 + t2;
      GET_BITS_DP64(t, u);
      u &= 0xffffffff00000000;
      PUT_BITS_DP64(u, z1);
      z2 = t2 - (z1 - t1);
      trec = -1.0 / t;
      GET_BITS_DP64(trec, u);
      u &= 0xffffffff00000000;
      PUT_BITS_DP64(u, trec_top);
      return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2);

    }
  else
    return t1 + t2;
}
|
||||
|
||||
#pragma function(tan)
|
||||
|
||||
double tan(double x)
|
||||
{
|
||||
double r, rr;
|
||||
int region, xneg;
|
||||
|
||||
unsigned long ux, ax;
|
||||
GET_BITS_DP64(x, ux);
|
||||
ax = (ux & ~SIGNBIT_DP64);
|
||||
if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
|
||||
{
|
||||
if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
|
||||
{
|
||||
if (ax < 0x3e40000000000000) /* abs(x) < 2.0^(-27) */
|
||||
{
|
||||
if (ax == 0x0000000000000000) return x;
|
||||
else return val_with_flags(x, AMD_F_INEXACT);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Using a temporary variable prevents 64-bit VC++ from
|
||||
rearranging
|
||||
x + x*x*x*0.333333333333333333;
|
||||
into
|
||||
x * (1 + x*x*0.333333333333333333);
|
||||
The latter results in an incorrectly rounded answer. */
|
||||
double tmp;
|
||||
tmp = x*x*x*0.333333333333333333;
|
||||
return x + tmp;
|
||||
}
|
||||
}
|
||||
else
|
||||
return tan_piby4(x, 0.0, 0);
|
||||
}
|
||||
else if ((ux & EXPBITS_DP64) == EXPBITS_DP64)
|
||||
{
|
||||
/* x is either NaN or infinity */
|
||||
if (ux & MANTBITS_DP64)
|
||||
/* x is NaN */
|
||||
return _handle_error("tan", OP_TAN, ux|0x0008000000000000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0, 1);
|
||||
else
|
||||
/* x is infinity. Return a NaN */
|
||||
return _handle_error("tan", OP_TAN, INDEFBITPATT_DP64, _DOMAIN, AMD_F_INVALID,
|
||||
EDOM, x, 0.0, 1);
|
||||
}
|
||||
xneg = (ax != ux);
|
||||
|
||||
|
||||
if (xneg)
|
||||
x = -x;
|
||||
|
||||
if (x < 5.0e5)
|
||||
{
|
||||
/* For these size arguments we can just carefully subtract the
|
||||
appropriate multiple of pi/2, using extra precision where
|
||||
x is close to an exact multiple of pi/2 */
|
||||
static const double
|
||||
twobypi = 6.36619772367581382433e-01, /* 0x3fe45f306dc9c883 */
|
||||
piby2_1 = 1.57079632673412561417e+00, /* 0x3ff921fb54400000 */
|
||||
piby2_1tail = 6.07710050650619224932e-11, /* 0x3dd0b4611a626331 */
|
||||
piby2_2 = 6.07710050630396597660e-11, /* 0x3dd0b4611a600000 */
|
||||
piby2_2tail = 2.02226624879595063154e-21, /* 0x3ba3198a2e037073 */
|
||||
piby2_3 = 2.02226624871116645580e-21, /* 0x3ba3198a2e000000 */
|
||||
piby2_3tail = 8.47842766036889956997e-32; /* 0x397b839a252049c1 */
|
||||
double t, rhead, rtail;
|
||||
int npi2;
|
||||
unsigned long uy, xexp, expdiff;
|
||||
xexp = ax >> EXPSHIFTBITS_DP64;
|
||||
/* How many pi/2 is x a multiple of? */
|
||||
if (ax <= 0x400f6a7a2955385e) /* 5pi/4 */
|
||||
{
|
||||
if (ax <= 0x4002d97c7f3321d2) /* 3pi/4 */
|
||||
npi2 = 1;
|
||||
else
|
||||
npi2 = 2;
|
||||
}
|
||||
else if (ax <= 0x401c463abeccb2bb) /* 9pi/4 */
|
||||
{
|
||||
if (ax <= 0x4015fdbbe9bba775) /* 7pi/4 */
|
||||
npi2 = 3;
|
||||
else
|
||||
npi2 = 4;
|
||||
}
|
||||
else
|
||||
npi2 = (int)(x * twobypi + 0.5);
|
||||
/* Subtract the multiple from x to get an extra-precision remainder */
|
||||
rhead = x - npi2 * piby2_1;
|
||||
rtail = npi2 * piby2_1tail;
|
||||
GET_BITS_DP64(rhead, uy);
|
||||
expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
if (expdiff > 15)
|
||||
{
|
||||
/* The remainder is pretty small compared with x, which
|
||||
implies that x is a near multiple of pi/2
|
||||
(x matches the multiple to at least 15 bits) */
|
||||
t = rhead;
|
||||
rtail = npi2 * piby2_2;
|
||||
rhead = t - rtail;
|
||||
rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
if (expdiff > 48)
|
||||
{
|
||||
/* x matches a pi/2 multiple to at least 48 bits */
|
||||
t = rhead;
|
||||
rtail = npi2 * piby2_3;
|
||||
rhead = t - rtail;
|
||||
rtail = npi2 * piby2_3tail - ((t - rhead) - rtail);
|
||||
}
|
||||
}
|
||||
r = rhead - rtail;
|
||||
rr = (rhead - r) - rtail;
|
||||
region = npi2 & 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Reduce x into range [-pi/4,pi/4] */
|
||||
__remainder_piby2(x, &r, &rr, ®ion);
|
||||
}
|
||||
|
||||
if (xneg)
|
||||
return -tan_piby4(r, rr, region & 1);
|
||||
else
|
||||
return tan_piby4(r, rr, region & 1);
|
||||
}
|
551
sdk/lib/crt/math/libm_sse2/tanf.asm
Normal file
551
sdk/lib/crt/math/libm_sse2/tanf.asm
Normal file
|
@ -0,0 +1,551 @@
|
|||
;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
; An implementation of the tanf function using the fma3 instruction.
|
||||
;
|
||||
; Prototype:
|
||||
;
|
||||
; float tanf(float x);
|
||||
;
|
||||
; Computes tanf(x).
|
||||
; It will provide proper C99 return values,
|
||||
; but may not raise floating point status bits properly.
|
||||
; Based on the NAG C implementation.
|
||||
;
|
||||
.const
ALIGN 16
; ---- 16-byte entries (same value in both qwords) ----
; Keep these first and in this order: each one is 16 bytes, so every label
; stays on a 16-byte boundary after the ALIGN above.
L_sign_mask     DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh   ; clears the sign bit of a double
L_twobypi       DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h   ; 2/pi
L_int_three     DQ 00000000000000003h, 00000000000000003h   ; integer 3
L_int_one       DQ 00000000000000001h, 00000000000000001h   ; integer 1
L_signbit       DQ 08000000000000000h, 08000000000000000h   ; sign bit of a double

; ---- coefficient table, addressed as L_tanf + byte offset ----
L_tanf          DQ 03FD8A8B0DA56CB17h       ; +000h  c0
                DQ 0BF919DBA6EFD6AADh       ; +008h  c1
                DQ 03FF27E84A3E73A2Eh       ; +010h  d0
                DQ 0BFE07266D7B3511Bh       ; +018h  d1
                DQ 03F92E29003C692D9h       ; +020h  d2

; ---- scalar double constants and bit-pattern thresholds ----
L_large_x_sse2  DQ 04160000000000000h       ; 8388608.
L_large_x_fma3  DQ 041E921FB40000000h       ; 3.373259264e9
L_point_333     DQ 03FD5555555555555h       ; 1/3
L_mask_3e4      DQ 03e40000000000000h       ; bits of 2^-27
L_mask_3f2      DQ 03f20000000000000h       ; bits of 2^-13
L_point_five    DQ 03FE0000000000000h       ; 0.5
L_piby2_1       DQ 03FF921FB54400000h       ; leading part of pi/2
L_piby2_1tail   DQ 03DD0B4611A626331h       ; tail that goes with L_piby2_1
L_piby2_lead    DQ 03ff921fb54442d18h       ; pi/2 rounded to double
L_n_one         DQ 0BFF0000000000000h       ; -1.0
L_piby4         DQ 03fe921fb54442d18h       ; pi/4 rounded to double
L_min_norm      DQ 00010000000000000h       ; smallest normal double


; ---- single-precision exponent mask (bit pattern of +infinity) ----
L_inf_mask_32   DD 07F800000h, 07F800000h
|
||||
|
||||
EXTRN __use_fma3_lib:DWORD
|
||||
EXTRN __L_2_by_pi_bits:BYTE
|
||||
|
||||
fname TEXTEQU <tanf>
|
||||
fname_special TEXTEQU <_tanf_special>
|
||||
|
||||
; define local variable storage offsets
|
||||
; actually there aren't any, but we need to leave room for _tanf_special.
|
||||
dummy_space EQU 20h
|
||||
stack_size EQU 38h
|
||||
|
||||
include fm.inc
|
||||
|
||||
;Define name and any external functions being called
|
||||
EXTERN fname_special : PROC
|
||||
|
||||
.code
|
||||
PUBLIC fname
|
||||
fname PROC FRAME
|
||||
StackAllocate stack_size
|
||||
.ENDPROLOG
|
||||
cmp DWORD PTR __use_fma3_lib, 0
|
||||
jne Ltanf_fma3
|
||||
|
||||
Ltanf_sse2:
|
||||
movd eax,xmm0
|
||||
mov r8d,L_inf_mask_32
|
||||
and eax,r8d
|
||||
cmp eax, r8d
|
||||
jz Ltanf_sse2_naninf
|
||||
|
||||
cvtss2sd xmm5,xmm0
|
||||
movd r9,xmm5
|
||||
btr r9,63 ; r9 <-- |x|
|
||||
|
||||
cmp r9,L_piby4
|
||||
jg Ltanf_sse2_range_reduce
|
||||
cmp r9,L_mask_3f2 ; compare to 2^-13 = 0.0001220703125
|
||||
jge Ltanf_sse2_compute_tanf_piby_4
|
||||
cmp r9,L_mask_3e4 ; compare to 2^-27 = 7.4505805969238281e-009
|
||||
jge Ltanf_sse2_compute_x_xxx_0_333
|
||||
; At this point tan(x) ~= x; if it's not exact, set the inexact flag.
|
||||
|
||||
test r9, r9
|
||||
je Ltanf_sse2_exact_return
|
||||
movsd xmm1, L_n_one
|
||||
addsd xmm1, L_min_norm ; set inexact
|
||||
|
||||
Ltanf_sse2_exact_return:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Ltanf_sse2_compute_x_xxx_0_333:
|
||||
movapd xmm2,xmm5
|
||||
mulsd xmm2,xmm2 ; xmm2 <-- x^2
|
||||
movapd xmm0,xmm2
|
||||
mulsd xmm0,xmm5 ; xmm0 <-- x^3
|
||||
mulsd xmm0,L_point_333
|
||||
addsd xmm0,xmm5 ; x + x*x*x*0.3333333333333333;
|
||||
jmp Ltanf_sse2_return_s
|
||||
|
||||
ALIGN 16
|
||||
Ltanf_sse2_compute_tanf_piby_4:
|
||||
movapd xmm0,xmm5 ; xmm0 <-- x (as double)
|
||||
|
||||
movapd xmm1,xmm0
|
||||
mulsd xmm1,xmm0 ; xmm1 <-- x*x
|
||||
|
||||
movsd xmm3,L_tanf+008h ; xmm3 <-- c1
|
||||
mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2
|
||||
addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0
|
||||
|
||||
movsd xmm2,L_tanf+020h ; xmm2 <-- d2
|
||||
mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2
|
||||
addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
|
||||
mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2
|
||||
addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
|
||||
divsd xmm3,xmm2 ; xmm3 <-- c/d
|
||||
mulsd xmm1,xmm0 ; xmm1 <-- x^3
|
||||
mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d
|
||||
addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d
|
||||
jmp Ltanf_sse2_return_s
|
||||
|
||||
Ltanf_sse2_range_reduce:
|
||||
movd xmm0,r9
|
||||
cmp r9,L_large_x_sse2
|
||||
jge Ltanf_sse2_tanf_reduce_large
|
||||
|
||||
Ltanf_sse2_tanf_reduce_moderate:
|
||||
movapd xmm1,xmm0
|
||||
andpd xmm1,L_sign_mask
|
||||
movapd xmm2,L_twobypi
|
||||
mulsd xmm2,xmm1
|
||||
addsd xmm2,L_point_five
|
||||
cvttpd2dq xmm4,xmm2
|
||||
cvtdq2pd xmm1,xmm4
|
||||
andpd xmm4,L_int_three ; xmm4 <-- region
|
||||
movapd xmm2,xmm0
|
||||
|
||||
movapd xmm3,xmm1
|
||||
mulsd xmm1,L_piby2_1
|
||||
subsd xmm2,xmm1
|
||||
mulsd xmm3,L_piby2_1tail ; xmm3 rtail
|
||||
movapd xmm0,xmm2
|
||||
subsd xmm0,xmm3
|
||||
subsd xmm2,xmm0
|
||||
movapd xmm1,xmm2
|
||||
subsd xmm1,xmm3
|
||||
jmp Ltanf_sse2_exit_s
|
||||
|
||||
Ltanf_sse2_tanf_reduce_large:
|
||||
lea r9,__L_2_by_pi_bits
|
||||
;xexp = (x >> 52) - 1023
|
||||
movd r11,xmm0
|
||||
mov rcx,r11
|
||||
shr r11,52
|
||||
sub r11,1023 ; r11 <-- xexp = exponent of input x
|
||||
;calculate the last byte from which to start multiplication
|
||||
;last = 134 (xexp >> 3)
|
||||
mov r10,r11
|
||||
shr r10,3
|
||||
sub r10,134 ; r10 <-- -last
|
||||
neg r10 ; r10 <-- last
|
||||
;load 64 bits of 2_by_pi
|
||||
mov rax,[r9+r10]
|
||||
;mantissa of x = ((x << 12) >> 12) | implied bit
|
||||
shl rcx,12
|
||||
shr rcx,12 ; rcx <-- mantissa part of input x
|
||||
bts rcx,52 ; add the implied bit as well
|
||||
;load next 128 bits of 2_by_pi
|
||||
add r10,8 ; increment to next 8 bytes of 2_by_pi
|
||||
movdqu xmm0,[r9+r10]
|
||||
;do three 64bit multiplications with mant of x
|
||||
mul rcx
|
||||
mov r8,rax ; r8 = last 64 bits of mul = res1[2]
|
||||
mov r10,rdx ; r10 = carry
|
||||
vmovq rax,xmm0
|
||||
mul rcx
|
||||
;resexp = xexp & 7
|
||||
and r11,7 ; r11 <-- resexp = last 3 bits of xexp
|
||||
psrldq xmm0,8
|
||||
add rax,r10 ; add the previous carry
|
||||
adc rdx,0
|
||||
mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]
|
||||
mov r10,rdx ; r10 <-- carry
|
||||
movd rax,xmm0
|
||||
mul rcx
|
||||
add r10,rax ;r10 = most sig 64 bits = res1[0]
|
||||
;find the region
|
||||
;last three bits ltb = most sig bits >> (54 resexp))
|
||||
; decimal point in last 18 bits == 8 lsb's in first 64 bits
|
||||
; and 8 msb's in next 64 bits
|
||||
;point_five = ltb & 01h;
|
||||
;region = ((ltb >> 1) + point_five) & 3;
|
||||
mov rcx,54
|
||||
mov rax,r10
|
||||
sub rcx,r11
|
||||
xor rdx,rdx ;rdx = sign of x
|
||||
shr rax,cl
|
||||
jnc Ltanf_sse2_no_point_five_f
|
||||
;;if there is carry.. then negate the result of multiplication
|
||||
not r10
|
||||
not r9
|
||||
not r8
|
||||
mov rdx,08000000000000000h
|
||||
ALIGN 16
|
||||
Ltanf_sse2_no_point_five_f:
|
||||
adc rax,0
|
||||
and rax,3
|
||||
movd xmm4,eax ; xmm4 <-- region
|
||||
;calculate the number of integer bits and zero them out
|
||||
mov rcx,r11
|
||||
add rcx,10 ; rcx = no. of integer bits
|
||||
shl r10,cl
|
||||
shr r10,cl ; r10 contains only mant bits
|
||||
sub rcx,64 ; form the exponent
|
||||
mov r11,rcx
|
||||
;find the highest set bit
|
||||
bsr rcx,r10
|
||||
jnz Ltanf_sse2_form_mantissa_f
|
||||
mov r10,r9
|
||||
mov r9,r8
|
||||
mov r8,0
|
||||
bsr rcx,r10 ;rcx = hsb
|
||||
sub r11,64
|
||||
ALIGN 16
|
||||
Ltanf_sse2_form_mantissa_f:
|
||||
add r11,rcx ; for exp of x
|
||||
sub rcx,52 ; rcx = no. of bits to shift in r10
|
||||
cmp rcx,0
|
||||
jl Ltanf_sse2_hsb_below_52_f
|
||||
je Ltanf_sse2_form_numbers_f
|
||||
;hsb above 52
|
||||
mov r8,r10
|
||||
shr r10,cl ; r10 = mantissa of x with hsb at 52
|
||||
shr r9,cl ; make space for bits from r10
|
||||
sub rcx,64
|
||||
neg rcx ; rcx = no of bits to shift r10
|
||||
shl r8,cl
|
||||
or r9,r8 ; r9 = mantissa bits of xx
|
||||
jmp Ltanf_sse2_form_numbers_f
|
||||
|
||||
ALIGN 16
|
||||
Ltanf_sse2_hsb_below_52_f:
|
||||
neg rcx
|
||||
mov rax,r9
|
||||
shl r10,cl
|
||||
shl r9,cl
|
||||
sub rcx,64
|
||||
neg rcx
|
||||
shr rax,cl
|
||||
or r10,rax
|
||||
shr r8,cl
|
||||
or r9,r8
|
||||
ALIGN 16
|
||||
Ltanf_sse2_form_numbers_f:
|
||||
add r11,1023
|
||||
btr r10,52 ; remove the implied bit
|
||||
mov rcx,r11
|
||||
or r10,rdx ; put the sign
|
||||
shl rcx,52
|
||||
or r10,rcx ; x is in r10
|
||||
movd xmm0,r10 ; xmm0 <-- x
|
||||
mulsd xmm0,L_piby2_lead
|
||||
|
||||
Ltanf_sse2_exit_s:
|
||||
movd eax,xmm4
|
||||
and eax,1 ; eax <-- region & 1
|
||||
movapd xmm1,xmm0
|
||||
mulsd xmm1,xmm0 ; xmm1 <-- x*x
|
||||
|
||||
movsd xmm3,L_tanf+008h ; xmm3 <-- c1
|
||||
mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2
|
||||
addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0
|
||||
|
||||
movsd xmm2,L_tanf+020h ; xmm2 <-- d2
|
||||
mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2
|
||||
addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1
|
||||
mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2
|
||||
addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0
|
||||
divsd xmm3,xmm2 ; xmm3 <-- c/d
|
||||
mulsd xmm1,xmm0 ; xmm1 <-- x^3
|
||||
mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d
|
||||
addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d
|
||||
cmp eax,01h
|
||||
jne Ltanf_sse2_exit_tanpiby4
|
||||
Ltanf_sse2_recip :
|
||||
movd xmm3,L_n_one
|
||||
divsd xmm3,xmm0
|
||||
movsd xmm0,xmm3
|
||||
Ltanf_sse2_exit_tanpiby4 :
|
||||
andpd xmm5,L_signbit
|
||||
xorpd xmm0,xmm5
|
||||
|
||||
Ltanf_sse2_return_s:
|
||||
cvtsd2ss xmm0,xmm0
|
||||
Ltanf_sse2_return_c:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Ltanf_sse2_naninf:
|
||||
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
ALIGN 16
|
||||
Ltanf_fma3:
|
||||
vmovd eax,xmm0
|
||||
mov r8d,L_inf_mask_32
|
||||
and eax,r8d
|
||||
cmp eax, r8d
|
||||
jz Ltanf_fma3_naninf
|
||||
|
||||
vcvtss2sd xmm5,xmm0,xmm0
|
||||
vmovq r9,xmm5
|
||||
btr r9,63 ; r9 <-- |x|
|
||||
|
||||
cmp r9,L_piby4
|
||||
jg Ltanf_fma3_range_reduce
|
||||
cmp r9,L_mask_3f2
|
||||
jge Ltanf_fma3_compute_tanf_piby_4
|
||||
cmp r9,L_mask_3e4
|
||||
jge Ltanf_fma3_compute_x_xxx_0_333
|
||||
jmp Ltanf_fma3_return_c
|
||||
|
||||
Ltanf_fma3_compute_x_xxx_0_333:
|
||||
vmulsd xmm2,xmm5,xmm5
|
||||
vmulsd xmm0,xmm2,xmm5
|
||||
vfmadd132sd xmm0,xmm5,L_point_333 ; x + x*x*x*0.3333333333333333;
|
||||
jmp Ltanf_fma3_return_s
|
||||
|
||||
Ltanf_fma3_compute_tanf_piby_4:
|
||||
vmovsd xmm0,xmm5,xmm5
|
||||
vmulsd xmm1,xmm0,xmm0
|
||||
vmovsd xmm3,L_tanf+008h
|
||||
vfmadd213sd xmm3,xmm1,L_tanf
|
||||
vmovsd xmm2,L_tanf+020h
|
||||
vfmadd213sd xmm2,xmm1,L_tanf+018h
|
||||
vfmadd213sd xmm2,xmm1,L_tanf+010h
|
||||
vdivsd xmm3,xmm3,xmm2
|
||||
vmulsd xmm1,xmm1,xmm0
|
||||
vfmadd231sd xmm0,xmm1,xmm3
|
||||
jmp Ltanf_fma3_return_s
|
||||
|
||||
Ltanf_fma3_range_reduce:
|
||||
vmovq xmm0,r9
|
||||
cmp r9,L_large_x_fma3
|
||||
jge Ltanf_fma3_tanf_reduce_large
|
||||
|
||||
Ltanf_fma3_tanf_reduce_moderate:
|
||||
vandpd xmm1,xmm0,L_sign_mask
|
||||
vmovapd xmm2,L_twobypi
|
||||
vfmadd213sd xmm2,xmm1,L_point_five
|
||||
vcvttpd2dq xmm2,xmm2
|
||||
vpmovsxdq xmm1,xmm2
|
||||
vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region
|
||||
vshufps xmm1 ,xmm1,xmm1,8
|
||||
vcvtdq2pd xmm1,xmm1
|
||||
vmovdqa xmm2,xmm0
|
||||
vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead
|
||||
vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail
|
||||
vsubsd xmm0,xmm2,xmm3
|
||||
vsubsd xmm2,xmm2,xmm0
|
||||
vsubsd xmm1,xmm2,xmm3
|
||||
jmp Ltanf_fma3_exit_s
|
||||
|
||||
Ltanf_fma3_tanf_reduce_large:
|
||||
lea r9,__L_2_by_pi_bits
|
||||
;xexp = (x >> 52) 1023
|
||||
vmovq r11,xmm0
|
||||
mov rcx,r11
|
||||
shr r11,52
|
||||
sub r11,1023 ; r11 <-- xexp = exponent of input x
|
||||
;calculate the last byte from which to start multiplication
|
||||
;last = 134 (xexp >> 3)
|
||||
mov r10,r11
|
||||
shr r10,3
|
||||
sub r10,134 ; r10 <-- -last
|
||||
neg r10 ; r10 <-- last
|
||||
;load 64 bits of 2_by_pi
|
||||
mov rax,[r9+r10]
|
||||
;mantissa of x = ((x << 12) >> 12) | implied bit
|
||||
shl rcx,12
|
||||
shr rcx,12 ; rcx <-- mantissa part of input x
|
||||
bts rcx,52 ; add the implied bit as well
|
||||
;load next 128 bits of 2_by_pi
|
||||
add r10,8 ; increment to next 8 bytes of 2_by_pi
|
||||
vmovdqu xmm0,XMMWORD PTR[r9+r10]
|
||||
;do three 64bit multiplications with mant of x
|
||||
mul rcx
|
||||
mov r8,rax ; r8 = last 64 bits of mul = res1[2]
|
||||
mov r10,rdx ; r10 = carry
|
||||
vmovq rax,xmm0
|
||||
mul rcx
|
||||
;resexp = xexp & 7
|
||||
and r11,7 ; r11 <-- resexp = last 3 bits of xexp
|
||||
vpsrldq xmm0,xmm0,8
|
||||
add rax,r10 ; add the previous carry
|
||||
adc rdx,0
|
||||
mov r9,rax ; r9 <-- next 64 bits of mul = res1[1]
|
||||
mov r10,rdx ; r10 <-- carry
|
||||
vmovq rax,xmm0
|
||||
mul rcx
|
||||
add r10,rax ;r10 = most sig 64 bits = res1[0]
|
||||
;find the region
|
||||
;last three bits ltb = most sig bits >> (54 resexp))
|
||||
; decimal point in last 18 bits == 8 lsb's in first 64 bits
|
||||
; and 8 msb's in next 64 bits
|
||||
;point_five = ltb & 01h;
|
||||
;region = ((ltb >> 1) + point_five) & 3;
|
||||
mov rcx,54
|
||||
mov rax,r10
|
||||
sub rcx,r11
|
||||
xor rdx,rdx ;rdx = sign of x
|
||||
shr rax,cl
|
||||
jnc Ltanf_fma3_no_point_five_f
|
||||
;;if there is carry.. then negate the result of multiplication
|
||||
not r10
|
||||
not r9
|
||||
not r8
|
||||
mov rdx,08000000000000000h
|
||||
ALIGN 16
|
||||
Ltanf_fma3_no_point_five_f:
|
||||
adc rax,0
|
||||
and rax,3
|
||||
vmovd xmm4,eax ; xmm4 <-- region
|
||||
;calculate the number of integer bits and zero them out
|
||||
mov rcx,r11
|
||||
add rcx,10 ; rcx = no. of integer bits
|
||||
shl r10,cl
|
||||
shr r10,cl ; r10 contains only mant bits
|
||||
sub rcx,64 ; form the exponent
|
||||
mov r11,rcx
|
||||
;find the highest set bit
|
||||
bsr rcx,r10
|
||||
jnz Ltanf_fma3_form_mantissa_f
|
||||
mov r10,r9
|
||||
mov r9,r8
|
||||
mov r8,0
|
||||
bsr rcx,r10 ;rcx = hsb
|
||||
sub r11,64
|
||||
ALIGN 16
|
||||
Ltanf_fma3_form_mantissa_f:
|
||||
add r11,rcx ; for exp of x
|
||||
sub rcx,52 ; rcx = no. of bits to shift in r10
|
||||
cmp rcx,0
|
||||
jl Ltanf_fma3_hsb_below_52_f
|
||||
je Ltanf_fma3_form_numbers_f
|
||||
;hsb above 52
|
||||
mov r8,r10
|
||||
shr r10,cl ; r10 = mantissa of x with hsb at 52
|
||||
shr r9,cl ; make space for bits from r10
|
||||
sub rcx,64
|
||||
neg rcx ; rcx = no of bits to shift r10
|
||||
shl r8,cl
|
||||
or r9,r8 ; r9 = mantissa bits of xx
|
||||
jmp Ltanf_fma3_form_numbers_f
|
||||
|
||||
ALIGN 16
|
||||
Ltanf_fma3_hsb_below_52_f:
|
||||
neg rcx
|
||||
mov rax,r9
|
||||
shl r10,cl
|
||||
shl r9,cl
|
||||
sub rcx,64
|
||||
neg rcx
|
||||
shr rax,cl
|
||||
or r10,rax
|
||||
shr r8,cl
|
||||
or r9,r8
|
||||
ALIGN 16
|
||||
Ltanf_fma3_form_numbers_f:
|
||||
add r11,1023
|
||||
btr r10,52 ; remove the implied bit
|
||||
mov rcx,r11
|
||||
or r10,rdx ; put the sign
|
||||
shl rcx,52
|
||||
or r10,rcx ; x is in r10
|
||||
vmovq xmm0,r10 ; xmm0 <-- x
|
||||
vmulsd xmm0,xmm0,L_piby2_lead
|
||||
|
||||
Ltanf_fma3_exit_s:
|
||||
vandpd xmm2,xmm4,XMMWORD PTR L_int_one
|
||||
vmovd eax,xmm2
|
||||
vmulsd xmm1,xmm0,xmm0
|
||||
vmovsd xmm3,L_tanf+008h
|
||||
vfmadd213sd xmm3,xmm1,L_tanf
|
||||
vmovsd xmm2,L_tanf+020h
|
||||
vfmadd213sd xmm2,xmm1,L_tanf+018h
|
||||
vfmadd213sd xmm2,xmm1,L_tanf+010h
|
||||
vdivsd xmm3,xmm3,xmm2
|
||||
vmulsd xmm1,xmm1,xmm0
|
||||
vfmadd231sd xmm0,xmm1,xmm3
|
||||
cmp eax,01h
|
||||
je Ltanf_fma3_recip
|
||||
jmp Ltanf_fma3_exit_tanpiby4
|
||||
|
||||
Ltanf_fma3_recip :
|
||||
vmovq xmm3,L_n_one
|
||||
vdivsd xmm0,xmm3,xmm0
|
||||
|
||||
Ltanf_fma3_exit_tanpiby4 :
|
||||
vandpd xmm5,xmm5,L_signbit
|
||||
vxorpd xmm0,xmm0,xmm5
|
||||
|
||||
Ltanf_fma3_return_s:
|
||||
vcvtsd2ss xmm0,xmm0,xmm0
|
||||
Ltanf_fma3_return_c:
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
Ltanf_fma3_naninf:
|
||||
call fname_special
|
||||
StackDeallocate stack_size
|
||||
ret
|
||||
|
||||
fname endp
|
||||
END
|
193
sdk/lib/crt/math/libm_sse2/tanf.c
Normal file
193
sdk/lib/crt/math/libm_sse2/tanf.c
Normal file
|
@ -0,0 +1,193 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_REMAINDER_PIBY2F_INLINE
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#define USE_NANF_WITH_FLAGS
|
||||
#define USE_HANDLE_ERRORF
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_NANF_WITH_FLAGS
|
||||
#undef USE_REMAINDER_PIBY2F_INLINE
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(tanf)
|
||||
|
||||
/* Rational approximation of tan(x) for x in [-pi/4, pi/4].
 *
 *   x     - reduced argument, expected to lie in [-pi/4, pi/4]
 *   recip - when non-zero, the function returns -1/tan(x) rather than tan(x)
 *
 * Returns the approximation as a double.
 */
static inline double tanf_piby4(double x, int recip)
{
  const double x2 = x*x;

  /* Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]:
     x + x^3 * P(x^2)/Q(x^2).  Kept as one expression so the order of the
     floating-point operations is exactly that of the reference formula. */
  const double approx = x + x*x2*
    (0.385296071263995406715129e0 -
     0.172032480471481694693109e-1 * x2) /
    (0.115588821434688393452299e+1 +
     (-0.51396505478854532132342e0 +
      0.1844239256901656082986661e-1 * x2) * x2);

  return recip ? -1.0 / approx : approx;
}
|
||||
|
||||
|
||||
float tanf(float x)
|
||||
{
|
||||
double r, dx;
|
||||
int region, xneg;
|
||||
|
||||
unsigned long ux, ax;
|
||||
|
||||
dx = x;
|
||||
|
||||
GET_BITS_DP64(dx, ux);
|
||||
ax = (ux & ~SIGNBIT_DP64);
|
||||
|
||||
if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
|
||||
{
|
||||
if (ax < 0x3f80000000000000) /* abs(x) < 2.0^(-7) */
|
||||
{
|
||||
if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
|
||||
{
|
||||
if (ax == 0x0000000000000000)
|
||||
return x;
|
||||
else
|
||||
return valf_with_flags(x, AMD_F_INEXACT);
|
||||
}
|
||||
else
|
||||
return (float)(dx + dx*dx*dx*0.333333333333333333);
|
||||
}
|
||||
else
|
||||
return (float)tanf_piby4(x, 0);
|
||||
}
|
||||
else if ((ux & EXPBITS_DP64) == EXPBITS_DP64)
|
||||
{
|
||||
/* x is either NaN or infinity */
|
||||
if (ux & MANTBITS_DP64)
|
||||
{
|
||||
/* x is NaN */
|
||||
unsigned int ufx;
|
||||
GET_BITS_SP32(x, ufx);
|
||||
return _handle_errorf("tanf", OP_TAN, ufx|0x00400000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0F, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* x is infinity. Return a NaN */
|
||||
return _handle_errorf("tanf", OP_TAN, INDEFBITPATT_SP32, _DOMAIN, AMD_F_INVALID,
|
||||
EDOM, x, 0.0F, 1);
|
||||
}
|
||||
}
|
||||
|
||||
xneg = (int)(ux >> 63);
|
||||
|
||||
if (xneg)
|
||||
dx = -dx;
|
||||
|
||||
if (dx < 5.0e5)
|
||||
{
|
||||
/* For these size arguments we can just carefully subtract the
|
||||
appropriate multiple of pi/2, using extra precision where
|
||||
dx is close to an exact multiple of pi/2 */
|
||||
static const double
|
||||
twobypi = 6.36619772367581382433e-01, /* 0x3fe45f306dc9c883 */
|
||||
piby2_1 = 1.57079632673412561417e+00, /* 0x3ff921fb54400000 */
|
||||
piby2_1tail = 6.07710050650619224932e-11, /* 0x3dd0b4611a626331 */
|
||||
piby2_2 = 6.07710050630396597660e-11, /* 0x3dd0b4611a600000 */
|
||||
piby2_2tail = 2.02226624879595063154e-21, /* 0x3ba3198a2e037073 */
|
||||
piby2_3 = 2.02226624871116645580e-21, /* 0x3ba3198a2e000000 */
|
||||
piby2_3tail = 8.47842766036889956997e-32; /* 0x397b839a252049c1 */
|
||||
double t, rhead, rtail;
|
||||
int npi2;
|
||||
unsigned long uy, xexp, expdiff;
|
||||
xexp = ax >> EXPSHIFTBITS_DP64;
|
||||
/* How many pi/2 is dx a multiple of? */
|
||||
if (ax <= 0x400f6a7a2955385e) /* 5pi/4 */
|
||||
{
|
||||
if (ax <= 0x4002d97c7f3321d2) /* 3pi/4 */
|
||||
npi2 = 1;
|
||||
else
|
||||
npi2 = 2;
|
||||
}
|
||||
else if (ax <= 0x401c463abeccb2bb) /* 9pi/4 */
|
||||
{
|
||||
if (ax <= 0x4015fdbbe9bba775) /* 7pi/4 */
|
||||
npi2 = 3;
|
||||
else
|
||||
npi2 = 4;
|
||||
}
|
||||
else
|
||||
npi2 = (int)(dx * twobypi + 0.5);
|
||||
/* Subtract the multiple from dx to get an extra-precision remainder */
|
||||
rhead = dx - npi2 * piby2_1;
|
||||
rtail = npi2 * piby2_1tail;
|
||||
GET_BITS_DP64(rhead, uy);
|
||||
expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
|
||||
if (expdiff > 15)
|
||||
{
|
||||
/* The remainder is pretty small compared with dx, which
|
||||
implies that dx is a near multiple of pi/2
|
||||
(dx matches the multiple to at least 15 bits) */
|
||||
t = rhead;
|
||||
rtail = npi2 * piby2_2;
|
||||
rhead = t - rtail;
|
||||
rtail = npi2 * piby2_2tail - ((t - rhead) - rtail);
|
||||
if (expdiff > 48)
|
||||
{
|
||||
/* dx matches a pi/2 multiple to at least 48 bits */
|
||||
t = rhead;
|
||||
rtail = npi2 * piby2_3;
|
||||
rhead = t - rtail;
|
||||
rtail = npi2 * piby2_3tail - ((t - rhead) - rtail);
|
||||
}
|
||||
}
|
||||
r = rhead - rtail;
|
||||
region = npi2 & 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Reduce x into range [-pi/4,pi/4] */
|
||||
__remainder_piby2f_inline(ax, &r, ®ion);
|
||||
}
|
||||
|
||||
if (xneg)
|
||||
return (float)-tanf_piby4(r, region & 1);
|
||||
else
|
||||
return (float)tanf_piby4(r, region & 1);
|
||||
}
|
137
sdk/lib/crt/math/libm_sse2/tanh.c
Normal file
137
sdk/lib/crt/math/libm_sse2/tanh.c
Normal file
|
@ -0,0 +1,137 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_HANDLE_ERROR
|
||||
#define USE_SPLITEXP
|
||||
#define USE_SCALEDOUBLE_2
|
||||
#define USE_VAL_WITH_FLAGS
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_SPLITEXP
|
||||
#undef USE_SCALEDOUBLE_2
|
||||
#undef USE_VAL_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERROR
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
|
||||
#pragma function(tanh)
|
||||
double tanh(double x)
|
||||
{
|
||||
/*
|
||||
The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
|
||||
to the following three formulae:
|
||||
1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
|
||||
2. (1 - (2/(exp(2*x) + 1 )))
|
||||
3. (exp(2*x) - 1)/(exp(2*x) + 1)
|
||||
but computationally, some formulae are better on some ranges.
|
||||
*/
|
||||
static const double
|
||||
thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
|
||||
log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
|
||||
log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
|
||||
large_threshold = 20.0; /* 0x4034000000000000 */
|
||||
|
||||
unsigned long ux, aux, xneg;
|
||||
double y, z, p, z1, z2;
|
||||
int m;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
GET_BITS_DP64(x, ux);
|
||||
aux = ux & ~SIGNBIT_DP64;
|
||||
if (aux < 0x3e30000000000000) /* |x| small enough that tanh(x) = x */
|
||||
{
|
||||
if (aux == 0)
|
||||
return x; /* with no inexact */
|
||||
else
|
||||
return val_with_flags(x, AMD_F_INEXACT);
|
||||
}
|
||||
else if (aux > 0x7ff0000000000000) /* |x| is NaN */
|
||||
return _handle_error("tanh", OP_TANH, ux|0x0008000000000000, _DOMAIN,
|
||||
0, EDOM, x, 0.0, 1);
|
||||
// return x + x;
|
||||
|
||||
xneg = (aux != ux);
|
||||
|
||||
y = x;
|
||||
if (xneg) y = -x;
|
||||
|
||||
if (y > large_threshold)
|
||||
{
|
||||
/* If x is large then exp(-x) is negligible and
|
||||
formula 1 reduces to plus or minus 1.0 */
|
||||
z = 1.0;
|
||||
}
|
||||
else if (y <= 1.0)
|
||||
{
|
||||
double y2;
|
||||
y2 = y*y;
|
||||
if (y < 0.9)
|
||||
{
|
||||
/* Use a [3,3] Remez approximation on [0,0.9]. */
|
||||
z = y + y*y2*
|
||||
(-0.274030424656179760118928e0 +
|
||||
(-0.176016349003044679402273e-1 +
|
||||
(-0.200047621071909498730453e-3 -
|
||||
0.142077926378834722618091e-7*y2)*y2)*y2)/
|
||||
(0.822091273968539282568011e0 +
|
||||
(0.381641414288328849317962e0 +
|
||||
(0.201562166026937652780575e-1 +
|
||||
0.2091140262529164482568557e-3*y2)*y2)*y2);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Use a [3,3] Remez approximation on [0.9,1]. */
|
||||
z = y + y*y2*
|
||||
(-0.227793870659088295252442e0 +
|
||||
(-0.146173047288731678404066e-1 +
|
||||
(-0.165597043903549960486816e-3 -
|
||||
0.115475878996143396378318e-7*y2)*y2)*y2)/
|
||||
(0.683381611977295894959554e0 +
|
||||
(0.317204558977294374244770e0 +
|
||||
(0.167358775461896562588695e-1 +
|
||||
0.173076050126225961768710e-3*y2)*y2)*y2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Compute p = exp(2*y) + 1. The code is basically inlined
|
||||
from exp_amd. */
|
||||
|
||||
splitexp(2*y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
|
||||
log2_by_32_tail, &m, &z1, &z2);
|
||||
p = scaleDouble_2(z1 + z2, m) + 1.0;
|
||||
|
||||
/* Now reconstruct tanh from p. */
|
||||
z = (1.0 - 2.0/p);
|
||||
}
|
||||
|
||||
if (xneg) z = - z;
|
||||
return z;
|
||||
}
|
136
sdk/lib/crt/math/libm_sse2/tanhf.c
Normal file
136
sdk/lib/crt/math/libm_sse2/tanhf.c
Normal file
|
@ -0,0 +1,136 @@
|
|||
|
||||
/*******************************************************************************
|
||||
MIT License
|
||||
-----------
|
||||
|
||||
Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this Software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "libm.h"
|
||||
#include "libm_util.h"
|
||||
|
||||
#define USE_HANDLE_ERRORF
|
||||
#define USE_SPLITEXPF
|
||||
#define USE_SCALEFLOAT_2
|
||||
#define USE_VALF_WITH_FLAGS
|
||||
#include "libm_inlines.h"
|
||||
#undef USE_SPLITEXPF
|
||||
#undef USE_SCALEFLOAT_2
|
||||
#undef USE_VALF_WITH_FLAGS
|
||||
#undef USE_HANDLE_ERRORF
|
||||
|
||||
#include "libm_errno.h"
|
||||
|
||||
// Disable "C4163: not available as intrinsic function" warning that older
|
||||
// compilers may issue here.
|
||||
#pragma warning(disable:4163)
|
||||
#pragma function(tanhf)
|
||||
|
||||
float tanhf(float x)
|
||||
{
|
||||
/*
|
||||
The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
|
||||
to the following three formulae:
|
||||
1. (exp(x) - exp(-x))/(exp(x) + exp(-x))
|
||||
2. (1 - (2/(exp(2*x) + 1 )))
|
||||
3. (exp(2*x) - 1)/(exp(2*x) + 1)
|
||||
but computationally, some formulae are better on some ranges.
|
||||
*/
|
||||
static const float
|
||||
thirtytwo_by_log2 = 4.6166240692e+01F, /* 0x4238aa3b */
|
||||
log2_by_32_lead = 2.1659851074e-02F, /* 0x3cb17000 */
|
||||
log2_by_32_tail = 9.9831822808e-07F, /* 0x3585fdf4 */
|
||||
large_threshold = 10.0F; /* 0x41200000 */
|
||||
|
||||
unsigned int ux, aux;
|
||||
float y, z, p, z1, z2, xneg;
|
||||
int m;
|
||||
|
||||
/* Special cases */
|
||||
|
||||
GET_BITS_SP32(x, ux);
|
||||
aux = ux & ~SIGNBIT_SP32;
|
||||
if (aux < 0x39000000) /* |x| small enough that tanh(x) = x */
|
||||
{
|
||||
if (aux == 0)
|
||||
return x; /* with no inexact */
|
||||
else
|
||||
return valf_with_flags(x, AMD_F_INEXACT);
|
||||
}
|
||||
else if (aux > 0x7f800000) /* |x| is NaN */
|
||||
{
|
||||
unsigned int ufx;
|
||||
GET_BITS_SP32(x, ufx);
|
||||
return _handle_errorf("tanhf", OP_TANH, ufx|0x00400000, _DOMAIN, 0,
|
||||
EDOM, x, 0.0F, 1);
|
||||
}
|
||||
// return x + x;
|
||||
|
||||
xneg = 1.0F - 2.0F * (aux != ux);
|
||||
|
||||
y = xneg * x;
|
||||
|
||||
if (y > large_threshold)
|
||||
{
|
||||
/* If x is large then exp(-x) is negligible and
|
||||
formula 1 reduces to plus or minus 1.0 */
|
||||
z = 1.0F;
|
||||
}
|
||||
else if (y <= 1.0F)
|
||||
{
|
||||
float y2;
|
||||
y2 = y*y;
|
||||
|
||||
if (y < 0.9F)
|
||||
{
|
||||
/* Use a [2,1] Remez approximation on [0,0.9]. */
|
||||
z = y + y*y2*
|
||||
(-0.28192806108402678e0F +
|
||||
(-0.14628356048797849e-2F +
|
||||
0.4891631088530669873e-4F*y2)*y2)/
|
||||
(0.845784192581041099e0F +
|
||||
0.3427017942262751343e0F*y2);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Use a [2,1] Remez approximation on [0.9,1]. */
|
||||
z = y + y*y2*
|
||||
(-0.24069858695196524e0F +
|
||||
(-0.12325644183611929e-2F +
|
||||
0.3827534993599483396e-4F*y2)*y2)/
|
||||
(0.72209738473684982e0F +
|
||||
0.292529068698052819e0F*y2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Compute p = exp(2*y) + 1. The code is basically inlined
|
||||
from exp_amd. */
|
||||
|
||||
splitexpf(2*y, 1.0F, thirtytwo_by_log2, log2_by_32_lead,
|
||||
log2_by_32_tail, &m, &z1, &z2);
|
||||
p = scaleFloat_2(z1 + z2, m) + 1.0F;
|
||||
/* Now reconstruct tanh from p. */
|
||||
z = (1.0F - 2.0F/p);
|
||||
}
|
||||
|
||||
return xneg * z;
|
||||
}
|
165
sdk/lib/crt/math/libm_sse2/two_to_jby64_head_tail_table.asm
Normal file
165
sdk/lib/crt/math/libm_sse2/two_to_jby64_head_tail_table.asm
Normal file
|
@ -0,0 +1,165 @@
|
|||
;;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines __two_to_jby64_head_table and __two_to_jby64_tail_table tables
|
||||
;; Used in exp and pow
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __two_to_jby64_head_table
|
||||
__two_to_jby64_head_table DQ 3ff0000000000000h
|
||||
DQ 3ff02c9a30000000h
|
||||
DQ 3ff059b0d0000000h
|
||||
DQ 3ff0874510000000h
|
||||
DQ 3ff0b55860000000h
|
||||
DQ 3ff0e3ec30000000h
|
||||
DQ 3ff11301d0000000h
|
||||
DQ 3ff1429aa0000000h
|
||||
DQ 3ff172b830000000h
|
||||
DQ 3ff1a35be0000000h
|
||||
DQ 3ff1d48730000000h
|
||||
DQ 3ff2063b80000000h
|
||||
DQ 3ff2387a60000000h
|
||||
DQ 3ff26b4560000000h
|
||||
DQ 3ff29e9df0000000h
|
||||
DQ 3ff2d285a0000000h
|
||||
DQ 3ff306fe00000000h
|
||||
DQ 3ff33c08b0000000h
|
||||
DQ 3ff371a730000000h
|
||||
DQ 3ff3a7db30000000h
|
||||
DQ 3ff3dea640000000h
|
||||
DQ 3ff4160a20000000h
|
||||
DQ 3ff44e0860000000h
|
||||
DQ 3ff486a2b0000000h
|
||||
DQ 3ff4bfdad0000000h
|
||||
DQ 3ff4f9b270000000h
|
||||
DQ 3ff5342b50000000h
|
||||
DQ 3ff56f4730000000h
|
||||
DQ 3ff5ab07d0000000h
|
||||
DQ 3ff5e76f10000000h
|
||||
DQ 3ff6247eb0000000h
|
||||
DQ 3ff6623880000000h
|
||||
DQ 3ff6a09e60000000h
|
||||
DQ 3ff6dfb230000000h
|
||||
DQ 3ff71f75e0000000h
|
||||
DQ 3ff75feb50000000h
|
||||
DQ 3ff7a11470000000h
|
||||
DQ 3ff7e2f330000000h
|
||||
DQ 3ff8258990000000h
|
||||
DQ 3ff868d990000000h
|
||||
DQ 3ff8ace540000000h
|
||||
DQ 3ff8f1ae90000000h
|
||||
DQ 3ff93737b0000000h
|
||||
DQ 3ff97d8290000000h
|
||||
DQ 3ff9c49180000000h
|
||||
DQ 3ffa0c6670000000h
|
||||
DQ 3ffa5503b0000000h
|
||||
DQ 3ffa9e6b50000000h
|
||||
DQ 3ffae89f90000000h
|
||||
DQ 3ffb33a2b0000000h
|
||||
DQ 3ffb7f76f0000000h
|
||||
DQ 3ffbcc1e90000000h
|
||||
DQ 3ffc199bd0000000h
|
||||
DQ 3ffc67f120000000h
|
||||
DQ 3ffcb720d0000000h
|
||||
DQ 3ffd072d40000000h
|
||||
DQ 3ffd5818d0000000h
|
||||
DQ 3ffda9e600000000h
|
||||
DQ 3ffdfc9730000000h
|
||||
DQ 3ffe502ee0000000h
|
||||
DQ 3ffea4afa0000000h
|
||||
DQ 3ffefa1be0000000h
|
||||
DQ 3fff507650000000h
|
||||
DQ 3fffa7c180000000h
|
||||
|
||||
ALIGN 16
|
||||
PUBLIC __two_to_jby64_tail_table
|
||||
__two_to_jby64_tail_table DQ 0000000000000000h
|
||||
DQ 3e6cef00c1dcdef9h
|
||||
DQ 3e48ac2ba1d73e2ah
|
||||
DQ 3e60eb37901186beh
|
||||
DQ 3e69f3121ec53172h
|
||||
DQ 3e469e8d10103a17h
|
||||
DQ 3df25b50a4ebbf1ah
|
||||
DQ 3e6d525bbf668203h
|
||||
DQ 3e68faa2f5b9bef9h
|
||||
DQ 3e66df96ea796d31h
|
||||
DQ 3e368b9aa7805b80h
|
||||
DQ 3e60c519ac771dd6h
|
||||
DQ 3e6ceac470cd83f5h
|
||||
DQ 3e5789f37495e99ch
|
||||
DQ 3e547f7b84b09745h
|
||||
DQ 3e5b900c2d002475h
|
||||
DQ 3e64636e2a5bd1abh
|
||||
DQ 3e4320b7fa64e430h
|
||||
DQ 3e5ceaa72a9c5154h
|
||||
DQ 3e53967fdba86f24h
|
||||
DQ 3e682468446b6824h
|
||||
DQ 3e3f72e29f84325bh
|
||||
DQ 3e18624b40c4dbd0h
|
||||
DQ 3e5704f3404f068eh
|
||||
DQ 3e54d8a89c750e5eh
|
||||
DQ 3e5a74b29ab4cf62h
|
||||
DQ 3e5a753e077c2a0fh
|
||||
DQ 3e5ad49f699bb2c0h
|
||||
DQ 3e6a90a852b19260h
|
||||
DQ 3e56b48521ba6f93h
|
||||
DQ 3e0d2ac258f87d03h
|
||||
DQ 3e42a91124893ecfh
|
||||
DQ 3e59fcef32422cbeh
|
||||
DQ 3e68ca345de441c5h
|
||||
DQ 3e61d8bee7ba46e1h
|
||||
DQ 3e59099f22fdba6ah
|
||||
DQ 3e4f580c36bea881h
|
||||
DQ 3e5b3d398841740ah
|
||||
DQ 3e62999c25159f11h
|
||||
DQ 3e668925d901c83bh
|
||||
DQ 3e415506dadd3e2ah
|
||||
DQ 3e622aee6c57304eh
|
||||
DQ 3e29b8bc9e8a0387h
|
||||
DQ 3e6fbc9c9f173d24h
|
||||
DQ 3e451f8480e3e235h
|
||||
DQ 3e66bbcac96535b5h
|
||||
DQ 3e41f12ae45a1224h
|
||||
DQ 3e55e7f6fd0fac90h
|
||||
DQ 3e62b5a75abd0e69h
|
||||
DQ 3e609e2bf5ed7fa1h
|
||||
DQ 3e47daf237553d84h
|
||||
DQ 3e12f074891ee83dh
|
||||
DQ 3e6b0aa538444196h
|
||||
DQ 3e6cafa29694426fh
|
||||
DQ 3e69df20d22a0797h
|
||||
DQ 3e640f12f71a1e45h
|
||||
DQ 3e69f7490e4bb40bh
|
||||
DQ 3e4ed9942b84600dh
|
||||
DQ 3e4bdcdaf5cb4656h
|
||||
DQ 3e5e2cffd89cf44ch
|
||||
DQ 3e452486cc2c7b9dh
|
||||
DQ 3e6cc2b44eee3fa4h
|
||||
DQ 3e66dc8a80ce9f09h
|
||||
DQ 3e39e90d82e90a7eh
|
||||
END
|
99
sdk/lib/crt/math/libm_sse2/two_to_jby64_table.asm
Normal file
99
sdk/lib/crt/math/libm_sse2/two_to_jby64_table.asm
Normal file
|
@ -0,0 +1,99 @@
|
|||
;;
|
||||
;
|
||||
; MIT License
|
||||
; -----------
|
||||
;
|
||||
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
|
||||
;
|
||||
; Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
; of this Software and associated documentation files (the "Software"), to deal
|
||||
; in the Software without restriction, including without limitation the rights
|
||||
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
; copies of the Software, and to permit persons to whom the Software is
|
||||
; furnished to do so, subject to the following conditions:
|
||||
;
|
||||
; The above copyright notice and this permission notice shall be included in
|
||||
; all copies or substantial portions of the Software.
|
||||
;
|
||||
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
; THE SOFTWARE.
|
||||
;
|
||||
;; Defines the __two_to_jby64_table lookup table
|
||||
;; Used by exp and expf
|
||||
;;
|
||||
|
||||
.const
|
||||
|
||||
ALIGN 16
|
||||
;; __two_to_jby64_table: 64 quadword (DQ) entries, exported via PUBLIC and
;; placed on a 16-byte boundary by the ALIGN 16 above.
;; Each entry is written as the raw bit pattern of an IEEE-754 double.
;; All entries carry exponent field 3ffh, so every value lies in [1.0, 2.0),
;; and the values increase monotonically with the index.
;; Entry j holds 2^(j/64) in double precision for j = 0..63: entry 0 is
;; exactly 1.0 and entry 32 is sqrt(2).
;; Entry order is significant (entry j lives at byte offset 8*j).
PUBLIC __two_to_jby64_table
|
||||
__two_to_jby64_table DQ 3ff0000000000000h ; j = 0: 1.0
|
||||
DQ 3ff02c9a3e778061h
|
||||
DQ 3ff059b0d3158574h
|
||||
DQ 3ff0874518759bc8h
|
||||
DQ 3ff0b5586cf9890fh
|
||||
DQ 3ff0e3ec32d3d1a2h
|
||||
DQ 3ff11301d0125b51h
|
||||
DQ 3ff1429aaea92de0h
|
||||
DQ 3ff172b83c7d517bh
|
||||
DQ 3ff1a35beb6fcb75h
|
||||
DQ 3ff1d4873168b9aah
|
||||
DQ 3ff2063b88628cd6h
|
||||
DQ 3ff2387a6e756238h
|
||||
DQ 3ff26b4565e27cddh
|
||||
DQ 3ff29e9df51fdee1h
|
||||
DQ 3ff2d285a6e4030bh
|
||||
DQ 3ff306fe0a31b715h
|
||||
DQ 3ff33c08b26416ffh
|
||||
DQ 3ff371a7373aa9cbh
|
||||
DQ 3ff3a7db34e59ff7h
|
||||
DQ 3ff3dea64c123422h
|
||||
DQ 3ff4160a21f72e2ah
|
||||
DQ 3ff44e086061892dh
|
||||
DQ 3ff486a2b5c13cd0h
|
||||
DQ 3ff4bfdad5362a27h
|
||||
DQ 3ff4f9b2769d2ca7h
|
||||
DQ 3ff5342b569d4f82h
|
||||
DQ 3ff56f4736b527dah
|
||||
DQ 3ff5ab07dd485429h
|
||||
DQ 3ff5e76f15ad2148h
|
||||
DQ 3ff6247eb03a5585h
|
||||
DQ 3ff6623882552225h
|
||||
DQ 3ff6a09e667f3bcdh ; j = 32: 2^(1/2) = sqrt(2)
|
||||
DQ 3ff6dfb23c651a2fh
|
||||
DQ 3ff71f75e8ec5f74h
|
||||
DQ 3ff75feb564267c9h
|
||||
DQ 3ff7a11473eb0187h
|
||||
DQ 3ff7e2f336cf4e62h
|
||||
DQ 3ff82589994cce13h
|
||||
DQ 3ff868d99b4492edh
|
||||
DQ 3ff8ace5422aa0dbh
|
||||
DQ 3ff8f1ae99157736h
|
||||
DQ 3ff93737b0cdc5e5h
|
||||
DQ 3ff97d829fde4e50h
|
||||
DQ 3ff9c49182a3f090h
|
||||
DQ 3ffa0c667b5de565h
|
||||
DQ 3ffa5503b23e255dh
|
||||
DQ 3ffa9e6b5579fdbfh
|
||||
DQ 3ffae89f995ad3adh
|
||||
DQ 3ffb33a2b84f15fbh
|
||||
DQ 3ffb7f76f2fb5e47h
|
||||
DQ 3ffbcc1e904bc1d2h
|
||||
DQ 3ffc199bdd85529ch
|
||||
DQ 3ffc67f12e57d14bh
|
||||
DQ 3ffcb720dcef9069h
|
||||
DQ 3ffd072d4a07897ch
|
||||
DQ 3ffd5818dcfba487h
|
||||
DQ 3ffda9e603db3285h
|
||||
DQ 3ffdfc97337b9b5fh
|
||||
DQ 3ffe502ee78b3ff6h
|
||||
DQ 3ffea4afa2a490dah
|
||||
DQ 3ffefa1bee615a27h
|
||||
DQ 3fff50765b6e4540h
|
||||
DQ 3fffa7c1819e90d8h ; j = 63: 2^(63/64), last entry
|
||||
|
||||
END
|
Loading…
Reference in a new issue