diff --git a/sdk/lib/crt/math/libm_sse2/L2_by_pi_bits.asm b/sdk/lib/crt/math/libm_sse2/L2_by_pi_bits.asm new file mode 100644 index 00000000000..b267015fb75 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/L2_by_pi_bits.asm @@ -0,0 +1,54 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +;; +;; Defines __L_2_by_pi_bits array +;; Used in trigonometric argument reduction +;; + +.const + +ALIGN 16 +PUBLIC __L_2_by_pi_bits +__L_2_by_pi_bits DB 224, 241, 27, 193, 12, 88, 33, 116 + DB 53, 126, 196, 126, 237, 175, 169, 75 + DB 74, 41, 222, 231, 28, 244, 236, 197 + DB 151, 175, 31, 235, 158, 212, 181, 168 + DB 127, 121, 154, 253, 24, 61, 221, 38 + DB 44, 159, 60, 251, 217, 180, 125, 180 + DB 41, 104, 45, 70, 188, 188, 63, 96 + DB 22, 120, 255, 95, 226, 127, 236, 160 + DB 228, 247, 46, 126, 17, 114, 210, 231 + DB 76, 13, 230, 88, 71, 230, 4, 249 + DB 125, 209, 154, 192, 113, 166, 19, 18 + DB 237, 186, 212, 215, 8, 162, 251, 156 + DB 166, 196, 114, 172, 119, 248, 115, 72 + DB 70, 39, 168, 187, 36, 25, 128, 75 + DB 55, 9, 233, 184, 145, 220, 134, 21 + DB 239, 122, 175, 142, 69, 249, 7, 65 + DB 14, 241, 100, 86, 138, 109, 3, 119 + DB 211, 212, 71, 95, 157, 240, 167, 84 + DB 16, 57, 185, 13, 230, 139, 2, 0 + DB 0, 0, 0, 0, 0, 0 +END diff --git a/sdk/lib/crt/math/libm_sse2/Lsincos_array.asm b/sdk/lib/crt/math/libm_sse2/Lsincos_array.asm new file mode 100644 index 00000000000..03f32d08e92 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/Lsincos_array.asm @@ -0,0 +1,62 @@ +;; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +;; Defines __Lcosarray and __Lsinarray arrays. +;; Used in sin.asm and cos.asm +;; + +.const + +ALIGN 16 +PUBLIC __Lcosarray +__Lcosarray DQ 03fa5555555555555h ; 0.0416667 c1 + DQ 0 + DQ 0bf56c16c16c16967h ; -0.00138889 c2 + DQ 0 + DQ 03EFA01A019F4EC91h ; 2.48016e-005 c3 + DQ 0 + DQ 0bE927E4FA17F667Bh ; -2.75573e-007 c4 + DQ 0 + DQ 03E21EEB690382EECh ; 2.08761e-009 c5 + DQ 0 + DQ 0bDA907DB47258AA7h ; -1.13826e-011 c6 + DQ 0 + +ALIGN 16 +PUBLIC __Lsinarray +__Lsinarray DQ 0bfc5555555555555h ; -0.166667 s1 + DQ 0 + DQ 03f81111111110bb3h ; 0.00833333 s2 + DQ 0 + DQ 0bf2a01a019e83e5ch ; -0.000198413 s3 + DQ 0 + DQ 03ec71de3796cde01h ; 2.75573e-006 s4 + DQ 0 + DQ 0be5ae600b42fdfa7h ; -2.50511e-008 s5 + DQ 0 + DQ 03de5e0b2f9a43bb8h ; 1.59181e-010 s6 + DQ 0 + +END diff --git a/sdk/lib/crt/math/libm_sse2/Lsincosf_array.asm b/sdk/lib/crt/math/libm_sse2/Lsincosf_array.asm new file mode 100644 index 00000000000..871b9ce8410 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/Lsincosf_array.asm @@ -0,0 +1,48 @@ +;; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +;; Defines __Lcosarray and __Lsinarray arrays. +;; Used in sin.asm and cos.asm +;; These coefficients are actually from Taylor series. +;; + +.const + +ALIGN 16 +PUBLIC __Lcosfarray +__Lcosfarray DQ 0bfe0000000000000h ; -0.5 c0 + DQ 03fa5555555555555h ; 0.0416667 c1 + DQ 0bf56c16c16c16c16h ; -0.00138889 c2 + DQ 03EFA01A01A01A019h ; 2.48016e-005 c3 + DQ 0be927e4fb7789f5ch ; -2.75573e-007 c4 + +ALIGN 16 +PUBLIC __Lsinfarray +__Lsinfarray DQ 0bfc5555555555555h ; -0.166667 s1 + DQ 03f81111111111111h ; 0.00833333 s2 + DQ 0bf2a01a01a01a01ah ; -0.000198413 s3 + DQ 03ec71de3a556c734h ; 2.75573e-006 s4 + +END diff --git a/sdk/lib/crt/math/libm_sse2/_chgsign.c b/sdk/lib/crt/math/libm_sse2/_chgsign.c new file mode 100644 index 00000000000..f22ce58a743 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/_chgsign.c @@ -0,0 +1,41 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
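The coefficient tables above hold truncated Taylor coefficients of sin and cos about zero (s1 = -1/6, s2 = 1/120, s3 = -1/5040, s4 = 1/362880, and likewise c0..c4 for cos), one double per term. A minimal sketch of how such a table is typically consumed after argument reduction, assuming the usual odd-polynomial form for sin; the real evaluation scheme lives in sin.asm and cos.asm, which are not part of this hunk:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        // Taylor coefficients matching the __Lsinfarray comments above
        const double s1 = -1.0 / 6, s2 = 1.0 / 120, s3 = -1.0 / 5040, s4 = 1.0 / 362880;
        double r = 0.3;                 // r: argument already reduced to a small range
        double r2 = r * r;
        double p = r + r * r2 * (s1 + r2 * (s2 + r2 * (s3 + r2 * s4)));
        printf("poly = %.9f\nsin  = %.9f\n", p, sin(r));
        return 0;
    }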
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +double FN_PROTOTYPE(_chgsign)(double x) +{ + /* Returns x with its sign reversed. + NaNs are not considered special; their sign bits are handled + the same as for any other number */ + unsigned long u; + GET_BITS_DP64(x, u); + u ^= SIGNBIT_DP64; + PUT_BITS_DP64(u, x); + return x; +} + diff --git a/sdk/lib/crt/math/libm_sse2/_chgsignf.c b/sdk/lib/crt/math/libm_sse2/_chgsignf.c new file mode 100644 index 00000000000..1996aa5af96 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/_chgsignf.c @@ -0,0 +1,40 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +float FN_PROTOTYPE(_chgsignf)(float x) +{ + /* Returns x with its sign reversed. 
+ NaNs are not considered special; their sign bits are handled + the same as for any other number */ + unsigned int u; + GET_BITS_SP32(x, u); + u ^= SIGNBIT_SP32; + PUT_BITS_SP32(u, x); + return x; +} diff --git a/sdk/lib/crt/math/libm_sse2/_copysign.c b/sdk/lib/crt/math/libm_sse2/_copysign.c new file mode 100644 index 00000000000..c3944276567 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/_copysign.c @@ -0,0 +1,44 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +/* Returns the absolute value of x with the sign of y. + NaNs are not considered special; their sign bits are handled + the same as for any other number. */ + +double FN_PROTOTYPE(_copysign)(double x, double y) +{ + + unsigned long ux, uy; + GET_BITS_DP64(x, ux); + GET_BITS_DP64(y, uy); + if ((ux ^ uy) & SIGNBIT_DP64) + PUT_BITS_DP64(ux ^ SIGNBIT_DP64, x); + return x; + +} diff --git a/sdk/lib/crt/math/libm_sse2/_copysignf.c b/sdk/lib/crt/math/libm_sse2/_copysignf.c new file mode 100644 index 00000000000..874f00ca0c0 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/_copysignf.c @@ -0,0 +1,42 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
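Both _chgsignf above and _copysign work purely on the IEEE-754 bit pattern: the value is copied into an integer, the sign bit (the topmost bit) is XORed, and the result is copied back, so NaNs and infinities need no special casing. A minimal portable sketch of the same idea, assuming C99 and IEEE-754 doubles, with memcpy standing in for the GET_BITS_DP64 and PUT_BITS_DP64 macros (the helper name is illustrative only):

    #include <stdint.h>
    #include <string.h>

    static double copysign_sketch(double x, double y)
    {
        const uint64_t signbit = 0x8000000000000000ULL;   // SIGNBIT_DP64
        uint64_t ux, uy;
        memcpy(&ux, &x, sizeof ux);                       // GET_BITS_DP64(x, ux)
        memcpy(&uy, &y, sizeof uy);                       // GET_BITS_DP64(y, uy)
        if ((ux ^ uy) & signbit) {                        // signs differ
            ux ^= signbit;                                // give x the sign of y
            memcpy(&x, &ux, sizeof x);                    // PUT_BITS_DP64(ux, x)
        }
        return x;
    }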
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + + /* Returns the absolute value of x with the sign of y. + NaNs are not considered special; their sign bits are handled + the same as for any other number. */ + +float FN_PROTOTYPE(_copysignf)(float x, float y) +{ + unsigned int ux, uy; + GET_BITS_SP32(x, ux); + GET_BITS_SP32(y, uy); + if ((ux ^ uy) & SIGNBIT_SP32) + PUT_BITS_SP32(ux ^ SIGNBIT_SP32, x); + return x; +} diff --git a/sdk/lib/crt/math/libm_sse2/_finite.c b/sdk/lib/crt/math/libm_sse2/_finite.c new file mode 100644 index 00000000000..c3ca86f4b05 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/_finite.c @@ -0,0 +1,39 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +/* Returns 0 if x is infinite or NaN, otherwise returns 1 */ + +int FN_PROTOTYPE(_finite)(double x) +{ + + + unsigned long ux; + GET_BITS_DP64(x, ux); + return (int)(((ux & ~SIGNBIT_DP64) - PINFBITPATT_DP64) >> 63); +} diff --git a/sdk/lib/crt/math/libm_sse2/_finitef.c b/sdk/lib/crt/math/libm_sse2/_finitef.c new file mode 100644 index 00000000000..3fbfbc7c2e6 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/_finitef.c @@ -0,0 +1,40 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
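The _finite test above relies on the layout of IEEE-754 doubles: with the sign bit cleared, every finite value compares below the +infinity pattern 0x7ff0000000000000, so the unsigned subtraction wraps around and sets bit 63, while infinities and NaNs leave it clear. A stand-alone sketch of the same test, assuming C99 (names are illustrative, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    static int finite_sketch(double x)
    {
        uint64_t ux;
        memcpy(&ux, &x, sizeof ux);               // GET_BITS_DP64(x, ux)
        ux &= ~0x8000000000000000ULL;             // clear the sign: bit pattern of |x|
        // finite:  |x| bits <  0x7ff0000000000000  -> subtraction wraps, bit 63 set -> 1
        // inf/NaN: |x| bits >= 0x7ff0000000000000  -> small result, bit 63 clear    -> 0
        return (int)((ux - 0x7ff0000000000000ULL) >> 63);
    }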
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +/* Returns 0 if x is infinite or NaN, otherwise returns 1 */ + +int FN_PROTOTYPE(_finitef)(float x) +{ + + + unsigned int ux; + GET_BITS_SP32(x, ux); + return (int)(((ux & ~SIGNBIT_SP32) - PINFBITPATT_SP32) >> 31); + +} diff --git a/sdk/lib/crt/math/libm_sse2/acos.c b/sdk/lib/crt/math/libm_sse2/acos.c new file mode 100644 index 00000000000..cb46803e536 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/acos.c @@ -0,0 +1,145 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VAL_WITH_FLAGS +#define USE_NAN_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_NAN_WITH_FLAGS +#undef USE_VAL_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + + +#pragma function(acos) + +double FN_PROTOTYPE(acos)(double x) +{ + /* Computes arccos(x). + The argument is first reduced by noting that arccos(x) + is invalid for abs(x) > 1. For denormal and small + arguments arccos(x) = pi/2 to machine accuracy. + Remaining argument ranges are handled as follows. + For abs(x) <= 0.5 use + arccos(x) = pi/2 - arcsin(x) + = pi/2 - (x + x^3*R(x^2)) + where R(x^2) is a rational minimax approximation to + (arcsin(x) - x)/x^3. + For abs(x) > 0.5 exploit the identity: + arccos(x) = pi - 2*arcsin(sqrt(1-x)/2) + together with the above rational approximation, and + reconstruct the terms carefully. + */ + + /* Some constants and split constants. 
*/ + + static const double + pi = 3.1415926535897933e+00, /* 0x400921fb54442d18 */ + piby2 = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */ + piby2_head = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */ + piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */ + + double u, y, s=0.0, r; + int xexp, xnan, transform=0; + + unsigned long ux, aux, xneg; + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + xneg = (ux & SIGNBIT_DP64); + xnan = (aux > PINFBITPATT_DP64); + xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + + /* Special cases */ + + if (xnan) + { + return _handle_error("acos", OP_ACOS, ux|0x0008000000000000, _DOMAIN, + 0, EDOM, x, 0.0, 1); + } + else if (xexp < -56) + { /* y small enough that arccos(x) = pi/2 */ + return val_with_flags(piby2, AMD_F_INEXACT); + } + else if (xexp >= 0) + { /* abs(x) >= 1.0 */ + if (x == 1.0) + return 0.0; + else if (x == -1.0) + return val_with_flags(pi, AMD_F_INEXACT); + else + return _handle_error("acos", OP_ACOS, INDEFBITPATT_DP64, _DOMAIN, + AMD_F_INVALID, EDOM, x, 0.0, 1); + } + + if (xneg) y = -x; + else y = x; + + transform = (xexp >= -1); /* abs(x) >= 0.5 */ + + if (transform) + { /* Transform y into the range [0,0.5) */ + r = 0.5*(1.0 - y); + /* VC++ intrinsic call */ + _mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r))); + y = s; + } + else + r = y*y; + + /* Use a rational approximation for [0.0, 0.5] */ + + u = r*(0.227485835556935010735943483075 + + (-0.445017216867635649900123110649 + + (0.275558175256937652532686256258 + + (-0.0549989809235685841612020091328 + + (0.00109242697235074662306043804220 + + 0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/ + (1.36491501334161032038194214209 + + (-3.28431505720958658909889444194 + + (2.76568859157270989520376345954 + + (-0.943639137032492685763471240072 + + 0.105869422087204370341222318533*r)*r)*r)*r); + + if (transform) + { /* Reconstruct acos carefully in transformed region */ + if (xneg) return pi - 2.0*(s+(y*u - piby2_tail)); + else + { + double c, s1; + unsigned long us; + GET_BITS_DP64(s, us); + PUT_BITS_DP64(0xffffffff00000000 & us, s1); + c = (r-s1*s1)/(s+s1); + return 2.0*s1 + (2.0*c+2.0*y*u); + } + } + else + return piby2_head - (x - (piby2_tail - x*u)); +} diff --git a/sdk/lib/crt/math/libm_sse2/acosf.c b/sdk/lib/crt/math/libm_sse2/acosf.c new file mode 100644 index 00000000000..5422177b317 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/acosf.c @@ -0,0 +1,146 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
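The transformed region in acos above uses the reduction acos(x) = 2*asin(sqrt((1 - x)/2)) for x in [0.5, 1], and acos(x) = pi - 2*asin(sqrt((1 + x)/2)) for x in [-1, -0.5]; the code then rebuilds the result from the head and tail of s = sqrt(r) to preserve the low-order bits. A quick numerical check of the identity itself, using the platform's own acos and asin rather than this implementation:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        for (double x = 0.5; x <= 1.0; x += 0.0625) {
            double direct  = acos(x);
            double reduced = 2.0 * asin(sqrt(0.5 * (1.0 - x)));   // r = (1 - x)/2, s = sqrt(r)
            printf("x=%.4f  acos=%.17g  2*asin(sqrt(r))=%.17g\n", x, direct, reduced);
        }
        return 0;
    }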
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VALF_WITH_FLAGS +#define USE_NANF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_NANF_WITH_FLAGS +#undef USE_VALF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. +#pragma warning(disable:4163) +#pragma function(acosf) + + +float FN_PROTOTYPE(acosf)(float x) +{ + /* Computes arccos(x). + The argument is first reduced by noting that arccos(x) + is invalid for abs(x) > 1. For denormal and small + arguments arccos(x) = pi/2 to machine accuracy. + Remaining argument ranges are handled as follows. + For abs(x) <= 0.5 use + arccos(x) = pi/2 - arcsin(x) + = pi/2 - (x + x^3*R(x^2)) + where R(x^2) is a rational minimax approximation to + (arcsin(x) - x)/x^3. + For abs(x) > 0.5 exploit the identity: + arccos(x) = pi - 2*arcsin(sqrt(1-x)/2) + together with the above rational approximation, and + reconstruct the terms carefully. + */ + + /* Some constants and split constants. */ + + static const float + piby2 = 1.5707963705e+00F; /* 0x3fc90fdb */ + static const double + pi = 3.1415926535897933e+00, /* 0x400921fb54442d18 */ + piby2_head = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */ + piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */ + + float u, y, s = 0.0F, r; + int xexp, xnan, transform = 0; + + unsigned int ux, aux, xneg; + + GET_BITS_SP32(x, ux); + aux = ux & ~SIGNBIT_SP32; + xneg = (ux & SIGNBIT_SP32); + xnan = (aux > PINFBITPATT_SP32); + xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + /* Special cases */ + + if (xnan) + { + return _handle_errorf("acosf", OP_ACOS, ux|0x00400000, _DOMAIN, 0, + EDOM, x, 0.0F, 1); + } + else if (xexp < -26) + /* y small enough that arccos(x) = pi/2 */ + return valf_with_flags(piby2, AMD_F_INEXACT); + else if (xexp >= 0) + { /* abs(x) >= 1.0 */ + if (x == 1.0F) + return 0.0F; + else if (x == -1.0F) + return valf_with_flags((float)pi, AMD_F_INEXACT); + else + return _handle_errorf("acosf", OP_ACOS, INDEFBITPATT_SP32, _DOMAIN, + AMD_F_INVALID, EDOM, x, 0.0F, 1); + } + + if (xneg) y = -x; + else y = x; + + transform = (xexp >= -1); /* abs(x) >= 0.5 */ + + if (transform) + { /* Transform y into the range [0,0.5) */ + r = 0.5F*(1.0F - y); + /* VC++ intrinsic call */ + _mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r))); + y = s; + } + else + r = y*y; + + /* Use a rational approximation for [0.0, 0.5] */ + + u=r*(0.184161606965100694821398249421F + + (-0.0565298683201845211985026327361F + + (-0.0133819288943925804214011424456F - + 0.00396137437848476485201154797087F*r)*r)*r)/ + (1.10496961524520294485512696706F - + 0.836411276854206731913362287293F*r); + + if (transform) + { + /* Reconstruct acos carefully in transformed region */ + if (xneg) + return (float)(pi - 2.0*(s+(y*u - piby2_tail))); + else + { + float c, s1; + unsigned int us; + GET_BITS_SP32(s, us); + PUT_BITS_SP32(0xffff0000 & us, s1); + c = (r-s1*s1)/(s+s1); + return 2.0F*s1 + (2.0F*c+2.0F*y*u); + } + } + else + return (float)(piby2_head - (x - (piby2_tail - x*u))); +} diff --git a/sdk/lib/crt/math/libm_sse2/asin.c 
b/sdk/lib/crt/math/libm_sse2/asin.c new file mode 100644 index 00000000000..31e652b73c6 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/asin.c @@ -0,0 +1,153 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VAL_WITH_FLAGS +#define USE_NAN_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_NAN_WITH_FLAGS +#undef USE_VAL_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + +#pragma function(asin) + +double FN_PROTOTYPE(asin)(double x) +{ + /* Computes arcsin(x). + The argument is first reduced by noting that arcsin(x) + is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). + For denormal and small arguments arcsin(x) = x to machine + accuracy. Remaining argument ranges are handled as follows. + For abs(x) <= 0.5 use + arcsin(x) = x + x^3*R(x^2) + where R(x^2) is a rational minimax approximation to + (arcsin(x) - x)/x^3. + For abs(x) > 0.5 exploit the identity: + arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) + together with the above rational approximation, and + reconstruct the terms carefully. + */ + + /* Some constants and split constants. 
*/ + + static const double + piby2_tail = 6.1232339957367660e-17, /* 0x3c91a62633145c07 */ + hpiby2_head = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */ + piby2 = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */ + double u, v, y, s=0.0, r; + int xexp, xnan, transform=0; + + unsigned long ux, aux, xneg; + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + xneg = (ux & SIGNBIT_DP64); + xnan = (aux > PINFBITPATT_DP64); + xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + + /* Special cases */ + + if (xnan) + { + return _handle_error("asin", OP_ASIN, ux|0x0008000000000000, _DOMAIN, + 0, EDOM, x, 0.0, 1); + } + else if (xexp < -28) + { /* y small enough that arcsin(x) = x */ + return val_with_flags(x, AMD_F_INEXACT); + } + else if (xexp >= 0) + { /* abs(x) >= 1.0 */ + if (x == 1.0) + return val_with_flags(piby2, AMD_F_INEXACT); + else if (x == -1.0) + return val_with_flags(-piby2, AMD_F_INEXACT); + else + return _handle_error("asin", OP_ASIN, INDEFBITPATT_DP64, _DOMAIN, + AMD_F_INVALID, EDOM, x, 0.0, 1); + } + + if (xneg) y = -x; + else y = x; + + transform = (xexp >= -1); /* abs(x) >= 0.5 */ + + if (transform) + { /* Transform y into the range [0,0.5) */ + r = 0.5*(1.0 - y); + /* VC++ intrinsic call */ + _mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r))); + y = s; + } + else + r = y*y; + + /* Use a rational approximation for [0.0, 0.5] */ + + u = r*(0.227485835556935010735943483075 + + (-0.445017216867635649900123110649 + + (0.275558175256937652532686256258 + + (-0.0549989809235685841612020091328 + + (0.00109242697235074662306043804220 + + 0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/ + (1.36491501334161032038194214209 + + (-3.28431505720958658909889444194 + + (2.76568859157270989520376345954 + + (-0.943639137032492685763471240072 + + 0.105869422087204370341222318533*r)*r)*r)*r); + + if (transform) + { /* Reconstruct asin carefully in transformed region */ + { + double c, s1, p, q; + unsigned long us; + GET_BITS_DP64(s, us); + PUT_BITS_DP64(0xffffffff00000000 & us, s1); + c = (r-s1*s1)/(s+s1); + p = 2.0*s*u - (piby2_tail-2.0*c); + q = hpiby2_head - 2.0*s1; + v = hpiby2_head - (p-q); + } + } + else + { + /* Use a temporary variable to prevent VC++ rearranging + y + y*u + into + y * (1 + u) + and getting an incorrectly rounded result */ + double tmp; + tmp = y * u; + v = y + tmp; + } + + if (xneg) return -v; + else return v; +} diff --git a/sdk/lib/crt/math/libm_sse2/asinf.c b/sdk/lib/crt/math/libm_sse2/asinf.c new file mode 100644 index 00000000000..89dba1059ff --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/asinf.c @@ -0,0 +1,151 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
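In the transformed region above, s = sqrt(r) is split into a head s1 (the low 32 mantissa bits masked off, leaving at most 21 significant bits so that s1*s1 is exact) and a tail c = (r - s1*s1)/(s + s1), which recovers the discarded low part of the square root; the result is then assembled from these pieces to keep the reconstruction accurate. A small self-contained sketch of that split, assuming IEEE-754 doubles:

    #include <stdint.h>
    #include <string.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double r = 0.3, s = sqrt(r), s1, c;
        uint64_t us;
        memcpy(&us, &s, sizeof us);               // GET_BITS_DP64(s, us)
        us &= 0xffffffff00000000ULL;              // keep only the high mantissa bits
        memcpy(&s1, &us, sizeof s1);              // PUT_BITS_DP64(us, s1)
        c = (r - s1 * s1) / (s + s1);             // low part: approximately s - s1
        printf("s      = %.17g\ns1 + c = %.17g\n", s, s1 + c);
        return 0;
    }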
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VALF_WITH_FLAGS +#define USE_NANF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_NANF_WITH_FLAGS +#undef USE_VALF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. +#pragma warning(disable:4163) +#pragma function(asinf) + + +float FN_PROTOTYPE(asinf)(float x) +{ + /* Computes arcsin(x). + The argument is first reduced by noting that arcsin(x) + is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x). + For denormal and small arguments arcsin(x) = x to machine + accuracy. Remaining argument ranges are handled as follows. + For abs(x) <= 0.5 use + arcsin(x) = x + x^3*R(x^2) + where R(x^2) is a rational minimax approximation to + (arcsin(x) - x)/x^3. + For abs(x) > 0.5 exploit the identity: + arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2) + together with the above rational approximation, and + reconstruct the terms carefully. + */ + + /* Some constants and split constants. */ + + static const float + piby2_tail = 7.5497894159e-08F, /* 0x33a22168 */ + hpiby2_head = 7.8539812565e-01F, /* 0x3f490fda */ + piby2 = 1.5707963705e+00F; /* 0x3fc90fdb */ + float u, v, y, s = 0.0F, r; + int xexp, xnan, transform = 0; + + unsigned int ux, aux, xneg; + GET_BITS_SP32(x, ux); + aux = ux & ~SIGNBIT_SP32; + xneg = (ux & SIGNBIT_SP32); + xnan = (aux > PINFBITPATT_SP32); + xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + /* Special cases */ + + if (xnan) + { + return _handle_errorf("asinf", OP_ASIN, ux|0x00400000, _DOMAIN, 0, + EDOM, x, 0.0F, 1); + } + else if (xexp < -14) + /* y small enough that arcsin(x) = x */ + return valf_with_flags(x, AMD_F_INEXACT); + else if (xexp >= 0) + { + /* abs(x) >= 1.0 */ + if (x == 1.0F) + return valf_with_flags(piby2, AMD_F_INEXACT); + else if (x == -1.0F) + return valf_with_flags(-piby2, AMD_F_INEXACT); + else + return _handle_errorf("asinf", OP_ASIN, INDEFBITPATT_SP32, _DOMAIN, + AMD_F_INVALID, EDOM, x, 0.0F, 1); + } + + if (xneg) y = -x; + else y = x; + + transform = (xexp >= -1); /* abs(x) >= 0.5 */ + + if (transform) + { /* Transform y into the range [0,0.5) */ + r = 0.5F*(1.0F - y); + /* VC++ intrinsic call */ + _mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r))); + y = s; + } + else + r = y*y; + + /* Use a rational approximation for [0.0, 0.5] */ + + u=r*(0.184161606965100694821398249421F + + (-0.0565298683201845211985026327361F + + (-0.0133819288943925804214011424456F - + 0.00396137437848476485201154797087F*r)*r)*r)/ + (1.10496961524520294485512696706F - + 0.836411276854206731913362287293F*r); + + if (transform) + { + /* Reconstruct asin carefully in transformed region */ + float c, s1, p, q; + unsigned int us; + GET_BITS_SP32(s, us); + PUT_BITS_SP32(0xffff0000 & us, s1); + c = (r-s1*s1)/(s+s1); + p = 2.0F*s*u - (piby2_tail-2.0F*c); + q = hpiby2_head - 2.0F*s1; + v = hpiby2_head - (p-q); + } + else + { + /* 
Use a temporary variable to prevent VC++ rearranging + y + y*u + into + y * (1 + u) + and getting an incorrectly rounded result */ + float tmp; + tmp = y * u; + v = y + tmp; + } + + if (xneg) return -v; + else return v; +} diff --git a/sdk/lib/crt/math/libm_sse2/atan.c b/sdk/lib/crt/math/libm_sse2/atan.c new file mode 100644 index 00000000000..c28e0672779 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/atan.c @@ -0,0 +1,132 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VAL_WITH_FLAGS +#define USE_NAN_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_VAL_WITH_FLAGS +#undef USE_NAN_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + +#pragma function(atan) + +double FN_PROTOTYPE(atan)(double x) +{ + + /* Some constants and split constants. */ + + static double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */ + double chi, clo, v, s, q, z; + + /* Find properties of argument x. */ + + unsigned long ux, aux, xneg; + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + xneg = (ux != aux); + + if (xneg) v = -x; + else v = x; + + /* Argument reduction to range [-7/16,7/16] */ + + if (aux > 0x4003800000000000) /* v > 39./16. */ + { + + if (aux > PINFBITPATT_DP64) + { + /* x is NaN */ + return _handle_error("atan", OP_ATAN, ux|0x0008000000000000, _DOMAIN, 0, + EDOM, x, 0.0, 1); + } + else if (v > 0x4370000000000000) + { /* abs(x) > 2^56 => arctan(1/x) is + insignificant compared to piby2 */ + if (xneg) + return val_with_flags(-piby2, AMD_F_INEXACT); + else + return val_with_flags(piby2, AMD_F_INEXACT); + } + + x = -1.0/v; + /* (chi + clo) = arctan(infinity) */ + chi = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ + clo = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */ + } + else if (aux > 0x3ff3000000000000) /* 39./16. > v > 19./16. */ + { + x = (v-1.5)/(1.0+1.5*v); + /* (chi + clo) = arctan(1.5) */ + chi = 9.82793723247329054082e-01; /* 0x3fef730bd281f69b */ + clo = 1.39033110312309953701e-17; /* 0x3c7007887af0cbbc */ + } + else if (aux > 0x3fe6000000000000) /* 19./16. > v > 11./16. */ + { + x = (v-1.0)/(1.0+v); + /* (chi + clo) = arctan(1.) 
*/ + chi = 7.85398163397448278999e-01; /* 0x3fe921fb54442d18 */ + clo = 3.06161699786838240164e-17; /* 0x3c81a62633145c06 */ + } + else if (aux > 0x3fdc000000000000) /* 11./16. > v > 7./16. */ + { + x = (2.0*v-1.0)/(2.0+v); + /* (chi + clo) = arctan(0.5) */ + chi = 4.63647609000806093515e-01; /* 0x3fddac670561bb4f */ + clo = 2.26987774529616809294e-17; /* 0x3c7a2b7f222f65e0 */ + } + else /* v < 7./16. */ + { + x = v; + chi = 0.0; + clo = 0.0; + } + + /* Core approximation: Remez(4,4) on [-7/16,7/16] */ + + s = x*x; + q = x*s* + (0.268297920532545909e0 + + (0.447677206805497472e0 + + (0.220638780716667420e0 + + (0.304455919504853031e-1 + + 0.142316903342317766e-3*s)*s)*s)*s)/ + (0.804893761597637733e0 + + (0.182596787737507063e1 + + (0.141254259931958921e1 + + (0.424602594203847109e0 + + 0.389525873944742195e-1*s)*s)*s)*s); + + z = chi - ((q - clo) - x); + + if (xneg) z = -z; + return z; +} diff --git a/sdk/lib/crt/math/libm_sse2/atan2.c b/sdk/lib/crt/math/libm_sse2/atan2.c new file mode 100644 index 00000000000..fb9d1e8482f --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/atan2.c @@ -0,0 +1,750 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VAL_WITH_FLAGS +#define USE_NAN_WITH_FLAGS +#define USE_SCALEDOUBLE_1 +#define USE_SCALEDOUBLE_2 +#define USE_SCALEUPDOUBLE1024 +#define USE_SCALEDOWNDOUBLE +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_VAL_WITH_FLAGS +#undef USE_NAN_WITH_FLAGS +#undef USE_SCALEDOUBLE_1 +#undef USE_SCALEDOUBLE_2 +#undef USE_SCALEUPDOUBLE1024 +#undef USE_SCALEDOWNDOUBLE +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + +#pragma function(atan2) + +double FN_PROTOTYPE(atan2)(double y, double x) +{ + /* Arrays atan_jby256_lead and atan_jby256_tail contain + leading and trailing parts respectively of precomputed + values of atan(j/256), for j = 16, 17, ..., 256. + atan_jby256_lead contains the first 21 bits of precision, + and atan_jby256_tail contains a further 53 bits precision. 
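Each table entry below stores atan(j/256) for j = 16..256 as a sum of two doubles: the lead carries the first 21 bits and the tail supplies a further 53 bits, so lead + tail reproduces atan(j/256) essentially to full double precision. A tiny check of the first entry (j = 16) against the standard library:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double lead = 6.24187886714935302734e-02;   // atan_jby256_lead[0], i.e. j = 16
        double tail = 2.13244638182005395671e-08;   // atan_jby256_tail[0]
        printf("lead + tail  = %.17g\n", lead + tail);
        printf("atan(16/256) = %.17g\n", atan(16.0 / 256.0));
        return 0;
    }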
*/ + + static const double atan_jby256_lead[ 241] = { + 6.24187886714935302734e-02, /* 0x3faff55b00000000 */ + 6.63088560104370117188e-02, /* 0x3fb0f99e00000000 */ + 7.01969265937805175781e-02, /* 0x3fb1f86d00000000 */ + 7.40829110145568847656e-02, /* 0x3fb2f71900000000 */ + 7.79666304588317871094e-02, /* 0x3fb3f59f00000000 */ + 8.18479657173156738281e-02, /* 0x3fb4f3fd00000000 */ + 8.57268571853637695312e-02, /* 0x3fb5f23200000000 */ + 8.96031260490417480469e-02, /* 0x3fb6f03b00000000 */ + 9.34767723083496093750e-02, /* 0x3fb7ee1800000000 */ + 9.73475575447082519531e-02, /* 0x3fb8ebc500000000 */ + 1.01215422153472900391e-01, /* 0x3fb9e94100000000 */ + 1.05080246925354003906e-01, /* 0x3fbae68a00000000 */ + 1.08941912651062011719e-01, /* 0x3fbbe39e00000000 */ + 1.12800359725952148438e-01, /* 0x3fbce07c00000000 */ + 1.16655409336090087891e-01, /* 0x3fbddd2100000000 */ + 1.20507001876831054688e-01, /* 0x3fbed98c00000000 */ + 1.24354958534240722656e-01, /* 0x3fbfd5ba00000000 */ + 1.28199219703674316406e-01, /* 0x3fc068d500000000 */ + 1.32039666175842285156e-01, /* 0x3fc0e6ad00000000 */ + 1.35876297950744628906e-01, /* 0x3fc1646500000000 */ + 1.39708757400512695312e-01, /* 0x3fc1e1fa00000000 */ + 1.43537282943725585938e-01, /* 0x3fc25f6e00000000 */ + 1.47361397743225097656e-01, /* 0x3fc2dcbd00000000 */ + 1.51181221008300781250e-01, /* 0x3fc359e800000000 */ + 1.54996633529663085938e-01, /* 0x3fc3d6ee00000000 */ + 1.58807516098022460938e-01, /* 0x3fc453ce00000000 */ + 1.62613749504089355469e-01, /* 0x3fc4d08700000000 */ + 1.66415214538574218750e-01, /* 0x3fc54d1800000000 */ + 1.70211911201477050781e-01, /* 0x3fc5c98100000000 */ + 1.74003481864929199219e-01, /* 0x3fc645bf00000000 */ + 1.77790164947509765625e-01, /* 0x3fc6c1d400000000 */ + 1.81571602821350097656e-01, /* 0x3fc73dbd00000000 */ + 1.85347914695739746094e-01, /* 0x3fc7b97b00000000 */ + 1.89118742942810058594e-01, /* 0x3fc8350b00000000 */ + 1.92884206771850585938e-01, /* 0x3fc8b06e00000000 */ + 1.96644186973571777344e-01, /* 0x3fc92ba300000000 */ + 2.00398445129394531250e-01, /* 0x3fc9a6a800000000 */ + 2.04147100448608398438e-01, /* 0x3fca217e00000000 */ + 2.07889914512634277344e-01, /* 0x3fca9c2300000000 */ + 2.11626768112182617188e-01, /* 0x3fcb169600000000 */ + 2.15357661247253417969e-01, /* 0x3fcb90d700000000 */ + 2.19082474708557128906e-01, /* 0x3fcc0ae500000000 */ + 2.22801089286804199219e-01, /* 0x3fcc84bf00000000 */ + 2.26513504981994628906e-01, /* 0x3fccfe6500000000 */ + 2.30219483375549316406e-01, /* 0x3fcd77d500000000 */ + 2.33919143676757812500e-01, /* 0x3fcdf11000000000 */ + 2.37612247467041015625e-01, /* 0x3fce6a1400000000 */ + 2.41298794746398925781e-01, /* 0x3fcee2e100000000 */ + 2.44978547096252441406e-01, /* 0x3fcf5b7500000000 */ + 2.48651623725891113281e-01, /* 0x3fcfd3d100000000 */ + 2.52317905426025390625e-01, /* 0x3fd025fa00000000 */ + 2.55977153778076171875e-01, /* 0x3fd061ee00000000 */ + 2.59629487991333007812e-01, /* 0x3fd09dc500000000 */ + 2.63274669647216796875e-01, /* 0x3fd0d97e00000000 */ + 2.66912937164306640625e-01, /* 0x3fd1151a00000000 */ + 2.70543813705444335938e-01, /* 0x3fd1509700000000 */ + 2.74167299270629882812e-01, /* 0x3fd18bf500000000 */ + 2.77783632278442382812e-01, /* 0x3fd1c73500000000 */ + 2.81392335891723632812e-01, /* 0x3fd2025500000000 */ + 2.84993648529052734375e-01, /* 0x3fd23d5600000000 */ + 2.88587331771850585938e-01, /* 0x3fd2783700000000 */ + 2.92173147201538085938e-01, /* 0x3fd2b2f700000000 */ + 2.95751571655273437500e-01, /* 0x3fd2ed9800000000 */ + 2.99322128295898437500e-01, /* 
0x3fd3281800000000 */ + 3.02884817123413085938e-01, /* 0x3fd3627700000000 */ + 3.06439399719238281250e-01, /* 0x3fd39cb400000000 */ + 3.09986352920532226562e-01, /* 0x3fd3d6d100000000 */ + 3.13524961471557617188e-01, /* 0x3fd410cb00000000 */ + 3.17055702209472656250e-01, /* 0x3fd44aa400000000 */ + 3.20578098297119140625e-01, /* 0x3fd4845a00000000 */ + 3.24092388153076171875e-01, /* 0x3fd4bdee00000000 */ + 3.27598333358764648438e-01, /* 0x3fd4f75f00000000 */ + 3.31095933914184570312e-01, /* 0x3fd530ad00000000 */ + 3.34585189819335937500e-01, /* 0x3fd569d800000000 */ + 3.38066101074218750000e-01, /* 0x3fd5a2e000000000 */ + 3.41538190841674804688e-01, /* 0x3fd5dbc300000000 */ + 3.45002174377441406250e-01, /* 0x3fd6148400000000 */ + 3.48457098007202148438e-01, /* 0x3fd64d1f00000000 */ + 3.51903676986694335938e-01, /* 0x3fd6859700000000 */ + 3.55341434478759765625e-01, /* 0x3fd6bdea00000000 */ + 3.58770608901977539062e-01, /* 0x3fd6f61900000000 */ + 3.62190723419189453125e-01, /* 0x3fd72e2200000000 */ + 3.65602254867553710938e-01, /* 0x3fd7660700000000 */ + 3.69004726409912109375e-01, /* 0x3fd79dc600000000 */ + 3.72398376464843750000e-01, /* 0x3fd7d56000000000 */ + 3.75782966613769531250e-01, /* 0x3fd80cd400000000 */ + 3.79158496856689453125e-01, /* 0x3fd8442200000000 */ + 3.82525205612182617188e-01, /* 0x3fd87b4b00000000 */ + 3.85882616043090820312e-01, /* 0x3fd8b24d00000000 */ + 3.89230966567993164062e-01, /* 0x3fd8e92900000000 */ + 3.92570018768310546875e-01, /* 0x3fd91fde00000000 */ + 3.95900011062622070312e-01, /* 0x3fd9566d00000000 */ + 3.99220705032348632812e-01, /* 0x3fd98cd500000000 */ + 4.02532100677490234375e-01, /* 0x3fd9c31600000000 */ + 4.05834197998046875000e-01, /* 0x3fd9f93000000000 */ + 4.09126996994018554688e-01, /* 0x3fda2f2300000000 */ + 4.12410259246826171875e-01, /* 0x3fda64ee00000000 */ + 4.15684223175048828125e-01, /* 0x3fda9a9200000000 */ + 4.18948888778686523438e-01, /* 0x3fdad00f00000000 */ + 4.22204017639160156250e-01, /* 0x3fdb056400000000 */ + 4.25449609756469726562e-01, /* 0x3fdb3a9100000000 */ + 4.28685665130615234375e-01, /* 0x3fdb6f9600000000 */ + 4.31912183761596679688e-01, /* 0x3fdba47300000000 */ + 4.35129165649414062500e-01, /* 0x3fdbd92800000000 */ + 4.38336372375488281250e-01, /* 0x3fdc0db400000000 */ + 4.41534280776977539062e-01, /* 0x3fdc421900000000 */ + 4.44722414016723632812e-01, /* 0x3fdc765500000000 */ + 4.47900772094726562500e-01, /* 0x3fdcaa6800000000 */ + 4.51069593429565429688e-01, /* 0x3fdcde5300000000 */ + 4.54228639602661132812e-01, /* 0x3fdd121500000000 */ + 4.57377910614013671875e-01, /* 0x3fdd45ae00000000 */ + 4.60517644882202148438e-01, /* 0x3fdd791f00000000 */ + 4.63647603988647460938e-01, /* 0x3fddac6700000000 */ + 4.66767549514770507812e-01, /* 0x3fdddf8500000000 */ + 4.69877958297729492188e-01, /* 0x3fde127b00000000 */ + 4.72978591918945312500e-01, /* 0x3fde454800000000 */ + 4.76069211959838867188e-01, /* 0x3fde77eb00000000 */ + 4.79150056838989257812e-01, /* 0x3fdeaa6500000000 */ + 4.82221126556396484375e-01, /* 0x3fdedcb600000000 */ + 4.85282421112060546875e-01, /* 0x3fdf0ede00000000 */ + 4.88333940505981445312e-01, /* 0x3fdf40dd00000000 */ + 4.91375446319580078125e-01, /* 0x3fdf72b200000000 */ + 4.94406938552856445312e-01, /* 0x3fdfa45d00000000 */ + 4.97428894042968750000e-01, /* 0x3fdfd5e000000000 */ + 5.00440597534179687500e-01, /* 0x3fe0039c00000000 */ + 5.03442764282226562500e-01, /* 0x3fe01c3400000000 */ + 5.06434917449951171875e-01, /* 0x3fe034b700000000 */ + 5.09417057037353515625e-01, /* 0x3fe04d2500000000 */ + 
5.12389183044433593750e-01, /* 0x3fe0657e00000000 */ + 5.15351772308349609375e-01, /* 0x3fe07dc300000000 */ + 5.18304347991943359375e-01, /* 0x3fe095f300000000 */ + 5.21246910095214843750e-01, /* 0x3fe0ae0e00000000 */ + 5.24179458618164062500e-01, /* 0x3fe0c61400000000 */ + 5.27101993560791015625e-01, /* 0x3fe0de0500000000 */ + 5.30014991760253906250e-01, /* 0x3fe0f5e200000000 */ + 5.32917976379394531250e-01, /* 0x3fe10daa00000000 */ + 5.35810947418212890625e-01, /* 0x3fe1255d00000000 */ + 5.38693904876708984375e-01, /* 0x3fe13cfb00000000 */ + 5.41567325592041015625e-01, /* 0x3fe1548500000000 */ + 5.44430732727050781250e-01, /* 0x3fe16bfa00000000 */ + 5.47284126281738281250e-01, /* 0x3fe1835a00000000 */ + 5.50127506256103515625e-01, /* 0x3fe19aa500000000 */ + 5.52961349487304687500e-01, /* 0x3fe1b1dc00000000 */ + 5.55785179138183593750e-01, /* 0x3fe1c8fe00000000 */ + 5.58598995208740234375e-01, /* 0x3fe1e00b00000000 */ + 5.61403274536132812500e-01, /* 0x3fe1f70400000000 */ + 5.64197540283203125000e-01, /* 0x3fe20de800000000 */ + 5.66981792449951171875e-01, /* 0x3fe224b700000000 */ + 5.69756031036376953125e-01, /* 0x3fe23b7100000000 */ + 5.72520732879638671875e-01, /* 0x3fe2521700000000 */ + 5.75275897979736328125e-01, /* 0x3fe268a900000000 */ + 5.78021049499511718750e-01, /* 0x3fe27f2600000000 */ + 5.80756187438964843750e-01, /* 0x3fe2958e00000000 */ + 5.83481788635253906250e-01, /* 0x3fe2abe200000000 */ + 5.86197376251220703125e-01, /* 0x3fe2c22100000000 */ + 5.88903427124023437500e-01, /* 0x3fe2d84c00000000 */ + 5.91599464416503906250e-01, /* 0x3fe2ee6200000000 */ + 5.94285964965820312500e-01, /* 0x3fe3046400000000 */ + 5.96962928771972656250e-01, /* 0x3fe31a5200000000 */ + 5.99629878997802734375e-01, /* 0x3fe3302b00000000 */ + 6.02287292480468750000e-01, /* 0x3fe345f000000000 */ + 6.04934692382812500000e-01, /* 0x3fe35ba000000000 */ + 6.07573032379150390625e-01, /* 0x3fe3713d00000000 */ + 6.10201358795166015625e-01, /* 0x3fe386c500000000 */ + 6.12820148468017578125e-01, /* 0x3fe39c3900000000 */ + 6.15428924560546875000e-01, /* 0x3fe3b19800000000 */ + 6.18028640747070312500e-01, /* 0x3fe3c6e400000000 */ + 6.20618820190429687500e-01, /* 0x3fe3dc1c00000000 */ + 6.23198986053466796875e-01, /* 0x3fe3f13f00000000 */ + 6.25770092010498046875e-01, /* 0x3fe4064f00000000 */ + 6.28331184387207031250e-01, /* 0x3fe41b4a00000000 */ + 6.30883216857910156250e-01, /* 0x3fe4303200000000 */ + 6.33425712585449218750e-01, /* 0x3fe4450600000000 */ + 6.35958671569824218750e-01, /* 0x3fe459c600000000 */ + 6.38482093811035156250e-01, /* 0x3fe46e7200000000 */ + 6.40995979309082031250e-01, /* 0x3fe4830a00000000 */ + 6.43500804901123046875e-01, /* 0x3fe4978f00000000 */ + 6.45996093750000000000e-01, /* 0x3fe4ac0000000000 */ + 6.48482322692871093750e-01, /* 0x3fe4c05e00000000 */ + 6.50959014892578125000e-01, /* 0x3fe4d4a800000000 */ + 6.53426170349121093750e-01, /* 0x3fe4e8de00000000 */ + 6.55884265899658203125e-01, /* 0x3fe4fd0100000000 */ + 6.58332824707031250000e-01, /* 0x3fe5111000000000 */ + 6.60772323608398437500e-01, /* 0x3fe5250c00000000 */ + 6.63202762603759765625e-01, /* 0x3fe538f500000000 */ + 6.65623664855957031250e-01, /* 0x3fe54cca00000000 */ + 6.68035984039306640625e-01, /* 0x3fe5608d00000000 */ + 6.70438766479492187500e-01, /* 0x3fe5743c00000000 */ + 6.72832489013671875000e-01, /* 0x3fe587d800000000 */ + 6.75216674804687500000e-01, /* 0x3fe59b6000000000 */ + 6.77592277526855468750e-01, /* 0x3fe5aed600000000 */ + 6.79958820343017578125e-01, /* 0x3fe5c23900000000 */ + 6.82316303253173828125e-01, /* 
0x3fe5d58900000000 */ + 6.84664726257324218750e-01, /* 0x3fe5e8c600000000 */ + 6.87004089355468750000e-01, /* 0x3fe5fbf000000000 */ + 6.89334869384765625000e-01, /* 0x3fe60f0800000000 */ + 6.91656589508056640625e-01, /* 0x3fe6220d00000000 */ + 6.93969249725341796875e-01, /* 0x3fe634ff00000000 */ + 6.96272850036621093750e-01, /* 0x3fe647de00000000 */ + 6.98567867279052734375e-01, /* 0x3fe65aab00000000 */ + 7.00854301452636718750e-01, /* 0x3fe66d6600000000 */ + 7.03131675720214843750e-01, /* 0x3fe6800e00000000 */ + 7.05400466918945312500e-01, /* 0x3fe692a400000000 */ + 7.07660198211669921875e-01, /* 0x3fe6a52700000000 */ + 7.09911346435546875000e-01, /* 0x3fe6b79800000000 */ + 7.12153911590576171875e-01, /* 0x3fe6c9f700000000 */ + 7.14387893676757812500e-01, /* 0x3fe6dc4400000000 */ + 7.16613292694091796875e-01, /* 0x3fe6ee7f00000000 */ + 7.18829631805419921875e-01, /* 0x3fe700a700000000 */ + 7.21037864685058593750e-01, /* 0x3fe712be00000000 */ + 7.23237514495849609375e-01, /* 0x3fe724c300000000 */ + 7.25428581237792968750e-01, /* 0x3fe736b600000000 */ + 7.27611064910888671875e-01, /* 0x3fe7489700000000 */ + 7.29785442352294921875e-01, /* 0x3fe75a6700000000 */ + 7.31950759887695312500e-01, /* 0x3fe76c2400000000 */ + 7.34108448028564453125e-01, /* 0x3fe77dd100000000 */ + 7.36257076263427734375e-01, /* 0x3fe78f6b00000000 */ + 7.38397598266601562500e-01, /* 0x3fe7a0f400000000 */ + 7.40530014038085937500e-01, /* 0x3fe7b26c00000000 */ + 7.42654323577880859375e-01, /* 0x3fe7c3d300000000 */ + 7.44770050048828125000e-01, /* 0x3fe7d52800000000 */ + 7.46877670288085937500e-01, /* 0x3fe7e66c00000000 */ + 7.48976707458496093750e-01, /* 0x3fe7f79e00000000 */ + 7.51068115234375000000e-01, /* 0x3fe808c000000000 */ + 7.53150939941406250000e-01, /* 0x3fe819d000000000 */ + 7.55226135253906250000e-01, /* 0x3fe82ad000000000 */ + 7.57292747497558593750e-01, /* 0x3fe83bbe00000000 */ + 7.59351730346679687500e-01, /* 0x3fe84c9c00000000 */ + 7.61402606964111328125e-01, /* 0x3fe85d6900000000 */ + 7.63445377349853515625e-01, /* 0x3fe86e2500000000 */ + 7.65480041503906250000e-01, /* 0x3fe87ed000000000 */ + 7.67507076263427734375e-01, /* 0x3fe88f6b00000000 */ + 7.69526004791259765625e-01, /* 0x3fe89ff500000000 */ + 7.71537303924560546875e-01, /* 0x3fe8b06f00000000 */ + 7.73540973663330078125e-01, /* 0x3fe8c0d900000000 */ + 7.75536537170410156250e-01, /* 0x3fe8d13200000000 */ + 7.77523994445800781250e-01, /* 0x3fe8e17a00000000 */ + 7.79504299163818359375e-01, /* 0x3fe8f1b300000000 */ + 7.81476497650146484375e-01, /* 0x3fe901db00000000 */ + 7.83441066741943359375e-01, /* 0x3fe911f300000000 */ + 7.85398006439208984375e-01}; /* 0x3fe921fb00000000 */ + + static const double atan_jby256_tail[ 241] = { + 2.13244638182005395671e-08, /* 0x3e56e59fbd38db2c */ + 3.89093864761712760656e-08, /* 0x3e64e3aa54dedf96 */ + 4.44780900009437454576e-08, /* 0x3e67e105ab1bda88 */ + 1.15344768460112754160e-08, /* 0x3e48c5254d013fd0 */ + 3.37271051945395312705e-09, /* 0x3e2cf8ab3ad62670 */ + 2.40857608736109859459e-08, /* 0x3e59dca4bec80468 */ + 1.85853810450623807768e-08, /* 0x3e53f4b5ec98a8da */ + 5.14358299969225078306e-08, /* 0x3e6b9d49619d81fe */ + 8.85023985412952486748e-09, /* 0x3e43017887460934 */ + 1.59425154214358432060e-08, /* 0x3e511e3eca0b9944 */ + 1.95139937737755753164e-08, /* 0x3e54f3f73c5a332e */ + 2.64909755273544319715e-08, /* 0x3e5c71c8ae0e00a6 */ + 4.43388037881231070144e-08, /* 0x3e67cde0f86fbdc7 */ + 2.14757072421821274557e-08, /* 0x3e570f328c889c72 */ + 2.61049792670754218852e-08, /* 0x3e5c07ae9b994efe */ + 
7.81439350674466302231e-09, /* 0x3e40c8021d7b1698 */ + 3.60125207123751024094e-08, /* 0x3e635585edb8cb22 */ + 6.15276238179343767917e-08, /* 0x3e70842567b30e96 */ + 9.54387964641184285058e-08, /* 0x3e799e811031472e */ + 3.02789566851502754129e-08, /* 0x3e6041821416bcee */ + 1.16888650949870856331e-07, /* 0x3e7f6086e4dc96f4 */ + 1.07580956468653338863e-08, /* 0x3e471a535c5f1b58 */ + 8.33454265379535427653e-08, /* 0x3e765f743fe63ca1 */ + 1.10790279272629526068e-07, /* 0x3e7dbd733472d014 */ + 1.08394277896366207424e-07, /* 0x3e7d18cc4d8b0d1d */ + 9.22176086126841098800e-08, /* 0x3e78c12553c8fb29 */ + 7.90938592199048786990e-08, /* 0x3e753b49e2e8f991 */ + 8.66445407164293125637e-08, /* 0x3e77422ae148c141 */ + 1.40839973537092438671e-08, /* 0x3e4e3ec269df56a8 */ + 1.19070438507307600689e-07, /* 0x3e7ff6754e7e0ac9 */ + 6.40451663051716197071e-08, /* 0x3e7131267b1b5aad */ + 1.08338682076343674522e-07, /* 0x3e7d14fa403a94bc */ + 3.52999550187922736222e-08, /* 0x3e62f396c089a3d8 */ + 1.05983273930043077202e-07, /* 0x3e7c731d78fa95bb */ + 1.05486124078259553339e-07, /* 0x3e7c50f385177399 */ + 5.82167732281776477773e-08, /* 0x3e6f41409c6f2c20 */ + 1.08696483983403942633e-07, /* 0x3e7d2d90c4c39ec0 */ + 4.47335086122377542835e-08, /* 0x3e680420696f2106 */ + 1.26896287162615723528e-08, /* 0x3e4b40327943a2e8 */ + 4.06534471589151404531e-08, /* 0x3e65d35e02f3d2a2 */ + 3.84504846300557026690e-08, /* 0x3e64a498288117b0 */ + 3.60715006404807269080e-08, /* 0x3e635da119afb324 */ + 6.44725903165522722801e-08, /* 0x3e714e85cdb9a908 */ + 3.63749249976409461305e-08, /* 0x3e638754e5547b9a */ + 1.03901294413833913794e-07, /* 0x3e7be40ae6ce3246 */ + 6.25379756302167880580e-08, /* 0x3e70c993b3bea7e7 */ + 6.63984302368488828029e-08, /* 0x3e71d2dd89ac3359 */ + 3.21844598971548278059e-08, /* 0x3e61476603332c46 */ + 1.16030611712765830905e-07, /* 0x3e7f25901bac55b7 */ + 1.17464622142347730134e-07, /* 0x3e7f881b7c826e28 */ + 7.54604017965808996596e-08, /* 0x3e7441996d698d20 */ + 1.49234929356206556899e-07, /* 0x3e8407ac521ea089 */ + 1.41416924523217430259e-07, /* 0x3e82fb0c6c4b1723 */ + 2.13308065617483489011e-07, /* 0x3e8ca135966a3e18 */ + 5.04230937933302320146e-08, /* 0x3e6b1218e4d646e4 */ + 5.45874922281655519035e-08, /* 0x3e6d4e72a350d288 */ + 1.51849028914786868886e-07, /* 0x3e84617e2f04c329 */ + 3.09004308703769273010e-08, /* 0x3e6096ec41e82650 */ + 9.67574548184738317664e-08, /* 0x3e79f91f25773e6e */ + 4.02508285529322212824e-08, /* 0x3e659c0820f1d674 */ + 3.01222268096861091157e-08, /* 0x3e602bf7a2df1064 */ + 2.36189860670079288680e-07, /* 0x3e8fb36bfc40508f */ + 1.14095158111080887695e-07, /* 0x3e7ea08f3f8dc892 */ + 7.42349089746573467487e-08, /* 0x3e73ed6254656a0e */ + 5.12515583196230380184e-08, /* 0x3e6b83f5e5e69c58 */ + 2.19290391828763918102e-07, /* 0x3e8d6ec2af768592 */ + 3.83263512187553886471e-08, /* 0x3e6493889a226f94 */ + 1.61513486284090523855e-07, /* 0x3e85ad8fa65279ba */ + 5.09996743535589922261e-08, /* 0x3e6b615784d45434 */ + 1.23694037861246766534e-07, /* 0x3e809a184368f145 */ + 8.23367955351123783984e-08, /* 0x3e761a2439b0d91c */ + 1.07591766213053694014e-07, /* 0x3e7ce1a65e39a978 */ + 1.42789947524631815640e-07, /* 0x3e832a39a93b6a66 */ + 1.32347123024711878538e-07, /* 0x3e81c3699af804e7 */ + 2.17626067316598149229e-08, /* 0x3e575e0f4e44ede8 */ + 2.34454866923044288656e-07, /* 0x3e8f77ced1a7a83b */ + 2.82966370261766916053e-09, /* 0x3e284e7f0cb1b500 */ + 2.29300919890907632975e-07, /* 0x3e8ec6b838b02dfe */ + 1.48428270450261284915e-07, /* 0x3e83ebf4dfbeda87 */ + 1.87937408574313982512e-07, /* 
0x3e89397aed9cb475 */ + 6.13685946813334055347e-08, /* 0x3e707937bc239c54 */ + 1.98585022733583817493e-07, /* 0x3e8aa754553131b6 */ + 7.68394131623752961662e-08, /* 0x3e74a05d407c45dc */ + 1.28119052312436745644e-07, /* 0x3e8132231a206dd0 */ + 7.02119104719236502733e-08, /* 0x3e72d8ecfdd69c88 */ + 9.87954793820636301943e-08, /* 0x3e7a852c74218606 */ + 1.72176752381034986217e-07, /* 0x3e871bf2baeebb50 */ + 1.12877225146169704119e-08, /* 0x3e483d7db7491820 */ + 5.33549829555851737993e-08, /* 0x3e6ca50d92b6da14 */ + 2.13833275710816521345e-08, /* 0x3e56f5cde8530298 */ + 1.16243518048290556393e-07, /* 0x3e7f343198910740 */ + 6.29926408369055877943e-08, /* 0x3e70e8d241ccd80a */ + 6.45429039328021963791e-08, /* 0x3e71535ac619e6c8 */ + 8.64001922814281933403e-08, /* 0x3e77316041c36cd2 */ + 9.50767572202325800240e-08, /* 0x3e7985a000637d8e */ + 5.80851497508121135975e-08, /* 0x3e6f2f29858c0a68 */ + 1.82350561135024766232e-07, /* 0x3e8879847f96d909 */ + 1.98948680587390608655e-07, /* 0x3e8ab3d319e12e42 */ + 7.83548663450197659846e-08, /* 0x3e75088162dfc4c2 */ + 3.04374234486798594427e-08, /* 0x3e605749a1cd9d8c */ + 2.76135725629797411787e-08, /* 0x3e5da65c6c6b8618 */ + 4.32610105454203065470e-08, /* 0x3e6739bf7df1ad64 */ + 5.17107515324127256994e-08, /* 0x3e6bc31252aa3340 */ + 2.82398327875841444660e-08, /* 0x3e5e528191ad3aa8 */ + 1.87482469524195595399e-07, /* 0x3e8929d93df19f18 */ + 2.97481891662714096139e-08, /* 0x3e5ff11eb693a080 */ + 9.94421570843584316402e-09, /* 0x3e455ae3f145a3a0 */ + 1.07056210730391848428e-07, /* 0x3e7cbcd8c6c0ca82 */ + 6.25589580466881163081e-08, /* 0x3e70cb04d425d304 */ + 9.56641013869464593803e-08, /* 0x3e79adfcab5be678 */ + 1.88056307148355440276e-07, /* 0x3e893d90c5662508 */ + 8.38850689379557880950e-08, /* 0x3e768489bd35ff40 */ + 5.01215865527674122924e-09, /* 0x3e3586ed3da2b7e0 */ + 1.74166095998522089762e-07, /* 0x3e87604d2e850eee */ + 9.96779574395363585849e-08, /* 0x3e7ac1d12bfb53d8 */ + 5.98432026368321460686e-09, /* 0x3e39b3d468274740 */ + 1.18362922366887577169e-07, /* 0x3e7fc5d68d10e53c */ + 1.86086833284154215946e-07, /* 0x3e88f9e51884becb */ + 1.97671457251348941011e-07, /* 0x3e8a87f0869c06d1 */ + 1.42447160717199237159e-07, /* 0x3e831e7279f685fa */ + 1.05504240785546574184e-08, /* 0x3e46a8282f9719b0 */ + 3.13335218371639189324e-08, /* 0x3e60d2724a8a44e0 */ + 1.96518418901914535399e-07, /* 0x3e8a60524b11ad4e */ + 2.17692035039173536059e-08, /* 0x3e575fdf832750f0 */ + 2.15613114426529981675e-07, /* 0x3e8cf06902e4cd36 */ + 5.68271098300441214948e-08, /* 0x3e6e82422d4f6d10 */ + 1.70331455823369124256e-08, /* 0x3e524a091063e6c0 */ + 9.17590028095709583247e-08, /* 0x3e78a1a172dc6f38 */ + 2.77266304112916566247e-07, /* 0x3e929b6619f8a92d */ + 9.37041937614656939690e-08, /* 0x3e79274d9c1b70c8 */ + 1.56116346368316796511e-08, /* 0x3e50c34b1fbb7930 */ + 4.13967433808382727413e-08, /* 0x3e6639866c20eb50 */ + 1.70164749185821616276e-07, /* 0x3e86d6d0f6832e9e */ + 4.01708788545600086008e-07, /* 0x3e9af54def99f25e */ + 2.59663539226050551563e-07, /* 0x3e916cfc52a00262 */ + 2.22007487655027469542e-07, /* 0x3e8dcc1e83569c32 */ + 2.90542250809644081369e-07, /* 0x3e937f7a551ed425 */ + 4.67720537666628903341e-07, /* 0x3e9f6360adc98887 */ + 2.79799803956772554802e-07, /* 0x3e92c6ec8d35a2c1 */ + 2.07344552327432547723e-07, /* 0x3e8bd44df84cb036 */ + 2.54705698692735196368e-07, /* 0x3e9117cf826e310e */ + 4.26848589539548450728e-07, /* 0x3e9ca533f332cfc9 */ + 2.52506723633552216197e-07, /* 0x3e90f208509dbc2e */ + 2.14684129933849704964e-07, /* 0x3e8cd07d93c945de */ + 
3.20134822201596505431e-07, /* 0x3e957bdfd67e6d72 */ + 9.93537565749855712134e-08, /* 0x3e7aab89c516c658 */ + 3.70792944827917252327e-08, /* 0x3e63e823b1a1b8a0 */ + 1.41772749369083698972e-07, /* 0x3e8307464a9d6d3c */ + 4.22446601490198804306e-07, /* 0x3e9c5993cd438843 */ + 4.11818433724801511540e-07, /* 0x3e9ba2fca02ab554 */ + 1.19976381502605310519e-07, /* 0x3e801a5b6983a268 */ + 3.43703078571520905265e-08, /* 0x3e6273d1b350efc8 */ + 1.66128705555453270379e-07, /* 0x3e864c238c37b0c6 */ + 5.00499610023283006540e-08, /* 0x3e6aded07370a300 */ + 1.75105139941208062123e-07, /* 0x3e878091197eb47e */ + 7.70807146729030327334e-08, /* 0x3e74b0f245e0dabc */ + 2.45918607526895836121e-07, /* 0x3e9080d9794e2eaf */ + 2.18359020958626199345e-07, /* 0x3e8d4ec242b60c76 */ + 8.44342887976445333569e-09, /* 0x3e4221d2f940caa0 */ + 1.07506148687888629299e-07, /* 0x3e7cdbc42b2bba5c */ + 5.36544954316820904572e-08, /* 0x3e6cce37bb440840 */ + 3.39109101518396596341e-07, /* 0x3e96c1d999cf1dd0 */ + 2.60098720293920613340e-08, /* 0x3e5bed8a07eb0870 */ + 8.42678991664621455827e-08, /* 0x3e769ed88f490e3c */ + 5.36972237470183633197e-08, /* 0x3e6cd41719b73ef0 */ + 4.28192558171921681288e-07, /* 0x3e9cbc4ac95b41b7 */ + 2.71535491483955143294e-07, /* 0x3e9238f1b890f5d7 */ + 7.84094998145075780203e-08, /* 0x3e750c4282259cc4 */ + 3.43880599134117431863e-07, /* 0x3e9713d2de87b3e2 */ + 1.32878065060366481043e-07, /* 0x3e81d5a7d2255276 */ + 4.18046802627967629428e-07, /* 0x3e9c0dfd48227ac1 */ + 2.65042411765766019424e-07, /* 0x3e91c964dab76753 */ + 1.70383695347518643694e-07, /* 0x3e86de56d5704496 */ + 1.54096497259613515678e-07, /* 0x3e84aeb71fd19968 */ + 2.36543402412459813461e-07, /* 0x3e8fbf91c57b1918 */ + 4.38416350106876736790e-07, /* 0x3e9d6bef7fbe5d9a */ + 3.03892161339927775731e-07, /* 0x3e9464d3dc249066 */ + 3.31136771605664899240e-07, /* 0x3e9638e2ec4d9073 */ + 6.49494294526590682218e-08, /* 0x3e716f4a7247ea7c */ + 4.10423429887181345747e-09, /* 0x3e31a0a740f1d440 */ + 1.70831640869113847224e-07, /* 0x3e86edbb0114a33c */ + 1.10811512657909180966e-07, /* 0x3e7dbee8bf1d513c */ + 3.23677724749783611964e-07, /* 0x3e95b8bdb0248f73 */ + 3.55662734259192678528e-07, /* 0x3e97de3d3f5eac64 */ + 2.30102333489738219140e-07, /* 0x3e8ee24187ae448a */ + 4.47429004000738629714e-07, /* 0x3e9e06c591ec5192 */ + 7.78167135617329598659e-08, /* 0x3e74e3861a332738 */ + 9.90345291908535415737e-08, /* 0x3e7a9599dcc2bfe4 */ + 5.85800913143113728314e-08, /* 0x3e6f732fbad43468 */ + 4.57859062410871843857e-07, /* 0x3e9eb9f573b727d9 */ + 3.67993069723390929794e-07, /* 0x3e98b212a2eb9897 */ + 2.90836464322977276043e-07, /* 0x3e9384884c167215 */ + 2.51621574250131388318e-07, /* 0x3e90e2d363020051 */ + 2.75789824740652815545e-07, /* 0x3e92820879fbd022 */ + 3.88985776250314403593e-07, /* 0x3e9a1ab9893e4b30 */ + 1.40214080183768019611e-07, /* 0x3e82d1b817a24478 */ + 3.23451432223550478373e-08, /* 0x3e615d7b8ded4878 */ + 9.15979180730608444470e-08, /* 0x3e78968f9db3a5e4 */ + 3.44371402498640470421e-07, /* 0x3e971c4171fe135f */ + 3.40401897215059498077e-07, /* 0x3e96d80f605d0d8c */ + 1.06431813453707950243e-07, /* 0x3e7c91f043691590 */ + 1.46204238932338846248e-07, /* 0x3e839f8a15fce2b2 */ + 9.94610376972039046878e-09, /* 0x3e455beda9d94b80 */ + 2.01711528092681771039e-07, /* 0x3e8b12c15d60949a */ + 2.72027977986191568296e-07, /* 0x3e924167b312bfe3 */ + 2.48402602511693757964e-07, /* 0x3e90ab8633070277 */ + 1.58480011219249621715e-07, /* 0x3e854554ebbc80ee */ + 3.00372828113368713281e-08, /* 0x3e60204aef5a4bb8 */ + 3.67816204583541976394e-07, /* 
0x3e98af08c679cf2c */ + 2.46169793032343824291e-07, /* 0x3e90852a330ae6c8 */ + 1.70080468270204253247e-07, /* 0x3e86d3eb9ec32916 */ + 1.67806717763872914315e-07, /* 0x3e8685cb7fcbbafe */ + 2.67715622006907942620e-07, /* 0x3e91f751c1e0bd95 */ + 2.14411342550299170574e-08, /* 0x3e5705b1b0f72560 */ + 4.11228221283669073277e-07, /* 0x3e9b98d8d808ca92 */ + 3.52311752396749662260e-08, /* 0x3e62ea22c75cc980 */ + 3.52718000397367821054e-07, /* 0x3e97aba62bca0350 */ + 4.38857387992911129814e-07, /* 0x3e9d73833442278c */ + 3.22574606753482540743e-07, /* 0x3e95a5ca1fb18bf9 */ + 3.28730371182804296828e-08, /* 0x3e61a6092b6ecf28 */ + 7.56672470607639279700e-08, /* 0x3e744fd049aac104 */ + 3.26750155316369681821e-09, /* 0x3e2c114fd8df5180 */ + 3.21724445362095284743e-07, /* 0x3e95972f130feae5 */ + 1.06639427371776571151e-07, /* 0x3e7ca034a55fe198 */ + 3.41020788139524715063e-07, /* 0x3e96e2b149990227 */ + 1.00582838631232552824e-07, /* 0x3e7b00000294592c */ + 3.68439433859276640065e-07, /* 0x3e98b9bdc442620e */ + 2.20403078342388012027e-07, /* 0x3e8d94fdfabf3e4e */ + 1.62841467098298142534e-07, /* 0x3e85db30b145ad9a */ + 2.25325348296680733838e-07, /* 0x3e8e3e1eb95022b0 */ + 4.37462238226421614339e-07, /* 0x3e9d5b8b45442bd6 */ + 3.52055880555040706500e-07, /* 0x3e97a046231ecd2e */ + 4.75614398494781776825e-07, /* 0x3e9feafe3ef55232 */ + 3.60998399033215317516e-07, /* 0x3e9839e7bfd78267 */ + 3.79292434611513945954e-08, /* 0x3e645cf49d6fa900 */ + 1.29859015528549300061e-08, /* 0x3e4be3132b27f380 */ + 3.15927546985474913188e-07, /* 0x3e9533980bb84f9f */ + 2.28533679887379668031e-08, /* 0x3e5889e2ce3ba390 */ + 1.17222541823553133877e-07, /* 0x3e7f7778c3ad0cc8 */ + 1.51991208405464415857e-07, /* 0x3e846660cec4eba2 */ + 1.56958239325240655564e-07}; /* 0x3e85110b4611a626 */ + + /* Some constants and split constants. */ + + static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */ + piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */ + piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */ + three_piby4 = 2.3561944901923449e+00, /* 0x4002d97c7f3321d2 */ + pi_head = 3.1415926218032836e+00, /* 0x400921fb50000000 */ + pi_tail = 3.1786509547056392e-08, /* 0x3e6110b4611a6263 */ + piby2_head = 1.5707963267948965e+00, /* 0x3ff921fb54442d18 */ + piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */ + + double u, v, vbyu, q1, q2, s, u1, vu1, u2, vu2, uu, c, r; + unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf; + int m, xexp, yexp, diffexp; + + /* Find properties of arguments x and y. 
*/
+
+  unsigned long ux, ui, aux, xneg, uy, auy, yneg;
+
+  GET_BITS_DP64(x, ux);
+  GET_BITS_DP64(y, uy);
+  aux = ux & ~SIGNBIT_DP64;
+  auy = uy & ~SIGNBIT_DP64;
+  xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  xneg = ux & SIGNBIT_DP64;
+  yneg = uy & SIGNBIT_DP64;
+  xzero = (aux == 0);
+  yzero = (auy == 0);
+  xnan = (aux > PINFBITPATT_DP64);
+  ynan = (auy > PINFBITPATT_DP64);
+  xinf = (aux == PINFBITPATT_DP64);
+  yinf = (auy == PINFBITPATT_DP64);
+
+  diffexp = yexp - xexp;
+
+  /* Special cases */
+
+  if (xnan)
+    return _handle_error("atan2", OP_ATAN2, ux|0x0008000000000000, _DOMAIN, 0,
+                         EDOM, x, y, 2);
+  else if (ynan)
+    return _handle_error("atan2", OP_ATAN2, uy|0x0008000000000000, _DOMAIN, 0,
+                         EDOM, x, y, 2);
+  else if (yzero)
+    { /* Zero y gives +-0 for positive x
+         and +-pi for negative x */
+      if (xneg)
+        {
+          if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
+          else return val_with_flags(pi,AMD_F_INEXACT);
+        }
+      else return y;
+    }
+  else if (xzero)
+    { /* Zero x gives +- pi/2
+         depending on sign of y */
+      if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
+      else return val_with_flags(piby2,AMD_F_INEXACT);
+    }
+
+  /* Scale up both x and y if they are both below 1/4.
+     This avoids any possible later denormalised arithmetic. */
+
+  if ((xexp < 1021 && yexp < 1021))
+    {
+      scaleUpDouble1024(ux, &ux);
+      scaleUpDouble1024(uy, &uy);
+      PUT_BITS_DP64(ux, x);
+      PUT_BITS_DP64(uy, y);
+      xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+      yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+      diffexp = yexp - xexp;
+    }
+
+  if (diffexp > 56)
+    { /* abs(y)/abs(x) > 2^56 => arctan(x/y)
+         is insignificant compared to piby2 */
+      if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
+      else return val_with_flags(piby2,AMD_F_INEXACT);
+    }
+  else if (diffexp < -28 && (!xneg))
+    { /* x positive and dominant over y by a factor of 2^28.
+         In this case atan(y/x) is y/x to machine accuracy. */
+
+      if (diffexp < -1074) /* Result underflows */
+        {
+          if (yneg)
+            return val_with_flags(-0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
+          else
+            return val_with_flags(0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
+        }
+      else
+        {
+          if (diffexp < -1022)
+            {
+              /* Result will likely be denormalized */
+              y = scaleDouble_1(y, 100);
+              y /= x;
+              /* Now y is 2^100 times the true result. Scale it back down. */
+              GET_BITS_DP64(y, uy);
+              scaleDownDouble(uy, 100, &uy);
+              PUT_BITS_DP64(uy, y);
+              if ((uy & EXPBITS_DP64) == 0)
+                return val_with_flags(y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+              else
+                return y;
+            }
+          else
+            return y / x;
+        }
+    }
+  else if (diffexp < -56 && xneg)
+    { /* abs(x)/abs(y) > 2^56 and x < 0 => arctan(y/x)
+         is insignificant compared to pi */
+      if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
+      else return val_with_flags(pi,AMD_F_INEXACT);
+    }
+  else if (yinf && xinf)
+    { /* If abs(x) and abs(y) are both infinity
+         return +-pi/4 or +- 3pi/4 according to
+         signs. */
+      if (xneg)
+        {
+          if (yneg) return val_with_flags(-three_piby4,AMD_F_INEXACT);
+          else return val_with_flags(three_piby4,AMD_F_INEXACT);
+        }
+      else
+        {
+          if (yneg) return val_with_flags(-piby4,AMD_F_INEXACT);
+          else return val_with_flags(piby4,AMD_F_INEXACT);
+        }
+    }
+
+  /* General case: take absolute values of arguments */
+
+  u = x; v = y;
+  if (xneg) u = -x;
+  if (yneg) v = -y;
+
+  /* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
+
+  swap_vu = (u < v);
+  if (swap_vu) { uu = u; u = v; v = uu; }
+  vbyu = v/u;
+
+  if (vbyu > 0.0625)
+    { /* General values of v/u.
Use a look-up + table and series expansion. */ + + index = (int)(256*vbyu + 0.5); + q1 = atan_jby256_lead[index-16]; + q2 = atan_jby256_tail[index-16]; + c = index*1./256; + GET_BITS_DP64(u, ui); + m = (int)((ui & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + u = scaleDouble_2(u,-m); + v = scaleDouble_2(v,-m); + GET_BITS_DP64(u, ui); + PUT_BITS_DP64(0xfffffffff8000000 & ui, u1); /* 26 leading bits of u */ + u2 = u - u1; + + r = ((v-c*u1)-c*u2)/(u+c*v); + + /* Polynomial approximation to atan(r) */ + + s = r*r; + q2 = q2 + r - r*(s * (0.33333333333224095522 - s*(0.19999918038989143496))); + } + else if (vbyu < 1.e-8) + { /* v/u is small enough that atan(v/u) = v/u */ + q1 = 0.0; + q2 = vbyu; + } + else /* vbyu <= 0.0625 */ + { + /* Small values of v/u. Use a series expansion + computed carefully to minimise cancellation */ + + GET_BITS_DP64(u, ui); + PUT_BITS_DP64(0xffffffff00000000 & ui, u1); + GET_BITS_DP64(vbyu, ui); + PUT_BITS_DP64(0xffffffff00000000 & ui, vu1); + u2 = u - u1; + vu2 = vbyu - vu1; + + q1 = 0.0; + s = vbyu*vbyu; + q2 = vbyu + + ((((v - u1*vu1) - u2*vu1) - u*vu2)/u - + (vbyu*s*(0.33333333333333170500 - + s*(0.19999999999393223405 - + s*(0.14285713561807169030 - + s*(0.11110736283514525407 - + s*(0.90029810285449784439E-01))))))); + } + + /* Tidy-up according to which quadrant the arguments lie in */ + + if (swap_vu) {q1 = piby2_head - q1; q2 = piby2_tail - q2;} + if (xneg) {q1 = pi_head - q1; q2 = pi_tail - q2;} + q1 = q1 + q2; + + if (yneg) q1 = - q1; + + return q1; +} diff --git a/sdk/lib/crt/math/libm_sse2/atan2f.c b/sdk/lib/crt/math/libm_sse2/atan2f.c new file mode 100644 index 00000000000..42d54cda2d5 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/atan2f.c @@ -0,0 +1,469 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VALF_WITH_FLAGS +#define USE_NAN_WITH_FLAGS +#define USE_SCALEDOUBLE_1 +#define USE_SCALEDOWNDOUBLE +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_VALF_WITH_FLAGS +#undef USE_NAN_WITH_FLAGS +#undef USE_SCALEDOUBLE_1 +#undef USE_SCALEDOWNDOUBLE +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. 
+#pragma warning(disable:4163) +#pragma function(atan2f) + +float FN_PROTOTYPE(atan2f)(float fy, float fx) +{ + /* Array atan_jby256 contains precomputed values of atan(j/256), + for j = 16, 17, ..., 256. */ + + static const double atan_jby256[ 241] = { + 6.24188099959573430842e-02, /* 0x3faff55bb72cfde9 */ + 6.63088949198234745008e-02, /* 0x3fb0f99ea71d52a6 */ + 7.01969710718705064423e-02, /* 0x3fb1f86dbf082d58 */ + 7.40829225490337306415e-02, /* 0x3fb2f719318a4a9a */ + 7.79666338315423007588e-02, /* 0x3fb3f59f0e7c559d */ + 8.18479898030765457007e-02, /* 0x3fb4f3fd677292fb */ + 8.57268757707448092464e-02, /* 0x3fb5f2324fd2d7b2 */ + 8.96031774848717321724e-02, /* 0x3fb6f03bdcea4b0c */ + 9.34767811585894559112e-02, /* 0x3fb7ee182602f10e */ + 9.73475734872236708739e-02, /* 0x3fb8ebc54478fb28 */ + 1.01215441667466668485e-01, /* 0x3fb9e94153cfdcf1 */ + 1.05080273416329528224e-01, /* 0x3fbae68a71c722b8 */ + 1.08941956989865793015e-01, /* 0x3fbbe39ebe6f07c3 */ + 1.12800381201659388752e-01, /* 0x3fbce07c5c3cca32 */ + 1.16655435441069349478e-01, /* 0x3fbddd21701eba6e */ + 1.20507009691224548087e-01, /* 0x3fbed98c2190043a */ + 1.24354994546761424279e-01, /* 0x3fbfd5ba9aac2f6d */ + 1.28199281231298117811e-01, /* 0x3fc068d584212b3d */ + 1.32039761614638734288e-01, /* 0x3fc0e6adccf40881 */ + 1.35876328229701304195e-01, /* 0x3fc1646541060850 */ + 1.39708874289163620386e-01, /* 0x3fc1e1fafb043726 */ + 1.43537293701821222491e-01, /* 0x3fc25f6e171a535c */ + 1.47361481088651630200e-01, /* 0x3fc2dcbdb2fba1ff */ + 1.51181331798580037562e-01, /* 0x3fc359e8edeb99a3 */ + 1.54996741923940972718e-01, /* 0x3fc3d6eee8c6626c */ + 1.58807608315631065832e-01, /* 0x3fc453cec6092a9e */ + 1.62613828597948567589e-01, /* 0x3fc4d087a9da4f17 */ + 1.66415301183114927586e-01, /* 0x3fc54d18ba11570a */ + 1.70211925285474380276e-01, /* 0x3fc5c9811e3ec269 */ + 1.74003600935367680469e-01, /* 0x3fc645bfffb3aa73 */ + 1.77790228992676047071e-01, /* 0x3fc6c1d4898933d8 */ + 1.81571711160032150945e-01, /* 0x3fc73dbde8a7d201 */ + 1.85347949995694760705e-01, /* 0x3fc7b97b4bce5b02 */ + 1.89118848926083965578e-01, /* 0x3fc8350be398ebc7 */ + 1.92884312257974643856e-01, /* 0x3fc8b06ee2879c28 */ + 1.96644245190344985064e-01, /* 0x3fc92ba37d050271 */ + 2.00398553825878511514e-01, /* 0x3fc9a6a8e96c8626 */ + 2.04147145182116990236e-01, /* 0x3fca217e601081a5 */ + 2.07889927202262986272e-01, /* 0x3fca9c231b403279 */ + 2.11626808765629753628e-01, /* 0x3fcb1696574d780b */ + 2.15357699697738047551e-01, /* 0x3fcb90d7529260a2 */ + 2.19082510780057748701e-01, /* 0x3fcc0ae54d768466 */ + 2.22801153759394493514e-01, /* 0x3fcc84bf8a742e6d */ + 2.26513541356919617664e-01, /* 0x3fccfe654e1d5395 */ + 2.30219587276843717927e-01, /* 0x3fcd77d5df205736 */ + 2.33919206214733416127e-01, /* 0x3fcdf110864c9d9d */ + 2.37612313865471241892e-01, /* 0x3fce6a148e96ec4d */ + 2.41298826930858800743e-01, /* 0x3fcee2e1451d980c */ + 2.44978663126864143473e-01, /* 0x3fcf5b75f92c80dd */ + 2.48651741190513253521e-01, /* 0x3fcfd3d1fc40dbe4 */ + 2.52317980886427151166e-01, /* 0x3fd025fa510665b5 */ + 2.55977303013005474952e-01, /* 0x3fd061eea03d6290 */ + 2.59629629408257511791e-01, /* 0x3fd09dc597d86362 */ + 2.63274882955282396590e-01, /* 0x3fd0d97ee509acb3 */ + 2.66912987587400396539e-01, /* 0x3fd1151a362431c9 */ + 2.70543868292936529052e-01, /* 0x3fd150973a9ce546 */ + 2.74167451119658789338e-01, /* 0x3fd18bf5a30bf178 */ + 2.77783663178873208022e-01, /* 0x3fd1c735212dd883 */ + 2.81392432649178403370e-01, /* 0x3fd2025567e47c95 */ + 2.84993688779881237938e-01, /* 0x3fd23d562b381041 */ 
+ 2.88587361894077354396e-01, /* 0x3fd278372057ef45 */ + 2.92173383391398755471e-01, /* 0x3fd2b2f7fd9b5fe2 */ + 2.95751685750431536626e-01, /* 0x3fd2ed987a823cfe */ + 2.99322202530807379706e-01, /* 0x3fd328184fb58951 */ + 3.02884868374971361060e-01, /* 0x3fd362773707ebcb */ + 3.06439619009630070945e-01, /* 0x3fd39cb4eb76157b */ + 3.09986391246883430384e-01, /* 0x3fd3d6d129271134 */ + 3.13525122985043869228e-01, /* 0x3fd410cbad6c7d32 */ + 3.17055753209146973237e-01, /* 0x3fd44aa436c2af09 */ + 3.20578221991156986359e-01, /* 0x3fd4845a84d0c21b */ + 3.24092470489871664618e-01, /* 0x3fd4bdee586890e6 */ + 3.27598440950530811477e-01, /* 0x3fd4f75f73869978 */ + 3.31096076704132047386e-01, /* 0x3fd530ad9951cd49 */ + 3.34585322166458920545e-01, /* 0x3fd569d88e1b4cd7 */ + 3.38066122836825466713e-01, /* 0x3fd5a2e0175e0f4e */ + 3.41538425296541714449e-01, /* 0x3fd5dbc3fbbe768d */ + 3.45002177207105076295e-01, /* 0x3fd614840309cfe1 */ + 3.48457327308122011278e-01, /* 0x3fd64d1ff635c1c5 */ + 3.51903825414964732676e-01, /* 0x3fd685979f5fa6fd */ + 3.55341622416168290144e-01, /* 0x3fd6bdeac9cbd76c */ + 3.58770670270572189509e-01, /* 0x3fd6f61941e4def0 */ + 3.62190922004212156882e-01, /* 0x3fd72e22d53aa2a9 */ + 3.65602331706966821034e-01, /* 0x3fd7660752817501 */ + 3.69004854528964421068e-01, /* 0x3fd79dc6899118d1 */ + 3.72398446676754202311e-01, /* 0x3fd7d5604b63b3f7 */ + 3.75783065409248884237e-01, /* 0x3fd80cd46a14b1d0 */ + 3.79158669033441808605e-01, /* 0x3fd84422b8df95d7 */ + 3.82525216899905096124e-01, /* 0x3fd87b4b0c1ebedb */ + 3.85882669398073752109e-01, /* 0x3fd8b24d394a1b25 */ + 3.89230987951320717144e-01, /* 0x3fd8e92916f5cde8 */ + 3.92570135011828580396e-01, /* 0x3fd91fde7cd0c662 */ + 3.95900074055262896078e-01, /* 0x3fd9566d43a34907 */ + 3.99220769575252543149e-01, /* 0x3fd98cd5454d6b18 */ + 4.02532187077682512832e-01, /* 0x3fd9c3165cc58107 */ + 4.05834293074804064450e-01, /* 0x3fd9f93066168001 */ + 4.09127055079168300278e-01, /* 0x3fda2f233e5e530b */ + 4.12410441597387267265e-01, /* 0x3fda64eec3cc23fc */ + 4.15684422123729413467e-01, /* 0x3fda9a92d59e98cf */ + 4.18948967133552840902e-01, /* 0x3fdad00f5422058b */ + 4.22204048076583571270e-01, /* 0x3fdb056420ae9343 */ + 4.25449637370042266227e-01, /* 0x3fdb3a911da65c6c */ + 4.28685708391625730496e-01, /* 0x3fdb6f962e737efb */ + 4.31912235472348193799e-01, /* 0x3fdba473378624a5 */ + 4.35129193889246812521e-01, /* 0x3fdbd9281e528191 */ + 4.38336559857957774877e-01, /* 0x3fdc0db4c94ec9ef */ + 4.41534310525166673322e-01, /* 0x3fdc42191ff11eb6 */ + 4.44722423960939305942e-01, /* 0x3fdc76550aad71f8 */ + 4.47900879150937292206e-01, /* 0x3fdcaa6872f3631b */ + 4.51069655988523443568e-01, /* 0x3fdcde53432c1350 */ + 4.54228735266762495559e-01, /* 0x3fdd121566b7f2ad */ + 4.57378098670320809571e-01, /* 0x3fdd45aec9ec862b */ + 4.60517728767271039558e-01, /* 0x3fdd791f5a1226f4 */ + 4.63647609000806093515e-01, /* 0x3fddac670561bb4f */ + 4.66767723680866497560e-01, /* 0x3fdddf85bb026974 */ + 4.69878057975686880265e-01, /* 0x3fde127b6b0744af */ + 4.72978597903265574054e-01, /* 0x3fde4548066cf51a */ + 4.76069330322761219421e-01, /* 0x3fde77eb7f175a34 */ + 4.79150242925822533735e-01, /* 0x3fdeaa65c7cf28c4 */ + 4.82221324227853687105e-01, /* 0x3fdedcb6d43f8434 */ + 4.85282563559221225002e-01, /* 0x3fdf0ede98f393cf */ + 4.88333951056405479729e-01, /* 0x3fdf40dd0b541417 */ + 4.91375477653101910835e-01, /* 0x3fdf72b221a4e495 */ + 4.94407135071275316562e-01, /* 0x3fdfa45dd3029258 */ + 4.97428915812172245392e-01, /* 0x3fdfd5e0175fdf83 */ + 5.00440813147294050189e-01, /* 
0x3fe0039c73c1a40b */ + 5.03442821109336358099e-01, /* 0x3fe01c341e82422d */ + 5.06434934483096732549e-01, /* 0x3fe034b709250488 */ + 5.09417148796356245022e-01, /* 0x3fe04d25314342e5 */ + 5.12389460310737621107e-01, /* 0x3fe0657e94db30cf */ + 5.15351866012543347040e-01, /* 0x3fe07dc3324e9b38 */ + 5.18304363603577900044e-01, /* 0x3fe095f30861a58f */ + 5.21246951491958210312e-01, /* 0x3fe0ae0e1639866c */ + 5.24179628782913242802e-01, /* 0x3fe0c6145b5b43da */ + 5.27102395269579471204e-01, /* 0x3fe0de05d7aa6f7c */ + 5.30015251423793132268e-01, /* 0x3fe0f5e28b67e295 */ + 5.32918198386882147055e-01, /* 0x3fe10daa77307a0d */ + 5.35811237960463593311e-01, /* 0x3fe1255d9bfbd2a8 */ + 5.38694372597246617929e-01, /* 0x3fe13cfbfb1b056e */ + 5.41567605391844897333e-01, /* 0x3fe1548596376469 */ + 5.44430940071603086672e-01, /* 0x3fe16bfa6f5137e1 */ + 5.47284380987436924748e-01, /* 0x3fe1835a88be7c13 */ + 5.50127933104692989907e-01, /* 0x3fe19aa5e5299f99 */ + 5.52961601994028217888e-01, /* 0x3fe1b1dc87904284 */ + 5.55785393822313511514e-01, /* 0x3fe1c8fe7341f64f */ + 5.58599315343562330405e-01, /* 0x3fe1e00babdefeb3 */ + 5.61403373889889367732e-01, /* 0x3fe1f7043557138a */ + 5.64197577362497537656e-01, /* 0x3fe20de813e823b1 */ + 5.66981934222700489912e-01, /* 0x3fe224b74c1d192a */ + 5.69756453482978431069e-01, /* 0x3fe23b71e2cc9e6a */ + 5.72521144698072359525e-01, /* 0x3fe25217dd17e501 */ + 5.75276017956117824426e-01, /* 0x3fe268a940696da6 */ + 5.78021083869819540801e-01, /* 0x3fe27f261273d1b3 */ + 5.80756353567670302596e-01, /* 0x3fe2958e59308e30 */ + 5.83481838685214859730e-01, /* 0x3fe2abe21aded073 */ + 5.86197551356360535557e-01, /* 0x3fe2c2215e024465 */ + 5.88903504204738026395e-01, /* 0x3fe2d84c2961e48b */ + 5.91599710335111383941e-01, /* 0x3fe2ee628406cbca */ + 5.94286183324841177367e-01, /* 0x3fe30464753b090a */ + 5.96962937215401501234e-01, /* 0x3fe31a52048874be */ + 5.99629986503951384336e-01, /* 0x3fe3302b39b78856 */ + 6.02287346134964152178e-01, /* 0x3fe345f01cce37bb */ + 6.04935031491913965951e-01, /* 0x3fe35ba0b60eccce */ + 6.07573058389022313541e-01, /* 0x3fe3713d0df6c503 */ + 6.10201443063065118722e-01, /* 0x3fe386c52d3db11e */ + 6.12820202165241245673e-01, /* 0x3fe39c391cd41719 */ + 6.15429352753104952356e-01, /* 0x3fe3b198e5e2564a */ + 6.18028912282561737612e-01, /* 0x3fe3c6e491c78dc4 */ + 6.20618898599929469384e-01, /* 0x3fe3dc1c2a188504 */ + 6.23199329934065904268e-01, /* 0x3fe3f13fb89e96f4 */ + 6.25770224888563042498e-01, /* 0x3fe4064f47569f48 */ + 6.28331602434009650615e-01, /* 0x3fe41b4ae06fea41 */ + 6.30883481900321840818e-01, /* 0x3fe430328e4b26d5 */ + 6.33425882969144482537e-01, /* 0x3fe445065b795b55 */ + 6.35958825666321447834e-01, /* 0x3fe459c652badc7f */ + 6.38482330354437466191e-01, /* 0x3fe46e727efe4715 */ + 6.40996417725432032775e-01, /* 0x3fe4830aeb5f7bfd */ + 6.43501108793284370968e-01, /* 0x3fe4978fa3269ee1 */ + 6.45996424886771558604e-01, /* 0x3fe4ac00b1c71762 */ + 6.48482387642300484032e-01, /* 0x3fe4c05e22de94e4 */ + 6.50959018996812410762e-01, /* 0x3fe4d4a8023414e8 */ + 6.53426341180761927063e-01, /* 0x3fe4e8de5bb6ec04 */ + 6.55884376711170835605e-01, /* 0x3fe4fd013b7dd17e */ + 6.58333148384755983962e-01, /* 0x3fe51110adc5ed81 */ + 6.60772679271132590273e-01, /* 0x3fe5250cbef1e9fa */ + 6.63202992706093175102e-01, /* 0x3fe538f57b89061e */ + 6.65624112284960989250e-01, /* 0x3fe54ccaf0362c8f */ + 6.68036061856020157990e-01, /* 0x3fe5608d29c70c34 */ + 6.70438865514021320458e-01, /* 0x3fe5743c352b33b9 */ + 6.72832547593763097282e-01, /* 0x3fe587d81f732fba */ + 
6.75217132663749830535e-01, /* 0x3fe59b60f5cfab9d */ + 6.77592645519925151909e-01, /* 0x3fe5aed6c5909517 */ + 6.79959111179481823228e-01, /* 0x3fe5c2399c244260 */ + 6.82316554874748071313e-01, /* 0x3fe5d58987169b18 */ + 6.84665002047148862907e-01, /* 0x3fe5e8c6941043cf */ + 6.87004478341244895212e-01, /* 0x3fe5fbf0d0d5cc49 */ + 6.89335009598845749323e-01, /* 0x3fe60f084b46e05e */ + 6.91656621853199760075e-01, /* 0x3fe6220d115d7b8d */ + 6.93969341323259825138e-01, /* 0x3fe634ff312d1f3b */ + 6.96273194408023488045e-01, /* 0x3fe647deb8e20b8f */ + 6.98568207680949848637e-01, /* 0x3fe65aabb6c07b02 */ + 7.00854407884450081312e-01, /* 0x3fe66d663923e086 */ + 7.03131821924453670469e-01, /* 0x3fe6800e4e7e2857 */ + 7.05400476865049030906e-01, /* 0x3fe692a40556fb6a */ + 7.07660399923197958039e-01, /* 0x3fe6a5276c4b0575 */ + 7.09911618463524796141e-01, /* 0x3fe6b798920b3d98 */ + 7.12154159993178659249e-01, /* 0x3fe6c9f7855c3198 */ + 7.14388052156768926793e-01, /* 0x3fe6dc44551553ae */ + 7.16613322731374569052e-01, /* 0x3fe6ee7f10204aef */ + 7.18829999621624415873e-01, /* 0x3fe700a7c5784633 */ + 7.21038110854851588272e-01, /* 0x3fe712be84295198 */ + 7.23237684576317874097e-01, /* 0x3fe724c35b4fae7b */ + 7.25428749044510712274e-01, /* 0x3fe736b65a172dff */ + 7.27611332626510676214e-01, /* 0x3fe748978fba8e0f */ + 7.29785463793429123314e-01, /* 0x3fe75a670b82d8d8 */ + 7.31951171115916565668e-01, /* 0x3fe76c24dcc6c6c0 */ + 7.34108483259739652560e-01, /* 0x3fe77dd112ea22c7 */ + 7.36257428981428097003e-01, /* 0x3fe78f6bbd5d315e */ + 7.38398037123989547936e-01, /* 0x3fe7a0f4eb9c19a2 */ + 7.40530336612692630105e-01, /* 0x3fe7b26cad2e50fd */ + 7.42654356450917929600e-01, /* 0x3fe7c3d311a6092b */ + 7.44770125716075148681e-01, /* 0x3fe7d528289fa093 */ + 7.46877673555587429099e-01, /* 0x3fe7e66c01c114fd */ + 7.48977029182941400620e-01, /* 0x3fe7f79eacb97898 */ + 7.51068221873802288613e-01, /* 0x3fe808c03940694a */ + 7.53151280962194302759e-01, /* 0x3fe819d0b7158a4c */ + 7.55226235836744863583e-01, /* 0x3fe82ad036000005 */ + 7.57293115936992444759e-01, /* 0x3fe83bbec5cdee22 */ + 7.59351950749757920178e-01, /* 0x3fe84c9c7653f7ea */ + 7.61402769805578416573e-01, /* 0x3fe85d69576cc2c5 */ + 7.63445602675201784315e-01, /* 0x3fe86e2578f87ae5 */ + 7.65480478966144461950e-01, /* 0x3fe87ed0eadc5a2a */ + 7.67507428319308182552e-01, /* 0x3fe88f6bbd023118 */ + 7.69526480405658186434e-01, /* 0x3fe89ff5ff57f1f7 */ + 7.71537664922959498526e-01, /* 0x3fe8b06fc1cf3dfe */ + 7.73541011592573490852e-01, /* 0x3fe8c0d9145cf49d */ + 7.75536550156311621507e-01, /* 0x3fe8d13206f8c4ca */ + 7.77524310373347682379e-01, /* 0x3fe8e17aa99cc05d */ + 7.79504322017186335181e-01, /* 0x3fe8f1b30c44f167 */ + 7.81476614872688268854e-01, /* 0x3fe901db3eeef187 */ + 7.83441218733151756304e-01, /* 0x3fe911f35199833b */ + 7.85398163397448278999e-01}; /* 0x3fe921fb54442d18 */ + + /* Some constants. */ + + static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */ + piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */ + piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */ + three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */ + + double u, v, vbyu, q, s, uu, r; + unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf; + int xexp, yexp, diffexp; + + double x = fx; + double y = fy; + + /* Find properties of arguments x and y. 
*/
+
+  unsigned long ux, aux, xneg, uy, auy, yneg;
+
+  GET_BITS_DP64(x, ux);
+  GET_BITS_DP64(y, uy);
+  aux = ux & ~SIGNBIT_DP64;
+  auy = uy & ~SIGNBIT_DP64;
+  xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  xneg = ux & SIGNBIT_DP64;
+  yneg = uy & SIGNBIT_DP64;
+  xzero = (aux == 0);
+  yzero = (auy == 0);
+  xnan = (aux > PINFBITPATT_DP64);
+  ynan = (auy > PINFBITPATT_DP64);
+  xinf = (aux == PINFBITPATT_DP64);
+  yinf = (auy == PINFBITPATT_DP64);
+
+  diffexp = yexp - xexp;
+
+  /* Special cases */
+
+  if (xnan)
+    {
+      unsigned int ufx;
+      GET_BITS_SP32(fx, ufx);
+      return _handle_errorf("atan2f", OP_ATAN2, ufx|0x00400000, _DOMAIN, 0,
+                            EDOM, fx, fy, 2);
+    }
+  else if (ynan)
+    {
+      unsigned int ufy;
+      GET_BITS_SP32(fy, ufy);
+      return _handle_errorf("atan2f", OP_ATAN2, ufy|0x00400000, _DOMAIN, 0,
+                            EDOM, fx, fy, 2);
+    }
+  else if (yzero)
+    { /* Zero y gives +-0 for positive x
+         and +-pi for negative x */
+      if (xneg)
+        {
+          if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
+          else return valf_with_flags((float)pi, AMD_F_INEXACT);
+        }
+      else return (float)y;
+    }
+  else if (xzero)
+    { /* Zero x gives +- pi/2
+         depending on sign of y */
+      if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
+      else return valf_with_flags((float)piby2, AMD_F_INEXACT);
+    }
+
+  if (diffexp > 26)
+    { /* abs(y)/abs(x) > 2^26 => arctan(x/y)
+         is insignificant compared to piby2 */
+      if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
+      else return valf_with_flags((float)piby2, AMD_F_INEXACT);
+    }
+  else if (diffexp < -13 && (!xneg))
+    { /* x positive and dominant over y by a factor of 2^13.
+         In this case atan(y/x) is y/x to machine accuracy. */
+
+      if (diffexp < -150) /* Result underflows */
+        {
+          if (yneg)
+            return valf_with_flags(-0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+          else
+            return valf_with_flags(0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+        }
+      else
+        {
+          if (diffexp < -126)
+            {
+              /* Result will likely be denormalized */
+              y = scaleDouble_1(y, 100);
+              y /= x;
+              /* Now y is 2^100 times the true result. Scale it back down. */
+              GET_BITS_DP64(y, uy);
+              scaleDownDouble(uy, 100, &uy);
+              PUT_BITS_DP64(uy, y);
+              if ((uy & EXPBITS_DP64) == 0)
+                return valf_with_flags((float)y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+              else
+                return (float)y;
+            }
+          else
+            return (float)(y / x);
+        }
+    }
+  else if (diffexp < -26 && xneg)
+    { /* abs(x)/abs(y) > 2^26 and x < 0 => arctan(y/x)
+         is insignificant compared to pi */
+      if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
+      else return valf_with_flags((float)pi, AMD_F_INEXACT);
+    }
+  else if (yinf && xinf)
+    { /* If abs(x) and abs(y) are both infinity
+         return +-pi/4 or +- 3pi/4 according to
+         signs. */
+      if (xneg)
+        {
+          if (yneg) return valf_with_flags((float)-three_piby4, AMD_F_INEXACT);
+          else return valf_with_flags((float)three_piby4, AMD_F_INEXACT);
+        }
+      else
+        {
+          if (yneg) return valf_with_flags((float)-piby4, AMD_F_INEXACT);
+          else return valf_with_flags((float)piby4, AMD_F_INEXACT);
+        }
+    }
+
+  /* General case: take absolute values of arguments */
+
+  u = x; v = y;
+  if (xneg) u = -x;
+  if (yneg) v = -y;
+
+  /* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
+
+  swap_vu = (u < v);
+  if (swap_vu) { uu = u; u = v; v = uu; }
+  vbyu = v/u;
+
+  if (vbyu > 0.0625)
+    { /* General values of v/u. Use a look-up
+         table and series expansion.
*/ + + index = (int)(256*vbyu + 0.5); + r = (256*v-index*u)/(256*u+index*v); + + /* Polynomial approximation to atan(vbyu) */ + + s = r*r; + q = atan_jby256[index-16] + r - r*s*0.33333333333224095522; + } + else if (vbyu < 1.e-4) + { /* v/u is small enough that atan(v/u) = v/u */ + q = vbyu; + } + else /* vbyu <= 0.0625 */ + { + /* Small values of v/u. Use a series expansion */ + + s = vbyu*vbyu; + q = vbyu - + vbyu*s*(0.33333333333333170500 - + s*(0.19999999999393223405 - + s*0.14285713561807169030)); + } + + /* Tidy-up according to which quadrant the arguments lie in */ + + if (swap_vu) {q = piby2 - q;} + if (xneg) {q = pi - q;} + if (yneg) q = - q; + return (float)q; +} diff --git a/sdk/lib/crt/math/libm_sse2/atanf.c b/sdk/lib/crt/math/libm_sse2/atanf.c new file mode 100644 index 00000000000..08c4eb7ff43 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/atanf.c @@ -0,0 +1,135 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_VALF_WITH_FLAGS +#define USE_NAN_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_VALF_WITH_FLAGS +#undef USE_NAN_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. +#pragma warning(disable:4163) +#pragma function(atanf) + +float FN_PROTOTYPE(atanf)(float fx) +{ + + /* Some constants and split constants. */ + + static double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */ + + double c, v, s, q, z; + unsigned int xnan; + + double x = fx; + + /* Find properties of argument fx. */ + + unsigned long ux, aux, xneg; + + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + xneg = ux & SIGNBIT_DP64; + + v = x; + if (xneg) v = -x; + + /* Argument reduction to range [-7/16,7/16] */ + + if (aux < 0x3fdc000000000000) /* v < 7./16. */ + { + x = v; + c = 0.0; + } + else if (aux < 0x3fe6000000000000) /* v < 11./16. */ + { + x = (2.0*v-1.0)/(2.0+v); + /* c = arctan(0.5) */ + c = 4.63647609000806093515e-01; /* 0x3fddac670561bb4f */ + } + else if (aux < 0x3ff3000000000000) /* v < 19./16. */ + { + x = (v-1.0)/(1.0+v); + /* c = arctan(1.) */ + c = 7.85398163397448278999e-01; /* 0x3fe921fb54442d18 */ + } + else if (aux < 0x4003800000000000) /* v < 39./16. 
*/ + { + x = (v-1.5)/(1.0+1.5*v); + /* c = arctan(1.5) */ + c = 9.82793723247329054082e-01; /* 0x3fef730bd281f69b */ + } + else + { + + xnan = (aux > PINFBITPATT_DP64); + + if (xnan) + { + /* x is NaN */ + unsigned int uhx; + GET_BITS_SP32(fx, uhx); + return _handle_errorf("atanf", OP_ATAN, uhx|0x00400000, _DOMAIN, + 0, EDOM, fx, 0.0F, 1); + } + else if (v > 0x4c80000000000000) + { /* abs(x) > 2^26 => arctan(1/x) is + insignificant compared to piby2 */ + if (xneg) + return valf_with_flags((float)-piby2, AMD_F_INEXACT); + else + return valf_with_flags((float)piby2, AMD_F_INEXACT); + } + + x = -1.0/v; + /* c = arctan(infinity) */ + c = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ + } + + /* Core approximation: Remez(2,2) on [-7/16,7/16] */ + + s = x*x; + q = x*s* + (0.296528598819239217902158651186e0 + + (0.192324546402108583211697690500e0 + + 0.470677934286149214138357545549e-2*s)*s)/ + (0.889585796862432286486651434570e0 + + (0.111072499995399550138837673349e1 + + 0.299309699959659728404442796915e0*s)*s); + + z = c - (q - x); + + if (xneg) z = -z; + return (float)z; +} diff --git a/sdk/lib/crt/math/libm_sse2/cabs.c b/sdk/lib/crt/math/libm_sse2/cabs.c new file mode 100644 index 00000000000..fa1b22a9b62 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/cabs.c @@ -0,0 +1,34 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" + +double __cdecl _cabs(COMPLEX z) +{ + /* Returns the absolute value of a complex number z + with real part a and complex part b. */ +return _hypot(z.x, z.y); +} diff --git a/sdk/lib/crt/math/libm_sse2/cabsf.c b/sdk/lib/crt/math/libm_sse2/cabsf.c new file mode 100644 index 00000000000..c9235ce97bf --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/cabsf.c @@ -0,0 +1,35 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" + +float _hypotf(float,float); +float _cabsf(COMPLEX z) +{ + /* Returns the absolute value of a complex number z + with real part a and complex part b. */ +return _hypotf((float)z.x, (float)z.y); +} diff --git a/sdk/lib/crt/math/libm_sse2/ceil.c b/sdk/lib/crt/math/libm_sse2/ceil.c new file mode 100644 index 00000000000..cb0f155e1d8 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/ceil.c @@ -0,0 +1,88 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#include "libm_errno.h" +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_HANDLE_ERROR + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. 
+#pragma warning(disable:4163) +#pragma function(ceil) + +double FN_PROTOTYPE(ceil)(double x) +{ + double r; + long rexp, xneg; + unsigned long ux, ax, ur, mask; + + GET_BITS_DP64(x, ux); + ax = ux & (~SIGNBIT_DP64); + xneg = (ux != ax); + + if (ax >= 0x4340000000000000) + { + /* abs(x) is either NaN, infinity, or >= 2^53 */ + if (ax > 0x7ff0000000000000) + /* x is NaN */ + return _handle_error("ceil", OP_CEIL, ux|0x0008000000000000, _DOMAIN, 0, + EDOM, x, 0.0, 1); + else + return x; + } + else if (ax < 0x3ff0000000000000) /* abs(x) < 1.0 */ + { + if (ax == 0x0000000000000000) + /* x is +zero or -zero; return the same zero */ + return x; + else if (xneg) /* x < 0.0 */ + { + PUT_BITS_DP64(SIGNBIT_DP64, r); /* return -0.0 */ + return r; + } + else + return 1.0; + } + else + { + rexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + /* Mask out the bits of r that we don't want */ + mask = 1; + mask = (mask << (EXPSHIFTBITS_DP64 - rexp)) - 1; + ur = (ux & ~mask); + PUT_BITS_DP64(ur, r); + if (xneg || (ur == ux)) + return r; + else + /* We threw some bits away and x was positive */ + return r + 1.0; + } + +} diff --git a/sdk/lib/crt/math/libm_sse2/ceilf.c b/sdk/lib/crt/math/libm_sse2/ceilf.c new file mode 100644 index 00000000000..22f85b9777b --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/ceilf.c @@ -0,0 +1,86 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#include "libm_errno.h" +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_HANDLE_ERRORF + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. 
+#pragma warning(disable:4163) +#pragma function(ceilf) + +float FN_PROTOTYPE(ceilf)(float x) +{ + float r; + int rexp, xneg; + unsigned int ux, ax, ur, mask; + + GET_BITS_SP32(x, ux); + ax = ux & (~SIGNBIT_SP32); + xneg = (ux != ax); + + if (ax >= 0x4b800000) + { + /* abs(x) is either NaN, infinity, or >= 2^24 */ + if (ax > 0x7f800000) + /* x is NaN */ + return _handle_errorf("ceilf", OP_CEIL, ux, _DOMAIN, 0, EDOM, x, + 0.0F, 1); + else + return x; + } + else if (ax < 0x3f800000) /* abs(x) < 1.0 */ + { + if (ax == 0x00000000) + /* x is +zero or -zero; return the same zero */ + return x; + else if (xneg) /* x < 0.0 */ + { + PUT_BITS_SP32(SIGNBIT_SP32, r); /* return -0.0 */ + return r; + } + else + return 1.0F; + } + else + { + rexp = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + /* Mask out the bits of r that we don't want */ + mask = (1 << (EXPSHIFTBITS_SP32 - rexp)) - 1; + ur = (ux & ~mask); + PUT_BITS_SP32(ur, r); + + if (xneg || (ux == ur)) return r; + else + /* We threw some bits away and x was positive */ + return r + 1.0F; + } +} diff --git a/sdk/lib/crt/math/libm_sse2/cos.asm b/sdk/lib/crt/math/libm_sse2/cos.asm new file mode 100644 index 00000000000..850b8f1a34d --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/cos.asm @@ -0,0 +1,533 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; +; An implementation of the cos function. +; +; Prototype: +; +; double cos(double x); +; +; Computes cos(x). +; It will provide proper C99 return values, +; but may not raise floating point status bits properly. +; Based on the NAG C implementation. +; +; If FMA3 hardware is available, an FMA3 implementation of cos will be used. 
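+;
+; Rough outline of the SSE2 path below (a sketch using the names that appear
+; in the comments further down): for |x| >= pi/4 the argument is reduced in
+; Cody-Waite style,
+;     npi2  = (int)(|x| * 2/pi + 0.5)      (nearest multiple of pi/2)
+;     rhead = |x| - npi2 * piby2_1         (piby2_1 = leading bits of pi/2)
+;     rtail = npi2 * piby2_1tail           (correction from the low bits)
+;     r     = rhead - rtail                (|r| <= pi/4)
+; and cos(x) is then cos(r), -sin(r), -cos(r) or sin(r) according to
+; npi2 mod 4, evaluated with the polynomials in __Lcosarray/__Lsinarray.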
+
+.const
+ALIGN 16
+L_real_piby2_1      DQ 03ff921fb54400000h     ; piby2_1
+                    DQ 0
+L_real_piby2_1tail  DQ 03dd0b4611a626331h     ; piby2_1tail
+                    DQ 0
+L_real_piby2_2      DQ 03dd0b4611a600000h     ; piby2_2
+                    DQ 0
+L_real_piby2_2tail  DQ 03ba3198a2e037073h     ; piby2_2tail
+                    DQ 0
+
+ALIGN 16
+L_one               DQ 03FF0000000000000h, 03FF0000000000000h
+L_signbit           DQ 08000000000000000h, 00000000000000000h
+L_int_one           DQ 00000000000000001h, 00000000000000000h
+L_int_two           DQ 00000000000000002h, 00000000000000000h
+
+L_2_by_pi           DQ 03fe45f306dc9c883h     ; 2/pi
+L_one_half          DQ 03FE0000000000000h     ; .5
+L_neg_one_half      DQ 0bfe0000000000000h     ; - 0.5
+L_two_to_neg_27     DQ 03e40000000000000h     ; 2^-27
+L_two_to_neg_13     DQ 03f20000000000000h     ; 2^-13
+L_piby4             DQ 03FE921FB54442D18h     ; pi/4
+L_small_arg_cw      DQ 0411E848000000000h     ; 5.e5, appropriate for CW
+L_small_arg_bdl     DQ 0417312D000000000h     ; 2e7, works for BDL
+L_sign_mask         DQ 07FFFFFFFFFFFFFFFh
+
+L__inf_mask_64      DQ 07FF0000000000000h     ; +Inf
+
+
+EXTRN __Lcosarray:QWORD
+EXTRN __Lsinarray:QWORD
+EXTRN __use_fma3_lib:DWORD
+
+; local storage offsets
+p_temp       EQU 020h                         ; temporary for get/put bits operation
+p_temp1      EQU 030h                         ; temporary for get/put bits operation
+dummy_space  EQU 040h
+stack_size   EQU 068h
+
+include fm.inc
+
+fname         TEXTEQU <cos>
+fname_special TEXTEQU <_cos_special>
+
+;Define name and any external functions being called
+EXTERN __remainder_piby2_forAsm    : PROC
+EXTERN __remainder_piby2_fma3      : PROC
+EXTERN __remainder_piby2_fma3_bdl  : PROC
+EXTERN fname_special               : PROC
+
+.code
+
+PUBLIC fname
+fname PROC FRAME
+    StackAllocate stack_size
+    .ENDPROLOG
+
+    cmp     DWORD PTR __use_fma3_lib, 0
+    jne     L_cos_fma3
+
+Lcos_sse2:
+    movd    rdx, xmm0
+    xorpd   xmm2, xmm2            ; zeroed out for later use
+
+    mov     r10, rdx
+    btr     r10, 63               ; r10 <-- |x|
+    cmp     r10, L_piby4
+    jb      Lcos_sse2_absx_lt_piby4
+
+Lcos_absx_nlt_piby4:              ; common case
+
+; Here rdx has x, r10 has |x|
+    movd    xmm0, r10             ; xmm0 <-- |x|
+
+    cmp     r10, QWORD PTR L_small_arg_cw
+    jae     Lcos_reduce_precise   ; Note NaN/Inf will branch
+
+; At this point we have |x| < L_small_arg_cw, which is currently 500000.
+; Note that if |x| were too large, conversion of npi2 to integer would fail.
+; We reduce the argument to be in a range from -pi/4 to +pi/4
+; by subtracting multiples of pi/2
+    movapd  xmm2, xmm0
+    mulsd   xmm2, L_2_by_pi
+    movapd  xmm4, xmm0
+
+; xexp = ax >> EXPSHIFTBITS_DP64;
+    mov     r9, r10
+    shr     r9, 52                ; >>EXPSHIFTBITS_DP64
+
+; How many pi/2 is |x| a multiple of?
+; npi2 = (int)(x * twobypi + 0.5);
+    addsd   xmm2, L_one_half      ; npi2
+
+    movsd   xmm3, L_real_piby2_1
+    cvttpd2dq xmm0, xmm2          ; convert npi2 to integer
+    movsd   xmm1, L_real_piby2_1tail
+    cvtdq2pd xmm2, xmm0           ; and back to double.
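+; (The reduction below relies on L_real_piby2_1 carrying only the leading
+;  bits of pi/2 -- its mantissa ends in a run of zeros -- so for every npi2
+;  reachable here the product npi2 * piby2_1 is exact and the subtraction
+;  from |x| loses no information.)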
+ +; Subtract the multiple from x to get an extra-precision remainder +; rhead = x - npi2 * piby2_1; + mulsd xmm3, xmm2 + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_1tail; + mulsd xmm1, xmm2 ; rtail + movd eax, xmm0 ; eax <-- npi2 + +; GET_BITS_DP64(rhead-rtail, uy); +; originally only rhead + movapd xmm0, xmm4 + subsd xmm0, xmm1 + + movsd xmm3, L_real_piby2_2 + movd rcx, xmm0 ; rcx <-- rhead - rtail + movsd xmm5, L_real_piby2_2tail ; piby2_2tail + +; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, +; xmm4=rhead xmm5= temp for calc +; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); +; expdiff measures how close rhead - rtail is to |x| +; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer) + shl rcx, 1 ; strip any sign bit + shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 + sub r9, rcx ; expdiff + +;; if (expdiff > 15) + cmp r9, 15 + jle Lcos_sse2_cw_reduction_done + +; Here the remainder is pretty small compared with x, which +; implies that x is a near multiple of pi/2 +; (x matches the multiple to at least 15 bits) +; So we do another stage of argument reduction. + +; t = rhead; + movapd xmm1, xmm4 + +; rtail = npi2 * piby2_2; + mulsd xmm3, xmm2 + +; rhead = t - rtail; + mulsd xmm5, xmm2 ; npi2 * piby2_2tail + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); + subsd xmm1, xmm4 ; t - rhead + subsd xmm1, xmm3 ; -rtail + subsd xmm5, xmm1 ; rtail + +; r = rhead - rtail; + movapd xmm0, xmm4 + +;HARSHA +;xmm1=rtail + movapd xmm1, xmm5 ; xmm1 <-- copy of rtail + subsd xmm0, xmm5 + +; xmm0=r, xmm4=rhead, xmm1=rtail +Lcos_sse2_cw_reduction_done: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; if the input was close to a pi/2 multiple +; The original NAG code missed this trick. +; If the input is very close to n*pi/2 after reduction, so r < 2^-27, +; then the cos is either ~ 1.0 or ~r, to within 53 bits. + +; NOTE: Unfortunately, this introduces two jcc instructions close to each +; other and to other branches. As r < 2^-13 should be rather uncommon, +; the problems for branch prediction outweigh the computational savings. 
- WAT +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region = npi2 & 3; + subsd xmm4, xmm0 ; rhead-r + subsd xmm4, xmm1 ; rr = (rhead-r) - rtail + +Lcos_piby4: +; perform taylor series to calc sinx or cosx +; x2 = r * r; + +;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the cos path +;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path + movapd xmm3, xmm0 + movapd xmm2, xmm0 + mulsd xmm2, xmm0 ;x2 + + bt eax,0 + jnc Lcos_sse2_calc_cos + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region 1 or 3 do a sin calculation + movsd xmm3, __Lsinarray+50h ; s6 + mulsd xmm3, xmm2 ; x2s6 + movsd xmm5, __Lsinarray+20h ; s3 + movsd QWORD PTR p_temp[rsp], xmm4 ; store xx + movapd xmm1, xmm2 ; move for x4 + mulsd xmm1, xmm2 ; x4 + movsd QWORD PTR p_temp1[rsp], xmm0 ; store x + mulsd xmm5, xmm2 ; x2s3 + movapd xmm4, xmm0 ; move for x3 + addsd xmm3, __Lsinarray+40h ; s5+x2s6 + mulsd xmm1, xmm2 ; x6 + mulsd xmm3, xmm2 ; x2(s5+x2s6) + mulsd xmm4, xmm2 ; x3 + addsd xmm5, __Lsinarray+10h ; s2+x2s3 + mulsd xmm5, xmm2 ; x2(s2+x2s3) + addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) + mulsd xmm2, L_one_half ; 0.5 *x2 + movsd xmm0, QWORD PTR p_temp[rsp] ; load xx + mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) + addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) + mulsd xmm2, xmm0 ; 0.5 * x2 *xx + addsd xmm3, xmm5 ; zs + mulsd xmm4, xmm3 ; *x3 + subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx + addsd xmm0, xmm4 ; +xx + addsd xmm0, QWORD PTR p_temp1[rsp] ; +x + + jmp Lcos_sse2_adjust_region + +ALIGN 16 +Lcos_sse2_calc_cos: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region 0 or 2 - do a cos calculation +; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6)))); + mulsd xmm4, xmm0 ; x*xx + movsd xmm5, L_one_half + movsd xmm1, __Lcosarray+50h ; c6 + movsd xmm0, __Lcosarray+20h ; c3 + mulsd xmm5, xmm2 ; r = 0.5 *x2 + movapd xmm3, xmm2 ; copy of x2 + movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx + mulsd xmm1, xmm2 ; c6*x2 + mulsd xmm0, xmm2 ; c3*x2 + subsd xmm5, L_one ; -t=r-1.0, trash r + mulsd xmm3, xmm2 ; x4 + addsd xmm1, __Lcosarray+40h ; c5+x2c6 + addsd xmm0, __Lcosarray+10h ; c2+x2C3 + addsd xmm5, L_one ; 1 + (-t), trash t + mulsd xmm3, xmm2 ; x6 + mulsd xmm1, xmm2 ; x2(c5+x2c6) + mulsd xmm0, xmm2 ; x2(c2+x2C3) + movapd xmm4, xmm2 ; copy of x2 + mulsd xmm4, L_one_half ; r recalculate + addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6) + addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3) + mulsd xmm2, xmm2 ; x4 recalculate + subsd xmm5, xmm4 ; (1 + (-t)) - r + mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6)) + addsd xmm0, xmm1 ; zc + subsd xmm4, L_one ; t relaculate + subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx + mulsd xmm0, xmm2 ; x4 * zc + addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx) + subsd xmm0, xmm4 ; result - (-t) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +Lcos_sse2_adjust_region: +; switch (region) + add eax, 1 + and eax, 2 + jz Lcos_sse2_cleanup + +;; if the original region 1 or 2 then we negate the result. 
+ movapd xmm2, xmm0 + xorpd xmm0, xmm0 + subsd xmm0, xmm2 + +ALIGN 16 +Lcos_sse2_cleanup: + StackDeallocate stack_size + ret + + + + + + +ALIGN 16 +Lcos_sse2_absx_lt_piby4: +; cos = cos_piby4(x, 0.0); + +; x2 = r * r; + cmp r10, L_two_to_neg_13 + jb Lcos_sse2_x_small + movapd xmm2, xmm0 + mulsd xmm2, xmm0 ; x2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region 0 - do a cos calculation +; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6)))); + movsd xmm1, __Lcosarray+10h ; c2 + movapd xmm4, xmm2 ; move for x4 + mulsd xmm4, xmm2 ; x4 + movsd xmm3, __Lcosarray+30h ; c4 + mulsd xmm1, xmm2 ; c2x2 + movsd xmm5, __Lcosarray+50h ; c6 + mulsd xmm3, xmm2 ; c4x2 + movapd xmm0, xmm4 ; move for x8 + mulsd xmm5, xmm2 ; c6x2 + mulsd xmm0, xmm4 ; x8 + addsd xmm1, __Lcosarray ; c1 + c2x2 + mulsd xmm1, xmm4 ; c1x4 + c2x6 + addsd xmm3, __Lcosarray+20h ; c3 + c4x2 + mulsd xmm2, L_neg_one_half ; -0.5x2, destroy xmm2 + addsd xmm5, __Lcosarray+40h ; c5 + c6x2 + mulsd xmm3, xmm0 ; c3x8 + c4x10 + mulsd xmm4, xmm0 ; x12 + mulsd xmm4, xmm5 ; c5x12 + c6x14 + + movsd xmm0, L_one + addsd xmm1, xmm3 ; c1x4 + c2x6 + c3x8 + c4x10 + movapd xmm3, xmm2 ; preserve -0.5x2 + addsd xmm2, xmm0 ; t = 1 - 0.5x2 + subsd xmm0, xmm2 ; 1-t + addsd xmm0, xmm3 ; (1-t) - r + addsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14 + addsd xmm0, xmm1 ; (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14 + addsd xmm0, xmm2 ; 1 - 0.5x2 + above + + StackDeallocate stack_size + ret + +ALIGN 16 +Lcos_sse2_x_small: + movsd xmm2, xmm0 + movsd xmm0, L_one + cmp r10, L_two_to_neg_27 + jb Lcos_sse2_x_smaller + mulsd xmm2, xmm2 + mulsd xmm2, L_one_half + subsd xmm0, xmm2 + StackDeallocate stack_size + ret + +ALIGN 16 +Lcos_sse2_x_smaller: + movsd xmm0, L_one + addsd xmm0, L_int_one ; really adding smallest subnormal; set inexact + StackDeallocate stack_size + ret + +ALIGN 16 +Lcos_reduce_precise: +; Reduce x into range [-pi/4, pi/4] + cmp r10, L__inf_mask_64 + jae Lcos_x_naninf + call __remainder_piby2_forAsm + + ; At this point xmm0 has r, xmm1 has rr, rax has region + + movapd xmm4, xmm1 ; xmm4 <-- rr + jmp Lcos_piby4 + +; xmm0 = x, xmm4 = xx, eax= region + + +ALIGN 16 +Lcos_x_naninf: + call fname_special + StackDeallocate stack_size + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; From this point we assume that FMA3 and AVX hardware are present. 
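Both the SSE2 path above and the FMA3 path that follows share the same overall shape: reduce the argument, pick a sin or cos polynomial by quadrant, then fix the sign. A rough, self-contained C sketch of that shape is below; cos_from_reduced is a hypothetical name, the polynomials are crude Taylor stand-ins for the __Lcosarray/__Lsinarray kernels, and the extra-precision tail rr produced by the real reduction is ignored here.

    #include <math.h>
    #include <stdio.h>

    /* Sketch of the quadrant and sign logic used by both code paths.
       r is the reduced argument (|r| <= pi/4), region = npi2 & 3. */
    static double cos_from_reduced(double r, int region)
    {
        double r2 = r * r;
        double v;

        if (region & 1)                       /* quadrants 1 and 3: evaluate sin(r) */
            v = r * (1.0 - r2 / 6.0 + r2 * r2 / 120.0);
        else                                  /* quadrants 0 and 2: evaluate cos(r) */
            v = 1.0 - r2 / 2.0 + r2 * r2 / 24.0;

        if ((region + 1) & 2)                 /* cos picks up a minus sign in quadrants 1 and 2; */
            v = -v;                           /* this is the add eax,1 / and eax,2 test          */
        return v;
    }

    int main(void)
    {
        double r = 2.0 - 1.5707963267948966;  /* x = 2.0 reduces to region 1 */
        printf("%f vs %f\n", cos_from_reduced(r, 1), cos(2.0));
        return 0;
    }

The real kernels use longer polynomials and fold the reduction tail back into the result, which is what the p_temp spills in the SSE2 path are for.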
+ +ALIGN 16 +L_cos_fma3: + vmovq r9,xmm0 + mov rax,r9 + and r9,L_sign_mask ; clear sign + +Lcos_early_exit_s_1: ;; unused label + cmp r9,L_piby4 + jg Lcos_early_exit_s ; Note that NaN will branch + cmp r9,L_two_to_neg_13 + jge Lcompute_cos_pyby_4 + cmp r9,L_two_to_neg_27 + jge Lcompute_1_xx_5 + vmovq xmm0,L_one ; for tiniest args, cos is 1 + jmp Lreturn_no_restore + +Lcompute_1_xx_5: + vmulsd xmm1,xmm0,L_one_half ; xmm1l <-- .5*x + vfnmadd213sd xmm0,xmm1,L_one ; xmm0l <-- 1.0 - (.5*x)*x + jmp Lreturn_no_restore + +Lcompute_cos_pyby_4: + ; make sure this is accurate enough + ; note that x^2 can't be all that close to 1 here + vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- xx = x*x + vmovapd xmm0,__Lcosarray+050h ; xmm0 <-- c5 + vfmadd213sd xmm0,xmm3,__Lcosarray+040h ; xmm0 <-- c5*xx + c4 + vfmadd213sd xmm0,xmm3,__Lcosarray+030h ; xmm0 <-- (c5*xx + c4)*xx + c3 + vfmadd213sd xmm0,xmm3,__Lcosarray+020h + vfmadd213sd xmm0,xmm3,__Lcosarray+010h + vfmadd213sd xmm0,xmm3,__Lcosarray + vfmsub213sd xmm0,xmm3,L_one_half + vfmadd213sd xmm0,xmm3,L_one + + jmp Lreturn_no_restore + +Lcos_early_exit_s: + mov r8,L__inf_mask_64 + and rax,r8 + cmp rax, r8 + jz Lcos_x_naninf + +Lrange_reduce: + vmovq xmm0,r9 ; r9 <-- |x| + cmp r9,L_small_arg_bdl + jae Lcos_remainder_piby2 + + ; For __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl + ; on input + ; x is in xmm0 + ; on output + ; r is in xmm0 + ; rr is in xmm1 + ; region is in rax + + ; Boldo-Daumas-Li reduction for reasonably small |x| + call __remainder_piby2_fma3_bdl + +;; if region is 0 or 2 do a cos calc. +;; if region is 1 or 3 do a sin calc. +Lcos_exit_s: + bt rax,0 + jc Lsin_piby4_compute + +Lcos_piby4_compute: ;; unused label + ; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4] + vmovapd xmm2,L_one + vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x * x + vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r + vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. 
- x*x*.5 + vsubsd xmm2,xmm2,xmm4 ; 1-t + vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r + vmovapd xmm5,__Lcosarray+040h + vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2 + vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1 + vfmadd231sd xmm5,xmm3,__Lcosarray+050h + vfmadd213sd xmm5,xmm3,__Lcosarray+030h + vfmadd213sd xmm5,xmm3,__Lcosarray+020h + vfmadd213sd xmm5,xmm3,__Lcosarray+010h + vfmadd213sd xmm5,xmm3,__Lcosarray + vfmadd213sd xmm5,xmm1,xmm2 + vaddsd xmm0,xmm5,xmm4 + + jmp Lcos_exit_s_1 + +ALIGN 16 +Lsin_piby4_compute: + ; compute the sine of r+rr, where this sum is in [-pi/4,pi/4] + vmovapd xmm5,__Lsinarray+040h + vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x + vfmadd231sd xmm5,xmm3,__Lsinarray+050h + vfmadd213sd xmm5,xmm3,__Lsinarray+030h + vfmadd213sd xmm5,xmm3,__Lsinarray+020h + vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r + + vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x + vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r + vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x + vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r + vmulsd xmm2,xmm3,xmm2 + vsubsd xmm2,xmm2,xmm1 + vfnmadd231sd xmm2, xmm4,__Lsinarray + vsubsd xmm0,xmm0,xmm2 + +Lcos_exit_s_1: + xor r8,r8 + add eax, 1 + and eax, 2 + cmovnz r8, L_signbit + vmovq xmm3,r8 + vxorpd xmm0,xmm0,xmm3 + +Lreturn_restore_regs: + StackDeallocate stack_size + ret + +Lreturn_no_restore: + StackDeallocate stack_size + ret + +ALIGN 16 +Lcos_remainder_piby2: + ; argument reduction for general x + call __remainder_piby2_fma3 + jmp Lcos_exit_s + + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/cosf.asm b/sdk/lib/crt/math/libm_sse2/cosf.asm new file mode 100644 index 00000000000..6b232472a23 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/cosf.asm @@ -0,0 +1,525 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; An implementation of the cosf function. +; +; Prototype: +; +; float cosf(float x); +; +; Computes cosf(x). +; Based on the NAG C implementation. +; It will provide proper C99 return values, +; but may not raise floating point status bits properly. 
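For reference, the C99 Annex F return values promised here amount to the following (a summary of the standard's requirements, not text from the AMD sources):

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
        assert(cosf(0.0f) == 1.0f);      /* cos(+/-0) is exactly 1            */
        assert(isnan(cosf(INFINITY)));   /* cos(+/-inf) is NaN (domain error) */
        assert(isnan(cosf(NAN)));        /* a NaN input propagates            */
        return 0;
    }

The caveat about status bits means the invalid/inexact flags may not always be raised exactly as Annex F describes.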
+; Original Author: Harsha Jagasia + +.const +ALIGN 16 +L_real_one DQ 03ff0000000000000h ; 1.0 + DQ 0 ; for alignment +L_one_half DQ 03fe0000000000000h ; 0.5 + DQ 0 +L_2bypi DQ 03fe45f306dc9c883h ; 2./pi + DQ 0 +L_one_sixth DQ 03fc5555555555555h ; 0.166666666666 + DQ 0 +L_piby2 DQ 03fe921fb54442d18h + DQ 0 +L_piby2_1 DQ 03ff921fb54400000h ; piby2_1 + DQ 0 +L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail + DQ 0 +L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2 + DQ 0 +L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail + DQ 0 +L_large_x_sse2 DQ 0411E848000000000h ; 5e5 + DQ 0 +L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9 + DQ 0 +L_sign_mask DQ 07FFFFFFFFFFFFFFFh + DQ 07FFFFFFFFFFFFFFFh +L__int_three DQ 00000000000000003h + DQ 00000000000000003h +L__min_norm_double DQ 00010000000000000h + DQ 00010000000000000h +L_two_to_neg_7 DQ 03f80000000000000h + DQ 0 +L_two_to_neg_13 DQ 03f20000000000000h + DQ 0 +L_inf_mask_32 DD 07F800000h + DQ 0 + +fname TEXTEQU +fname_special TEXTEQU <_cosf_special> + +;Define name and any external functions being called +EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR +EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR +EXTERN __remainder_piby2_fma3 : PROC ; NEAR +EXTERN fname_special : PROC +EXTERN _set_statfp : PROC + + +EXTRN __Lcosfarray:QWORD +EXTRN __Lsinfarray:QWORD +EXTRN __use_fma3_lib:DWORD + +; define local variable storage offsets +p_temp equ 020h ; temporary for get/put bits operation +p_temp1 equ 030h ; temporary for get/put bits operation +dummy_space EQU 040h +stack_size EQU 068h + +include fm.inc + +.code + +ALIGN 16 +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + cmp DWORD PTR __use_fma3_lib, 0 + jne Lcosf_fma3 + +Lcosf_sse2: + + xorpd xmm2, xmm2 ; zeroed out for later use + +;; if NaN or inf + movd edx, xmm0 + mov eax, 07f800000h + mov r10d, eax + and r10d, edx + cmp r10d, eax + jz Lcosf_sse2_naninf + + cvtss2sd xmm0, xmm0 + movd rdx, xmm0 + +; ax = (ux & ~SIGNBIT_DP64); + mov r10, rdx + btr r10, 63 ; r10 <-- |x| + mov r8d, 1 ; for determining region later on + + movapd xmm1, xmm0 ; xmm1 <-- copy of x + + +;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */ + mov rax, 03fe921fb54442d18h + cmp r10, rax + jg Lcosf_sse2_absx_gt_piby4 + +; *c = cos_piby4(x, 0.0); + movapd xmm2, xmm0 + mulsd xmm2, xmm2 ;x^2 + xor eax, eax + mov rdx, r10 + movsd xmm5, QWORD PTR L_one_half + jmp Lcosf_sse2_calc_sincosf_piby4 ; done + + +ALIGN 16 +Lcosf_sse2_absx_gt_piby4: +; reduce the argument to be in a range from -pi/4 to +pi/4 +; by subtracting multiples of pi/2 +; xneg = (ax != ux); + movd xmm0, r10 ; xmm0 <-- |x| + cmp r10, QWORD PTR L_large_x_sse2 + jae Lcosf_sse2_reduce_precise + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; xmm0=abs(x), xmm1=x +;/* How many pi/2 is x a multiple of? 
*/ + + movapd xmm2, xmm0 + movsd xmm3, QWORD PTR L_2bypi + movapd xmm4, xmm0 + movsd xmm5, QWORD PTR L_one_half + mulsd xmm2, xmm3 + +; movsd xmm5, QWORD PTR L_one_half +; movapd xmm2, xmm0 +; mulsd xmm2, QWORD PTR L_2bypi +; movapd xmm4, xmm0 + + mov r9, r10 + shr r9, 52 ; r9 <-- biased exponent of x + +; npi2 = (int)(x * twobypi + 0.5); + addsd xmm2, xmm5 ; npi2 + + movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1 + cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2 + movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail + cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2 + +; Subtract the multiple from x to get an extra-precision remainder +; rhead = x - npi2 * piby2_1; + + mulsd xmm3, xmm2 ; use piby2_1 + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_1tail; + mulsd xmm1, xmm2 ; rtail + movd eax, xmm0 + +; GET_BITS_DP64(rhead-rtail, uy); +; originally only rhead + movapd xmm0, xmm4 + subsd xmm0, xmm1 + + movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2 + movd rcx, xmm0 ; rcx <-- rhead-rtail + movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail + +; region = npi2 & 3; +; and eax, 3 +; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + shl rcx, 1 ; strip any sign bit + shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 + sub r9, rcx ; expdiff + +;; if (expdiff > 15) + cmp r9, 15 + jle Lcosf_sse2_expdiff_le_15 + +; The remainder is pretty small compared with x, which +; implies that x is a near multiple of pi/2 +; (x matches the multiple to at least 15 bits) +; t = rhead; + movapd xmm1, xmm4 + +; rtail = npi2 * piby2_2; + mulsd xmm3, xmm2 + +; rhead = t - rtail; + mulsd xmm5, xmm2 ; npi2 * piby2_2tail + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); + subsd xmm1, xmm4 ; t - rhead + subsd xmm1, xmm3 ; -rtail + subsd xmm5, xmm1 ; rtail + +; r = rhead - rtail; + movapd xmm0, xmm4 + +;HARSHA +;xmm1=rtail + movapd xmm1, xmm5 + subsd xmm0, xmm5 + +; xmm0=r, xmm4=rhead, xmm1=rtail + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +Lcosf_sse2_expdiff_le_15: + cmp rcx, 03f2h ; is r < 2^-13 ? + jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not + cmp rcx, 03deh ; is r < 2^-33 ? + jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r + + movapd xmm2, xmm0 + mulsd xmm2, xmm0 ; xmm2 <-- x^2 + +;; if region is 1 or 3 do a sinf calc. + and r8d, eax + jz Lcosf_sse2_r_small_calc_sin + +Lcosf_sse2_r_small_calc_cos: +; region 1 or 3 +; use simply polynomial +; *s = x - x*x*x*0.166666666666666666; + movsd xmm3, QWORD PTR L_one_sixth + mulsd xmm3, xmm0 ; * x + mulsd xmm3, xmm2 ; * x^2 + subsd xmm0, xmm3 ; xs + jmp Lcosf_sse2_adjust_region + +ALIGN 16 +Lcosf_sse2_r_small_calc_sin: +; region 0 or 2 +; cos = 1.0 - x*x*0.5; + movsd xmm0, QWORD PTR L_real_one ; 1.0 + mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2 + subsd xmm0, xmm2 + jmp Lcosf_sse2_adjust_region + +ALIGN 16 +Lcosf_sse2_r_very_small: +; then sin(r) = r +; if region is 1 or 3 do a sin calc. 
+ and r8d, eax + jnz Lcosf_sse2_adjust_region + + movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1 + ; By this point, calculations should already have set inexact + jmp Lcosf_sse2_adjust_region + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +ALIGN 16 +Lcosf_sse2_reduce_precise: +; Reduce abs(x) into range [-pi/4, pi/4] +; remainder_piby2d2f(ax, &r, ®ion); + mov QWORD PTR p_temp[rsp], rdx ; save ux for use later + mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later + + call __remainder_piby2d2f_forAsm + mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later + mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later + mov r8d, 1 ; for determining region later + + ; Reduced argument is in xmm0. No second word; after all, we started in + ; single precision. Region is in rax. + movapd xmm1, xmm0 + movsd xmm5, QWORD PTR L_one_half + + jmp Lcosf_sse2_calc_sincosf_piby4 + + +; done with reducing the argument. Now perform the sin/cos calculations. +ALIGN 16 +Lcosf_sse2_calc_sincosf_piby4: + movapd xmm2, xmm0 + mulsd xmm2, xmm0 ; x^2 + +;; if region is 0 or 2, do a cosf calc + and r8d, eax + jz Lcosf_sse2_do_cosf_calc +; region is 1 or 3: do a sinf calc. +Lcosf_sse2_do_sinf_calc: + movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 + mulsd xmm1, xmm2 ; s4x2 + movsd xmm4, xmm2 ; move for x4 + mulsd xmm4, xmm2 ; x4 + movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 + mulsd xmm5, xmm2 ; s2x2 + movsd xmm3, xmm0 ; move for x3 + mulsd xmm3, xmm2 ; x3 + addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 + mulsd xmm1, xmm4 ; s3x4+s4x6 + addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 + addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 + mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) + addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) + jmp Lcosf_sse2_adjust_region + +ALIGN 16 +Lcosf_sse2_do_cosf_calc: +; region 0 or 2 - do a cos calculation +; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8; +; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision + movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4 + movsd xmm4, xmm2 ; move for x4 + mulsd xmm1, xmm2 ; c4x2 + movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2 + mulsd xmm4, xmm2 ; x4 + movsd xmm0, QWORD PTR __Lcosfarray ; c0 + mulsd xmm3, xmm2 ; c2x2 + mulsd xmm0, xmm2 ; c0x2 (=-0.5x2) + addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2 + mulsd xmm1, xmm4 ; c3x4 + c4x6 + addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2 + addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6 + mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2 + addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10 + +Lcosf_sse2_adjust_region: +; xmm1 is cos or sin, relies on previous sections to +; switch (region) + add eax, 1 + and eax, 2 + jz Lcosf_sse2_cleanup +;; if region 1 or 2 then we negate the result. 
+ xorpd xmm2, xmm2 + subsd xmm2, xmm0 + movapd xmm0, xmm2 + +ALIGN 16 +Lcosf_sse2_cleanup: + cvtsd2ss xmm0, xmm0 + StackDeallocate stack_size + ret + + +Lcosf_sse2_naninf: + call fname_special + StackDeallocate stack_size + ret + + +ALIGN 16 +Lcosf_fma3: + vmovd eax,xmm0 + mov r8d,L_inf_mask_32 + and eax,r8d + cmp eax, r8d + jz Lcosf_fma3_naninf + + vcvtss2sd xmm5,xmm0,xmm0 + vmovq r9,xmm5 + btr r9,63 ;clear sign + + cmp r9,L_piby2 + jg Lcosf_fma3_range_reduce + cmp r9,L_two_to_neg_7 + jge Lcosf_fma3_compute_cosf_piby_4 + cmp r9,L_two_to_neg_13 + jge Lcosf_fma3_compute_1_xx_5 + + vmovq xmm0,QWORD PTR L_real_one + ; Here we need to set inexact + vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact + jmp Lcosf_fma3_return + +ALIGN 16 +Lcosf_fma3_compute_1_xx_5: + vmulsd xmm0,xmm5,QWORD PTR L_one_half + vfnmadd213sd xmm0,xmm5,L_real_one ; xmm9 1.0 - x*x*(double2)0.5 + jmp Lcosf_fma3_return + +ALIGN 16 +Lcosf_fma3_compute_cosf_piby_4: + movsd xmm0,xmm5 + vmovapd xmm2,L_real_one + vmulsd xmm3,xmm0,xmm0 + vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r + vsubsd xmm2,xmm2,xmm1 + vmovsd xmm1,__Lcosfarray+018h + vfmadd231sd xmm1,xmm3,__Lcosfarray+020h + vfmadd213sd xmm1,xmm3,__Lcosfarray+010h + vfmadd213sd xmm1,xmm3,__Lcosfarray+008h + vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 + vmovdqa xmm0,xmm2 + vfmadd231sd xmm0,xmm1,xmm3 + jmp Lcosf_fma3_return + +ALIGN 16 +Lcosf_fma3_range_reduce: + vmovq xmm0,r9 ; xmm0 <-- |x| + cmp r9,L_large_x_fma3 + jge Lcosf_reduce_precise + +;cosff_range_e_5_s: + vandpd xmm1,xmm0,L_sign_mask + vmovapd xmm2,L_2bypi + vfmadd213sd xmm2,xmm1,L_one_half + vcvttpd2dq xmm2,xmm2 + vpmovsxdq xmm1,xmm2 + vandpd xmm4,xmm1,L__int_three ; region xmm4 + vshufps xmm1 ,xmm1,xmm1,8 + vcvtdq2pd xmm1,xmm1 + vmovdqa xmm2,xmm0 + vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead + vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail + vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0 + vsubsd xmm2,xmm2,xmm0 + vsubsd xmm1,xmm2,xmm3 + vmovq rax,xmm4 + jmp Lcosf_exit_s + +ALIGN 16 +Lcosf_reduce_precise: + + vmovq xmm0,r9 ; r9 <-- |x| + cmp r9,L_large_x_fma3 + jge Lcos_remainder_piby2 + + ; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl + ; have the following conventions: + ; on input + ; x is in xmm0 + ; on output + ; r is in xmm0 + ; rr is in xmm1 + ; region is in rax + ; The _bdl routine is guaranteed not to touch r10 + +Lcos_remainder_piby2_small: ;; unused label + ; Boldo-Daumas-Li reduction for reasonably small |x| + call __remainder_piby2_fma3_bdl + jmp Lcosf_exit_s + +ALIGN 16 +Lcos_remainder_piby2: + ; argument reduction for general x + call __remainder_piby2_fma3 +Lcosf_exit_s: + bt rax,0 + jnc Lcosf_piby4_compute + +;sinf_piby4_compute: +; vmovapd xmm1,__Lsinfarray+010h + vmovsd xmm1,__Lsinfarray+010h + vmulsd xmm3,xmm0,xmm0 + vfmadd231sd xmm1,xmm3,__Lsinfarray+018h + vfmadd213sd xmm1,xmm3,__Lsinfarray+008h + vfmadd213sd xmm1,xmm3,__Lsinfarray + vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 + vfmadd231sd xmm0,xmm1,xmm3 + jmp Lcosf_fma3_adjust_sign + +ALIGN 16 +Lcosf_piby4_compute: + vmovapd xmm2,L_real_one + vmulsd xmm3,xmm0,xmm0 + vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r + vsubsd xmm2,xmm2,xmm1 + vmovsd xmm1,__Lcosfarray+018h + vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h + vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h + vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h + vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 + vmovdqa xmm0, xmm2 + vfmadd231sd xmm0 ,xmm1,xmm3 + +Lcosf_fma3_adjust_sign: + ; assuming FMA3 ==> AVX ==> SSE4.1 +; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one +; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two +; vorpd xmm3,xmm2,xmm1 + +; 
vandpd xmm3,xmm3,L_signbit + + add rax,1 ; 1,2 --> 2,3 + shr rax,1 ; 2,3 --> 1 + shl rax,63 ; 1 --> sign bit + vmovq xmm3,rax + + vxorpd xmm0,xmm0,xmm3 + +Lcosf_fma3_return: + vcvtsd2ss xmm0,xmm0,xmm0 + StackDeallocate stack_size + ret + +Lcosf_fma3_naninf: + call fname_special + StackDeallocate stack_size + ret + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/cosh.c b/sdk/lib/crt/math/libm_sse2/cosh.c new file mode 100644 index 00000000000..9eb06d0c261 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/cosh.c @@ -0,0 +1,344 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_SPLITEXP +#define USE_SCALEDOUBLE_1 +#define USE_SCALEDOUBLE_2 +#define USE_INFINITY_WITH_FLAGS +#define USE_VAL_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_SPLITEXP +#undef USE_SCALEDOUBLE_1 +#undef USE_SCALEDOUBLE_2 +#undef USE_INFINITY_WITH_FLAGS +#undef USE_VAL_WITH_FLAGS +#undef USE_HANDLE_ERROR + + +#pragma function(cosh) +double cosh(double x) +{ + /* + Derived from sinh subroutine + + After dealing with special cases the computation is split into + regions as follows: + + abs(x) >= max_cosh_arg: + cosh(x) = sign(x)*Inf + + abs(x) >= small_threshold: + cosh(x) = sign(x)*exp(abs(x))/2 computed using the + splitexp and scaleDouble functions as for exp_amd(). + + abs(x) < small_threshold: + compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + cosh(x) is then sign(x)*z. */ + + static const double + max_cosh_arg = 7.10475860073943977113e+02, /* 0x408633ce8fb9f87e */ + thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */ + log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */ + log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */ +// small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889; + small_threshold = 20.0; + /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */ + + /* Lead and tail tabulated values of sinh(i) and cosh(i) + for i = 0,...,36. The lead part has 26 leading bits. 
*/ + + static const double sinh_lead[ 37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.17520117759704589844e+00, /* 0x3ff2cd9fc0000000 */ + 3.62686038017272949219e+00, /* 0x400d03cf60000000 */ + 1.00178747177124023438e+01, /* 0x40240926e0000000 */ + 2.72899169921875000000e+01, /* 0x403b4a3800000000 */ + 7.42032089233398437500e+01, /* 0x40528d0160000000 */ + 2.01713153839111328125e+02, /* 0x406936d228000000 */ + 5.48316116333007812500e+02, /* 0x4081228768000000 */ + 1.49047882080078125000e+03, /* 0x409749ea50000000 */ + 4.05154187011718750000e+03, /* 0x40afa71570000000 */ + 1.10132326660156250000e+04, /* 0x40c5829dc8000000 */ + 2.99370708007812500000e+04, /* 0x40dd3c4488000000 */ + 8.13773945312500000000e+04, /* 0x40f3de1650000000 */ + 2.21206695312500000000e+05, /* 0x410b00b590000000 */ + 6.01302140625000000000e+05, /* 0x412259ac48000000 */ + 1.63450865625000000000e+06, /* 0x4138f0cca8000000 */ + 4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */ + 1.20774762500000000000e+07, /* 0x4167093488000000 */ + 3.28299845000000000000e+07, /* 0x417f4f2208000000 */ + 8.92411500000000000000e+07, /* 0x419546d8f8000000 */ + 2.42582596000000000000e+08, /* 0x41aceb0888000000 */ + 6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */ + 1.79245641600000000000e+09, /* 0x41dab5adb8000000 */ + 4.87240166400000000000e+09, /* 0x41f226af30000000 */ + 1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */ + 3.60024494080000000000e+10, /* 0x4220c3d390000000 */ + 9.78648043520000000000e+10, /* 0x4236c93268000000 */ + 2.66024116224000000000e+11, /* 0x424ef822f0000000 */ + 7.23128516608000000000e+11, /* 0x42650bba30000000 */ + 1.96566712320000000000e+12, /* 0x427c9aae40000000 */ + 5.34323724288000000000e+12, /* 0x4293704708000000 */ + 1.45244246507520000000e+13, /* 0x42aa6b7658000000 */ + 3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */ + 1.07321789251584000000e+14, /* 0x42d866f348000000 */ + 2.91730863685632000000e+14, /* 0x42f0953e28000000 */ + 7.93006722514944000000e+14, /* 0x430689e220000000 */ + 2.15561576592179200000e+15}; /* 0x431ea215a0000000 */ + + static const double sinh_tail[ 37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.60467555584448807892e-08, /* 0x3e513ae6096a0092 */ + 2.76742892754807136947e-08, /* 0x3e5db70cfb79a640 */ + 2.09697499555224576530e-07, /* 0x3e8c2526b66dc067 */ + 2.04940252448908240062e-07, /* 0x3e8b81b18647f380 */ + 1.65444891522700935932e-06, /* 0x3ebbc1cdd1e1eb08 */ + 3.53116789999998198721e-06, /* 0x3ecd9f201534fb09 */ + 6.94023870987375490695e-06, /* 0x3edd1c064a4e9954 */ + 4.98876893611587449271e-06, /* 0x3ed4eca65d06ea74 */ + 3.19656024605152215752e-05, /* 0x3f00c259bcc0ecc5 */ + 2.08687768377236501204e-04, /* 0x3f2b5a6647cf9016 */ + 4.84668088325403796299e-05, /* 0x3f09691adefb0870 */ + 1.17517985422733832468e-03, /* 0x3f53410fc29cde38 */ + 6.90830086959560562415e-04, /* 0x3f46a31a50b6fb3c */ + 1.45697262451506548420e-03, /* 0x3f57defc71805c40 */ + 2.99859023684906737806e-02, /* 0x3f9eb49fd80e0bab */ + 1.02538800507941396667e-02, /* 0x3f84fffc7bcd5920 */ + 1.26787628407699110022e-01, /* 0x3fc03a93b6c63435 */ + 6.86652479544033744752e-02, /* 0x3fb1940bb255fd1c */ + 4.81593627621056619148e-01, /* 0x3fded26e14260b50 */ + 1.70489513795397629181e+00, /* 0x3ffb47401fc9f2a2 */ + 1.12416073482258713767e+01, /* 0x40267bb3f55634f1 */ + 7.06579578070110514432e+00, /* 0x401c435ff8194ddc */ + 5.91244512999659974639e+01, /* 0x404d8fee052ba63a */ + 1.68921736147050694399e+02, /* 0x40651d7edccde3f6 */ + 2.60692936262073658327e+02, /* 0x40704b1644557d1a */ 
+ 3.62419382134885609048e+02, /* 0x4076a6b5ca0a9dc4 */ + 4.07689930834187271103e+03, /* 0x40afd9cc72249aba */ + 1.55377375868385224749e+04, /* 0x40ce58de693edab5 */ + 2.53720210371943067003e+04, /* 0x40d8c70158ac6363 */ + 4.78822310734952334315e+04, /* 0x40e7614764f43e20 */ + 1.81871712615542812273e+05, /* 0x4106337db36fc718 */ + 5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */ + 6.41374032312148716301e+05, /* 0x412392bc108b37cc */ + 7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */ + 3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */ + 7.63580561355670914054e+06}; /* 0x415d20d76744835c */ + + static const double cosh_lead[ 37] = { + 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ + 1.54308062791824340820e+00, /* 0x3ff8b07550000000 */ + 3.76219564676284790039e+00, /* 0x400e18fa08000000 */ + 1.00676617622375488281e+01, /* 0x402422a490000000 */ + 2.73082327842712402344e+01, /* 0x403b4ee858000000 */ + 7.42099475860595703125e+01, /* 0x40528d6fc8000000 */ + 2.01715633392333984375e+02, /* 0x406936e678000000 */ + 5.48317031860351562500e+02, /* 0x4081228948000000 */ + 1.49047915649414062500e+03, /* 0x409749eaa8000000 */ + 4.05154199218750000000e+03, /* 0x40afa71580000000 */ + 1.10132329101562500000e+04, /* 0x40c5829dd0000000 */ + 2.99370708007812500000e+04, /* 0x40dd3c4488000000 */ + 8.13773945312500000000e+04, /* 0x40f3de1650000000 */ + 2.21206695312500000000e+05, /* 0x410b00b590000000 */ + 6.01302140625000000000e+05, /* 0x412259ac48000000 */ + 1.63450865625000000000e+06, /* 0x4138f0cca8000000 */ + 4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */ + 1.20774762500000000000e+07, /* 0x4167093488000000 */ + 3.28299845000000000000e+07, /* 0x417f4f2208000000 */ + 8.92411500000000000000e+07, /* 0x419546d8f8000000 */ + 2.42582596000000000000e+08, /* 0x41aceb0888000000 */ + 6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */ + 1.79245641600000000000e+09, /* 0x41dab5adb8000000 */ + 4.87240166400000000000e+09, /* 0x41f226af30000000 */ + 1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */ + 3.60024494080000000000e+10, /* 0x4220c3d390000000 */ + 9.78648043520000000000e+10, /* 0x4236c93268000000 */ + 2.66024116224000000000e+11, /* 0x424ef822f0000000 */ + 7.23128516608000000000e+11, /* 0x42650bba30000000 */ + 1.96566712320000000000e+12, /* 0x427c9aae40000000 */ + 5.34323724288000000000e+12, /* 0x4293704708000000 */ + 1.45244246507520000000e+13, /* 0x42aa6b7658000000 */ + 3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */ + 1.07321789251584000000e+14, /* 0x42d866f348000000 */ + 2.91730863685632000000e+14, /* 0x42f0953e28000000 */ + 7.93006722514944000000e+14, /* 0x430689e220000000 */ + 2.15561576592179200000e+15}; /* 0x431ea215a0000000 */ + + static const double cosh_tail[ 37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 6.89700037027478056904e-09, /* 0x3e3d9f5504c2bd28 */ + 4.43207835591715833630e-08, /* 0x3e67cb66f0a4c9fd */ + 2.33540217013828929694e-07, /* 0x3e8f58617928e588 */ + 5.17452463948269748331e-08, /* 0x3e6bc7d000c38d48 */ + 9.38728274131605919153e-07, /* 0x3eaf7f9d4e329998 */ + 2.73012191010840495544e-06, /* 0x3ec6e6e464885269 */ + 3.29486051438996307950e-06, /* 0x3ecba3a8b946c154 */ + 4.75803746362771416375e-06, /* 0x3ed3f4e76110d5a4 */ + 3.33050940471947692369e-05, /* 0x3f017622515a3e2b */ + 9.94707313972136215365e-06, /* 0x3ee4dc4b528af3d0 */ + 6.51685096227860253398e-05, /* 0x3f11156278615e10 */ + 1.18132406658066663359e-03, /* 0x3f535ad50ed821f5 */ + 6.93090416366541877541e-04, /* 0x3f46b61055f2935c */ + 1.45780415323416845386e-03, /* 0x3f57e2794a601240 */ + 
2.99862082708111758744e-02, /* 0x3f9eb4b45f6aadd3 */ + 1.02539925859688602072e-02, /* 0x3f85000b967b3698 */ + 1.26787669807076286421e-01, /* 0x3fc03a940fadc092 */ + 6.86652631843830962843e-02, /* 0x3fb1940bf3bf874c */ + 4.81593633223853068159e-01, /* 0x3fded26e1a2a2110 */ + 1.70489514001513020602e+00, /* 0x3ffb4740205796d6 */ + 1.12416073489841270572e+01, /* 0x40267bb3f55cb85d */ + 7.06579578098005001152e+00, /* 0x401c435ff81e18ac */ + 5.91244513000686140458e+01, /* 0x404d8fee052bdea4 */ + 1.68921736147088438429e+02, /* 0x40651d7edccde926 */ + 2.60692936262087528121e+02, /* 0x40704b1644557e0e */ + 3.62419382134890611269e+02, /* 0x4076a6b5ca0a9e1c */ + 4.07689930834187453002e+03, /* 0x40afd9cc72249abe */ + 1.55377375868385224749e+04, /* 0x40ce58de693edab5 */ + 2.53720210371943103382e+04, /* 0x40d8c70158ac6364 */ + 4.78822310734952334315e+04, /* 0x40e7614764f43e20 */ + 1.81871712615542812273e+05, /* 0x4106337db36fc718 */ + 5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */ + 6.41374032312148716301e+05, /* 0x412392bc108b37cc */ + 7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */ + 3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */ + 7.63580561355670914054e+06}; /* 0x415d20d76744835c */ + + unsigned long ux, aux, xneg; + double y, z, z1, z2; + int m; + + /* Special cases */ + + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + if (aux < 0x3e30000000000000) /* |x| small enough that cosh(x) = 1 */ + { + if (aux == 0) + /* with no inexact */ + return 1.0; + else + return val_with_flags(1.0, AMD_F_INEXACT); + } + else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */ + { + if (aux > PINFBITPATT_DP64) /* x is NaN */ + return _handle_error("cosh", OP_COSH, ux|0x0008000000000000,_DOMAIN, + 0,EDOM, x, 0.0, 1); + else /* x is infinity */ + return infinity_with_flags(0); + } + + xneg = (aux != ux); + + y = x; + if (xneg) y = -x; + + if (y >= max_cosh_arg) + { + return _handle_error("cosh", OP_COSH, PINFBITPATT_DP64,_OVERFLOW, + AMD_F_INEXACT|AMD_F_OVERFLOW,ERANGE, x, 0.0, 1); + +// z = infinity_with_flags(AMD_F_OVERFLOW); + } + else if (y >= small_threshold) + { + /* In this range y is large enough so that + the negative exponential is negligible, + so cosh(y) is approximated by sign(x)*exp(y)/2. The + code below is an inlined version of that from + exp() with two changes (it operates on + y instead of x, and the division by 2 is + done by reducing m by 1). */ + + splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead, + log2_by_32_tail, &m, &z1, &z2); + m -= 1; + + if (m >= EMIN_DP64 && m <= EMAX_DP64) + z = scaleDouble_1((z1+z2),m); + else + z = scaleDouble_2((z1+z2),m); + } + else + { + /* In this range we find the integer part y0 of y + and the increment dy = y - y0. We then compute + + z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) + z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) + + where sinh(y0) and cosh(y0) are tabulated above. 
*/ + + int ind; + double dy, dy2, sdy, cdy; + + ind = (int)y; + dy = y - ind; + + dy2 = dy*dy; + sdy = dy*dy2*(0.166666666666666667013899e0 + + (0.833333333333329931873097e-2 + + (0.198412698413242405162014e-3 + + (0.275573191913636406057211e-5 + + (0.250521176994133472333666e-7 + + (0.160576793121939886190847e-9 + + 0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + cdy = dy2*(0.500000000000000005911074e0 + + (0.416666666666660876512776e-1 + + (0.138888888889814854814536e-2 + + (0.248015872460622433115785e-4 + + (0.275573350756016588011357e-6 + + (0.208744349831471353536305e-8 + + 0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + /* At this point sinh(dy) is approximated by dy + sdy, and cosh(dy) is approximated by 1 + cdy. + Shift some significant bits from dy to cdy. */ +#if 0 + double sdy1,sdy2; + GET_BITS_DP64(dy, ux); + ux &= 0xfffffffff8000000; + PUT_BITS_DP64(ux, sdy1); // sdy1 is upper 53-27=26 significant bits of dy. + sdy2 = sdy + (dy - sdy1); // sdy2 is sdy + lower bits of dy + + z = ((((((cosh_tail[ind]*cdy + sinh_tail[ind]*sdy2) + + sinh_tail[ind]*sdy1) + cosh_tail[ind]) + + cosh_lead[ind]*cdy) + sinh_lead[ind]*sdy2) + + sinh_lead[ind]*sdy1) + cosh_lead[ind]; +#else + z = ((((((cosh_tail[ind]*cdy + sinh_tail[ind]*sdy) + + sinh_tail[ind]*dy) + cosh_tail[ind]) + + cosh_lead[ind]*cdy) + sinh_lead[ind]*sdy) + + sinh_lead[ind]*dy) + cosh_lead[ind]; +#endif + } + + return z; +} diff --git a/sdk/lib/crt/math/libm_sse2/coshf.c b/sdk/lib/crt/math/libm_sse2/coshf.c new file mode 100644 index 00000000000..6e7ad089c37 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/coshf.c @@ -0,0 +1,247 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_SPLITEXP +#define USE_SCALEDOUBLE_1 +#define USE_SCALEDOUBLE_2 +#define USE_INFINITYF_WITH_FLAGS +#define USE_VALF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_SPLITEXP +#undef USE_SCALEDOUBLE_1 +#undef USE_SCALEDOUBLE_2 +#undef USE_INFINITYF_WITH_FLAGS +#undef USE_VALF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. 
+#pragma warning(disable:4163) +#pragma function(coshf) + +float coshf(float fx) +{ + /* + After dealing with special cases the computation is split into + regions as follows: + + abs(x) >= max_cosh_arg: + cosh(x) = sign(x)*Inf + + abs(x) >= small_threshold: + cosh(x) = sign(x)*exp(abs(x))/2 computed using the + splitexp and scaleDouble functions as for exp_amd(). + + abs(x) < small_threshold: + compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + cosh(x) is then sign(x)*z. */ + + static const double + /* The max argument of coshf, but stored as a double */ + max_cosh_arg = 8.94159862922329438106e+01, /* 0x40565a9f84f82e63 */ + thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */ + log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */ + log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */ + + small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889; +// small_threshold = 20.0; + /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */ + + /* Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. */ + + static const double sinh_lead[ 37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.17520119364380137839e+00, /* 0x3ff2cd9fc44eb982 */ + 3.62686040784701857476e+00, /* 0x400d03cf63b6e19f */ + 1.00178749274099008204e+01, /* 0x40240926e70949ad */ + 2.72899171971277496596e+01, /* 0x403b4a3803703630 */ + 7.42032105777887522891e+01, /* 0x40528d0166f07374 */ + 2.01713157370279219549e+02, /* 0x406936d22f67c805 */ + 5.48316123273246489589e+02, /* 0x408122876ba380c9 */ + 1.49047882578955000099e+03, /* 0x409749ea514eca65 */ + 4.05154190208278987484e+03, /* 0x40afa7157430966f */ + 1.10132328747033916443e+04, /* 0x40c5829dced69991 */ + 2.99370708492480553105e+04, /* 0x40dd3c4488cb48d6 */ + 8.13773957064298447222e+04, /* 0x40f3de1654d043f0 */ + 2.21206696003330085659e+05, /* 0x410b00b5916a31a5 */ + 6.01302142081972560845e+05, /* 0x412259ac48bef7e3 */ + 1.63450868623590236530e+06, /* 0x4138f0ccafad27f6 */ + 4.44305526025387924165e+06, /* 0x4150f2ebd0a7ffe3 */ + 1.20774763767876271158e+07, /* 0x416709348c0ea4ed */ + 3.28299845686652474105e+07, /* 0x417f4f22091940bb */ + 8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */ + 2.42582597704895108938e+08, /* 0x41aceb088b68e803 */ + 6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */ + 1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */ + 4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */ + 1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */ + 3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */ + 9.78648047144193725586e+10, /* 0x4236c932696a6b5c */ + 2.66024120300899291992e+11, /* 0x424ef822f7f6731c */ + 7.23128532145737548828e+11, /* 0x42650bba3796379a */ + 1.96566714857202099609e+12, /* 0x427c9aae4631c056 */ + 5.34323729076223046875e+12, /* 0x429370470aec28ec */ + 1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */ + 3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */ + 1.07321789892958031250e+14, /* 0x42d866f34a725782 */ + 2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */ + 7.93006726156715250000e+14, /* 0x430689e221bc8d5a */ + 2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */ + + static const double cosh_lead[ 37] = { + 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ + 1.54308063481524371241e+00, /* 0x3ff8b07551d9f550 */ + 3.76219569108363138810e+00, /* 0x400e18fa0df2d9bc */ + 1.00676619957777653269e+01, /* 0x402422a497d6185e */ + 2.73082328360164865444e+01, /* 0x403b4ee858de3e80 */ + 7.42099485247878334349e+01, /* 0x40528d6fcbeff3a9 */ + 
2.01715636122455890700e+02, /* 0x406936e67db9b919 */ + 5.48317035155212010977e+02, /* 0x4081228949ba3a8b */ + 1.49047916125217807348e+03, /* 0x409749eaa93f4e76 */ + 4.05154202549259389343e+03, /* 0x40afa715845d8894 */ + 1.10132329201033226127e+04, /* 0x40c5829dd053712d */ + 2.99370708659497577173e+04, /* 0x40dd3c4489115627 */ + 8.13773957125740562333e+04, /* 0x40f3de1654d6b543 */ + 2.21206696005590405548e+05, /* 0x410b00b5916b6105 */ + 6.01302142082804115489e+05, /* 0x412259ac48bf13ca */ + 1.63450868623620807193e+06, /* 0x4138f0ccafad2d17 */ + 4.44305526025399193168e+06, /* 0x4150f2ebd0a8005c */ + 1.20774763767876680940e+07, /* 0x416709348c0ea503 */ + 3.28299845686652623117e+07, /* 0x417f4f22091940bf */ + 8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */ + 2.42582597704895138741e+08, /* 0x41aceb088b68e804 */ + 6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */ + 1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */ + 4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */ + 1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */ + 3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */ + 9.78648047144193725586e+10, /* 0x4236c932696a6b5c */ + 2.66024120300899291992e+11, /* 0x424ef822f7f6731c */ + 7.23128532145737548828e+11, /* 0x42650bba3796379a */ + 1.96566714857202099609e+12, /* 0x427c9aae4631c056 */ + 5.34323729076223046875e+12, /* 0x429370470aec28ec */ + 1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */ + 3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */ + 1.07321789892958031250e+14, /* 0x42d866f34a725782 */ + 2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */ + 7.93006726156715250000e+14, /* 0x430689e221bc8d5a */ + 2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */ + + unsigned long ux, aux, xneg; + unsigned int uhx; + double x = fx, y, z, z1, z2; + int m; + + /* Special cases */ + + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + if (aux < 0x3f10000000000000) /* |x| small enough that cosh(x) = 1 */ + { + if (aux == 0) return (float)1.0; /* with no inexact */ + if (LAMBDA_DP64 + x > 1.0) return valf_with_flags((float)1.0, AMD_F_INEXACT); /* with inexact */ + } + else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */ + if (aux > PINFBITPATT_DP64) /* x is NaN */ + { + GET_BITS_SP32(fx, uhx); + return _handle_errorf("coshf",OP_COSH,uhx|0x00400000,_DOMAIN, 0, + EDOM, fx, 0.0, 1); + } + else /* x is infinity */ + return infinityf_with_flags(0); + xneg = (aux != ux); + + y = x; + if (xneg) y = -x; + + if (y >= max_cosh_arg) + /* Return +infinity with overflow flag. */ + return _handle_errorf("coshf",OP_COSH,PINFBITPATT_SP32,_OVERFLOW, + AMD_F_INEXACT|AMD_F_OVERFLOW,ERANGE, fx, 0.0, 1); +// z = infinity_with_flags(AMD_F_OVERFLOW); + else if (y >= small_threshold) + { + /* In this range y is large enough so that + the negative exponential is negligible, + so cosh(y) is approximated by sign(x)*exp(y)/2. The + code below is an inlined version of that from + exp() with two changes (it operates on + y instead of x, and the division by 2 is + done by reducing m by 1). */ + + splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead, + log2_by_32_tail, &m, &z1, &z2); + m -= 1; + + /* scaleDouble_1 is always safe because the argument x was + float, rather than double */ + z = scaleDouble_1((z1+z2),m); + } + else + { + /* In this range we find the integer part y0 of y + and the increment dy = y - y0. We then compute + + z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) + z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) + + where sinh(y0) and cosh(y0) are tabulated above. 
*/ + + int ind; + double dy, dy2, sdy, cdy; + + ind = (int)y; + dy = y - ind; + + dy2 = dy*dy; + + sdy = dy + dy*dy2*(0.166666666666666667013899e0 + + (0.833333333333329931873097e-2 + + (0.198412698413242405162014e-3 + + (0.275573191913636406057211e-5 + + (0.250521176994133472333666e-7 + + (0.160576793121939886190847e-9 + + 0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + cdy = 1 + dy2*(0.500000000000000005911074e0 + + (0.416666666666660876512776e-1 + + (0.138888888889814854814536e-2 + + (0.248015872460622433115785e-4 + + (0.275573350756016588011357e-6 + + (0.208744349831471353536305e-8 + + 0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + z = cosh_lead[ind]*cdy + sinh_lead[ind]*sdy; + } + +// if (xneg) z = - z; + return (float)z; +} diff --git a/sdk/lib/crt/math/libm_sse2/exp.asm b/sdk/lib/crt/math/libm_sse2/exp.asm new file mode 100644 index 00000000000..10fb7f48c6a --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/exp.asm @@ -0,0 +1,439 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; exp.asm +; +; An implementation of the exp libm function. +; +; Prototype: +; +; double exp(double x); +; + +; +; Algorithm: +; +; e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) +; +; x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer +; n = 64*m + j, 0 <= j < 64 +; +; e^x = 2^((64*m + j + f)/64) +; = (2^m) * (2^(j/64)) * 2^(f/64) +; = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) +; +; f = x*(64/ln(2)) - n +; r = f*(ln(2)/64) = x - n*(ln(2)/64) +; +; e^x = (2^m) * (2^(j/64)) * e^r +; +; (2^(j/64)) is precomputed +; +; e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5! +; e^r = 1 + q +; +; q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5! 
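Written out in C, the scheme above is roughly the following; exp_sketch is a hypothetical helper, not the imported code (the real routine reads 2^(j/64) from the __two_to_jby64 head/tail tables, splices the exponent m into the result directly, and handles NaN/Inf, overflow and underflow up front). Note that the last term the polynomial code actually evaluates is r^6/720, i.e. r^6/6!.

    #include <math.h>
    #include <stdio.h>

    static double exp_sketch(double x)
    {
        static const double ln2 = 6.93147180559945309417e-01;

        int    n = (int)(x * (64.0 / ln2));    /* like cvttpd2dq: truncate toward zero */
        int    j = ((n % 64) + 64) % 64;       /* j = n & 0x3f, written portably       */
        int    m = (n - j) / 64;               /* power-of-two scaling                 */
        double r = x - n * (ln2 / 64.0);       /* reduced argument, |r| < ln2/64       */

        /* q = r + r^2/2! + r^3/3! + r^4/4! + r^5/5! + r^6/6! */
        double q = r + r * r * (1.0/2 + r * (1.0/6 + r * (1.0/24
                             + r * (1.0/120 + r * (1.0/720)))));

        return ldexp(exp2(j / 64.0) * (1.0 + q), m);  /* 2^m * 2^(j/64) * e^r */
    }

    int main(void)
    {
        printf("%.17g vs %.17g\n", exp_sketch(1.0), exp(1.0));
        return 0;
    }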
+; + +.const +ALIGN 16 +; these codes and the ones in the corresponding .c file have to match +__flag_x_nan DD 00000001 +__flag_y_zero DD 00000002 +__flag_y_inf DD 00000003 + +ALIGN 16 + +L__real_1_by_720 DQ 03f56c16c16c16c17h + DQ 03f56c16c16c16c17h ; 1/720 +L__real_1_by_120 DQ 03f81111111111111h + DQ 03f81111111111111h ; 1/120 +L__real_1_by_6 DQ 03fc5555555555555h + DQ 03fc5555555555555h ; 1/6 +L__real_1_by_2 DQ 03fe0000000000000h + DQ 03fe0000000000000h ; 1/2 +L__real_1_by_24 DQ 03fa5555555555555h + DQ 03fa5555555555555h ; 1/24 + +ALIGN 16 +L__log2_by_64_mtail_mhead DQ 0bf862e42fefa0000h, 0bd1cf79abc9e3b39h +L__ln_of_smallest_normal DQ 0C086232BDD7ABCD2h +L__zero DQ 00000000000000000h +L__max_exp_arg DQ 040862e42fefa39efh ; 709.78271289338397 +L__denormal_tiny_threshold DQ 0c0874046dfefd9d0h ; -744.03460681327306 +L__min_exp_arg DQ 0c0874910d52d3051h ; -745.13321910194111 +L__real_64_by_log2 DQ 040571547652b82feh ; 64/ln(2) +L__positive_infinity DQ 07ff0000000000000h +L__negative_infinity DQ 0fff0000000000000h +L__real_qnanbit DQ 0008000000000000h ; qnan set bit +L__real_x_near0_threshold DQ 3c00000000000000h +L__log2_by_64_mhead DQ 0bf862e42fefa0000h +L__log2_by_64_mtail DQ 0bd1cf79abc9e3b39h +L__real_smallest_denormal DQ 00000000000000001h +L__real_one DQ 03ff0000000000000h +L__2_to_neg_26 DQ 03E50000000000000h ; 2^-26 +L__min_normal DQ 00010000000000000h ; smallest normal + + +EXTRN __two_to_jby64_table:QWORD +EXTRN __two_to_jby64_head_table:QWORD +EXTRN __two_to_jby64_tail_table:QWORD +EXTRN __use_fma3_lib:DWORD + +; make room for fname_special to save things +dummy_space EQU 020h +stack_size EQU 038h + +include fm.inc + +fname TEXTEQU +fname_special TEXTEQU <_exp_special> + +;Define name and any external functions being called +EXTERN fname_special : PROC + +.code +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + + ; We need to avoid unwanted exceptions from a NaN argument. + ; It could be argued that a signaling NaN should raise an exception, + ; but the existing library doesn't. At any rate, the comparison operations + ; don't seem to like quiet NaN either, so... + movd rdx, xmm0 + btr rdx, 63 + cmp rdx, L__positive_infinity + jge Lexp_x_is_nan_or_inf + + cmp DWORD PTR __use_fma3_lib, 0 + jne Lexp_fma3 + + movapd xmm2, xmm0 + movapd xmm3, xmm0 + + ; Some hardware has problems with too many branches in a single + ; 16- or 32-byte window, so let's peel off the common case into + ; a single branch. + cmplesd xmm2, L__max_exp_arg ; xmm2 <-- 0xFFFFFFFF is x is not too big positive + cmpnltsd xmm3, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative + andps xmm2, xmm3 ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise + ucomisd xmm2, xmm2 ; note that FFF... 
is NaN, so this comparison should set PF for in-range x + jp Lexp_y_is_finite + + ucomisd xmm0, L__max_exp_arg + ja Lexp_y_is_inf + ; Since we peeled off the cases with normal result, + ; there is only one possibility remaining: + jmp Lexp_y_is_denormal_or_zero + +ALIGN 16 +Lexp_y_is_finite: + ; x * (64/ln(2)) + movapd xmm1, xmm0 + btr rdx, 63 ; rdx <-- |x| + cmp rdx, L__2_to_neg_26 + jbe Lexp_return_1_plus_x + mulsd xmm1, L__real_64_by_log2 + + ; n = int( x * (64/ln(2)) ) + cvttpd2dq xmm2, xmm1 ; xmm2 = (int)n + cvtdq2pd xmm1, xmm2 ; xmm1 = (double)n + movd ecx, xmm2 + movapd xmm2, xmm1 + + ; r1 = x - n * ln(2)/64 head + mulsd xmm1, L__log2_by_64_mhead + + ; j = n & 0x3f + mov rax, 03fh + and eax, ecx ; eax = j + ; m = (n - j) / 64 + sar ecx, 6 ; ecx = m + + + ; r2 = - n * ln(2)/64 tail + mulsd xmm2, L__log2_by_64_mtail + addsd xmm0, xmm1 ; xmm0 = r1 + + ; r1+r2 + addsd xmm2, xmm0 ; xmm2 = r + + ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720 + ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720))))) + movapd xmm3, L__real_1_by_720 ; xmm3 = 1/720 + mulsd xmm3, xmm2 ; xmm3 = r*1/720 + movapd xmm0, L__real_1_by_6 ; xmm0 = 1/6 + movapd xmm1, xmm2 ; xmm1 = r + mulsd xmm0, xmm2 ; xmm0 = r*1/6 + addsd xmm3, L__real_1_by_120 ; xmm3 = 1/120 + (r*1/720) + mulsd xmm1, xmm2 ; xmm1 = r*r + addsd xmm0, L__real_1_by_2 ; xmm0 = 1/2 + (r*1/6) + movapd xmm4, xmm1 ; xmm4 = r*r + mulsd xmm4, xmm1 ; xmm4 = (r*r) * (r*r) + mulsd xmm3, xmm2 ; xmm3 = r * (1/120 + (r*1/720)) + mulsd xmm0, xmm1 ; xmm0 = (r*r)*(1/2 + (r*1/6)) + addsd xmm3, L__real_1_by_24 ; xmm3 = 1/24 + (r * (1/120 + (r*1/720))) + addsd xmm0, xmm2 ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + mulsd xmm3, xmm4 ; xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720)))) + addsd xmm0, xmm3 ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720)))) + + ;(f)*(q) + f2 + f1 + cmp ecx, 0fffffc02h ; -1022 + lea rdx, __two_to_jby64_table + lea r11, __two_to_jby64_tail_table + lea r10, __two_to_jby64_head_table + mulsd xmm0, QWORD PTR [rdx+rax * 8 ] + addsd xmm0, QWORD PTR [r11+rax * 8 ] + addsd xmm0, QWORD PTR [r10+rax * 8 ] + + jle Lexp_process_denormal +Lexp_process_normal: + shl rcx, 52 + movd xmm2, rcx + paddq xmm0, xmm2 + StackDeallocate stack_size + ret + +ALIGN 16 +Lexp_process_denormal: + jl Lexp_process_true_denormal + ucomisd xmm0, L__real_one + jae Lexp_process_normal +Lexp_process_true_denormal: + ; here ( e^r < 1 and m = -1022 ) or m <= -1023 + add ecx, 1074 + mov rax, 1 + shl rax, cl + movd xmm2, rax + mulsd xmm0, xmm2 + jmp Lexp_finish + +Lexp_y_is_one: + movsd xmm0, L__real_one + jmp Lexp_finish + +ALIGN 16 +Lexp_x_is_nan_or_inf: + movd rax, xmm0 + cmp rax, L__positive_infinity + je Lexp_finish + cmp rax, L__negative_infinity + je Lexp_return_zero_without_exception + or rax, L__real_qnanbit + movd xmm1, rax + mov r8d, __flag_x_nan + call fname_special + jmp Lexp_finish + +ALIGN 16 +Lexp_y_is_inf: + mov rax, 07ff0000000000000h + movd xmm1, rax + mov r8d, __flag_y_inf + call fname_special + jmp Lexp_finish + +ALIGN 16 +Lexp_y_is_denormal_or_zero: + ucomisd xmm0, L__min_exp_arg + jbe Lexp_y_is_zero + movapd xmm0, L__real_smallest_denormal + jmp Lexp_finish + +ALIGN 16 +Lexp_y_is_zero: + pxor xmm1, xmm1 + mov r8d, __flag_y_zero + call fname_special + jmp Lexp_finish + +ALIGN 16 +Lexp_return_1_plus_x: + cmp rdx, L__min_normal + jbe Lexp_return_1_plus_eps + addsd xmm0, L__real_one + StackDeallocate stack_size + ret 0 + +; Some hardware really does not like subnormals. Try to avoid them. 
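The point of the next block, in C terms: when |x| is at or below the smallest normal, evaluating 1.0 + x would involve a subnormal operand, so the code instead returns the equivalent of

    #include <float.h>
    /* illustrative only: what Lexp_return_1_plus_eps effectively computes */
    double one_plus_eps(void) { return 1.0 + DBL_MIN; }  /* rounds to 1.0, raises inexact */

where DBL_MIN, the smallest normal double, matches L__min_normal above.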
+ALIGN 16 +Lexp_return_1_plus_eps: + movsd xmm0, L__real_one + addsd xmm0, L__min_normal ; make sure inexact is set + StackDeallocate stack_size + ret 0 + +ALIGN 16 +Lexp_return_zero_without_exception: + pxor xmm0,xmm0 + StackDeallocate stack_size + ret 0 + + +ALIGN 16 +Lexp_finish: + StackDeallocate stack_size + ret 0 + +ALIGN 16 +Lexp_fma3: + ; Some hardware has problems with too many branches in a single + ; 16- or 32-byte window, so let's peel off the common case into + ; a single branch. + vcmplesd xmm2, xmm0, L__max_exp_arg ; xmm2 <-- 0xFFFFFFFF is x is not too big positive + vcmpnltsd xmm3, xmm0, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative + vandps xmm2, xmm3, xmm2 ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise + vucomisd xmm2, xmm2 ; note that FFF... is NaN, so this comparison should set PF for in-range x + jp Lexp_fma3_y_is_finite + + vucomisd xmm0,L__max_exp_arg + ja Lexp_fma3_y_is_inf + ; Since we peeled off the cases with normal result, + ; there is only one possibility remaining: + jmp Lexp_fma3_y_is_zero + +; vpsllq xmm1, xmm0, 1 +; vpsrlq xmm1, xmm1, 1 +; vucomisd xmm1, L__real_x_near0_threshold ; 2^-63 +; jb Lexp_fma3_y_is_one + +ALIGN 16 +Lexp_fma3_y_is_finite: + vmovq rdx, xmm0 + btr rdx, 63 ; rdx <-- |x| + cmp rdx, L__2_to_neg_26 + jbe Lexp_fma3_return_1_plus_x + + ; x * (64/ln(2)) + vmulsd xmm1,xmm0,L__real_64_by_log2 + + ; n = int( x * (64/ln(2)) ) + vcvttpd2dq xmm2,xmm1 ;xmm2 = (int)n + vcvtdq2pd xmm1,xmm2 ;xmm1 = (double)n ;can use round + vmovd ecx,xmm2 + + ; r1 = x - n * ln(2)/64 head + ; r2 = - n * ln(2)/64 tail + ; r = r1+r2 + vmovlhps xmm1,xmm1,xmm1 ;xmm1 = (double (double)n,)n + vmovq xmm0,xmm0 ;xmm0 = 0,x ;zero out the upper part + vfmadd132pd xmm1,xmm0,L__log2_by_64_mtail_mhead + vhaddpd xmm2,xmm1,xmm1 ;xmm2 = r,r + + ;j = n & 03fh + mov rax,03fh + and eax,ecx ;eax = j + ; m = (n - j) / 64 + sar ecx,6 ;ecx = m + + ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720 + ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720))))) + vmovapd xmm3,L__real_1_by_720 + vfmadd213sd xmm3,xmm2,L__real_1_by_120 + vfmadd213sd xmm3,xmm2,L__real_1_by_24 + vfmadd213sd xmm3,xmm2,L__real_1_by_6 + vfmadd213sd xmm3,xmm2,L__real_1_by_2 + vmulsd xmm0,xmm2,xmm2 + vfmadd213sd xmm0,xmm3,xmm2 + + ; (f)*(q) + f2 + f1 + cmp ecx,0fffffc02h ; -1022 + lea rdx,__two_to_jby64_table + lea r11,__two_to_jby64_tail_table + lea r10,__two_to_jby64_head_table + vmulsd xmm2,xmm0,QWORD PTR[rdx + rax * 8] + vaddsd xmm1,xmm2,QWORD PTR[r11 + rax * 8] + vaddsd xmm0,xmm1,QWORD PTR[r10 + rax * 8] + + jle Lexp_fma3_process_denormal +Lexp_fma3_process_normal: + shl rcx,52 + vmovq xmm2,rcx + vpaddq xmm0,xmm0,xmm2 + StackDeallocate stack_size + ret + +ALIGN 16 +Lexp_fma3_process_denormal: + jl Lexp_fma3_process_true_denormal + vucomisd xmm0,L__real_one + jae Lexp_fma3_process_normal +Lexp_fma3_process_true_denormal: + ; here ( e^r < 1 and m = -1022 ) or m <= -1023 + add ecx,1074 + mov rax,1 + shl rax,cl + vmovq xmm2,rax + vmulsd xmm0,xmm0,xmm2 + jmp Lexp_fma3_finish + +Lexp_fma3_y_is_one: + vmovsd xmm0, L__real_one + jmp Lexp_fma3_finish + + +ALIGN 16 +Lexp_fma3_y_is_inf: + mov rax,07ff0000000000000h + vmovq xmm1,rax + mov r8d,__flag_y_inf + call fname_special + jmp Lexp_fma3_finish + +ALIGN 16 +Lexp_fma3_return_1_plus_x: + cmp rdx, L__min_normal + jbe Lexp_fma3_return_1_plus_eps + vaddsd xmm0, xmm0, L__real_one + StackDeallocate stack_size + ret 0 + +; Some hardware really does not like subnormals. Try to avoid them. 
+ALIGN 16 +Lexp_fma3_return_1_plus_eps: + vmovsd xmm0, L__real_one + vaddsd xmm0, xmm0, L__min_normal ; make sure inexact is set + StackDeallocate stack_size + ret 0 + +ALIGN 16 +Lexp_fma3_y_is_zero: + vpxor xmm1,xmm1,xmm1 + mov r8d,__flag_y_zero + call fname_special + jmp Lexp_fma3_finish + +ALIGN 16 +Lexp_fma3_return_zero_without_exception: + vpxor xmm0,xmm0,xmm0 + +ALIGN 16 +Lexp_fma3_finish: + StackDeallocate stack_size + ret + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/exp2.c b/sdk/lib/crt/math/libm_sse2/exp2.c new file mode 100644 index 00000000000..1061b5bc85d --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/exp2.c @@ -0,0 +1,162 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_SPLITEXP +#define USE_SCALEDOUBLE_1 +#define USE_SCALEDOUBLE_2 +#define USE_ZERO_WITH_FLAGS +#define USE_INFINITY_WITH_FLAGS +#define USE_HANDLE_ERROR + +#include "libm_inlines.h" +#undef USE_ZERO_WITH_FLAGS +#undef USE_SPLITEXP +#undef USE_SCALEDOUBLE_1 +#undef USE_SCALEDOUBLE_2 +#undef USE_INFINITY_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + +/* exp2 is only provided for use by powf under Windows, so give + it a leading underscore. */ +double FN_PROTOTYPE(_exp2)(double x) +{ + static const double + max_exp2_arg = 1024.0, /* 0x4090000000000000 */ + min_exp2_arg = -1074.0, /* 0xc090c80000000000 */ + log2 = 6.931471805599453094178e-01, /* 0x3fe62e42fefa39ef */ + log2_lead = 6.93147167563438415527E-01, /* 0x3fe62e42f8000000 */ + log2_tail = 1.29965068938898869640E-08, /* 0x3e4be8e7bcd5e4f1 */ + one_by_32_lead = 0.03125; + + double y, z1, z2, z, hx, tx, y1, y2; + int m; + unsigned long ux, ax; + + /* + Computation of exp2(x). + + We compute the values m, z1, and z2 such that + exp2(x) = 2**m * (z1 + z2), where exp2(x) is 2**x. + + Computations needed in order to obtain m, z1, and z2 + involve three steps. + + First, we reduce the argument x to the form + x = n/32 + remainder, + where n has the value of an integer and |remainder| <= 1/64. + The value of n = x * 32 rounded to the nearest integer and + the remainder = x - n/32. + + Second, we approximate exp2(r1 + r2) - 1 where r1 is the leading + part of the remainder and r2 is the trailing part of the remainder. 
+ + Third, we reconstruct exp2(x) so that + exp2(x) = 2**m * (z1 + z2). + */ + + + GET_BITS_DP64(x, ux); + ax = ux & (~SIGNBIT_DP64); + + if (ax >= 0x4090000000000000) /* abs(x) >= 1024.0 */ + { + if(ax >= 0x7ff0000000000000) + { + /* x is either NaN or infinity */ + if (ux & MANTBITS_DP64) + /* x is NaN */ + return _handle_error("exp2", OP_EXP, ux|0x0008000000000000, _DOMAIN, + 0, EDOM, x, 0.0, 1); + else if (ux & SIGNBIT_DP64) + /* x is negative infinity; return 0.0 with no flags. */ + return 0.0; + else + /* x is positive infinity */ + return x; + } + if (x > max_exp2_arg) + /* Return +infinity with overflow flag */ + return _handle_error("exp2", OP_EXP, PINFBITPATT_DP64, _OVERFLOW, + AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, 0.0, 1); + else if (x < min_exp2_arg) + /* x is negative. Return +zero with underflow and inexact flags */ + return _handle_error("exp2", OP_EXP, 0, _UNDERFLOW, + AMD_F_UNDERFLOW | AMD_F_INEXACT, ERANGE, x, 0.0, 1); + } + + + /* Handle small arguments separately */ + if (ax < 0x3fb7154764ee6c2f) /* abs(x) < 1/(16*log2) */ + { + if (ax < 0x3c00000000000000) /* abs(x) < 2^(-63) */ + return 1.0 + x; /* Raises inexact if x is non-zero */ + else + { + /* Split x into hx (head) and tx (tail). */ + unsigned long u; + hx = x; + GET_BITS_DP64(hx, u); + u &= 0xfffffffff8000000; + PUT_BITS_DP64(u, hx); + tx = x - hx; + /* Carefully multiply x by log2. y1 is the most significant + part of the result, and y2 the least significant part */ + y1 = x * log2_lead; + y2 = (((hx * log2_lead - y1) + hx * log2_tail) + + tx * log2_lead) + tx * log2_tail; + + y = y1 + y2; + z = (9.99564649780173690e-1 + + (1.61251249355268050e-5 + + (2.37986978239838493e-2 + + 2.68724774856111190e-7*y)*y)*y)/ + (9.99564649780173692e-1 + + (-4.99766199765151309e-1 + + (1.070876894098586184e-1 + + (-1.189773642681502232e-2 + + 5.9480622371960190616e-4*y)*y)*y)*y); + z = ((z * y1) + (z * y2)) + 1.0; + } + } + else + { + /* Find m, z1 and z2 such that exp2(x) = 2**m * (z1 + z2) */ + + splitexp(x, log2, 32.0, one_by_32_lead, 0.0, &m, &z1, &z2); + + /* Scale (z1 + z2) by 2.0**m */ + if (m > EMIN_DP64 && m < EMAX_DP64) + z = scaleDouble_1((z1+z2),m); + else + z = scaleDouble_2((z1+z2),m); + } + return z; +} diff --git a/sdk/lib/crt/math/libm_sse2/exp_special.c b/sdk/lib/crt/math/libm_sse2/exp_special.c new file mode 100644 index 00000000000..669c574f06f --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/exp_special.c @@ -0,0 +1,101 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "libm_new.h" + +// y = expf(x) +// y = exp(x) + +// these codes and the ones in the related .asm files have to match +#define EXP_X_NAN 1 +#define EXP_Y_ZERO 2 +#define EXP_Y_INF 3 + +float _expf_special(float x, float y, U32 code) +{ + switch(code) + { + case EXP_X_NAN: + { + UT64 ym; ym.u64 = 0; ym.f32[0] = y; + _handle_errorf("expf", _FpCodeExp, ym.u64, _DOMAIN, 0, EDOM, x, 0.0, 1); + } + break; + + case EXP_Y_ZERO: + { + UT64 ym; ym.u64 = 0; ym.f32[0] = y; + _handle_errorf("expf", _FpCodeExp, ym.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, 0.0, 1); + } + break; + + case EXP_Y_INF: + { + UT64 ym; ym.u64 = 0; ym.f32[0] = y; + _handle_errorf("expf", _FpCodeExp, ym.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, 0.0, 1); + + } + break; + } + + return y; +} + +double _exp_special(double x, double y, U32 code) +{ + switch(code) + { + case EXP_X_NAN: + { + UT64 ym; ym.f64 = y; + _handle_error("exp", _FpCodeExp, ym.u64, _DOMAIN, 0, EDOM, x, 0.0, 1); + } + break; + + case EXP_Y_ZERO: + { + UT64 ym; ym.f64 = y; + _handle_error("exp", _FpCodeExp, ym.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, 0.0, 1); + } + break; + + case EXP_Y_INF: + { + UT64 ym; ym.f64 = y; + _handle_error("exp", _FpCodeExp, ym.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, 0.0, 1); + } + break; + } + + + return y; +} diff --git a/sdk/lib/crt/math/libm_sse2/expf.asm b/sdk/lib/crt/math/libm_sse2/expf.asm new file mode 100644 index 00000000000..2bf0dda2170 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/expf.asm @@ -0,0 +1,303 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; expf.asm +; +; An implementation of the expf libm function. +; +; Prototype: +; +; float expf(float x); +; + +; +; Algorithm: +; Similar to one presnted in exp.asm +; +; If FMA3 hardware is available, an FMA3 implementation of expf will be used. 
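+; A rough C sketch of the scheme used below (illustrative only: the
+; round-to-nearest behaviour of cvtpd2dq is modelled with nearbyint,
+; two_to_jby64[] stands for __two_to_jby64_table, and M_LN2/scalbn are
+; assumed from <math.h>):
+;
+;     double dx = x;                              /* work in double     */
+;     int    n  = (int)nearbyint(dx * 64.0 / M_LN2);
+;     double r  = dx - n * (M_LN2 / 64.0);        /* |r| <= ln(2)/128   */
+;     int    j  = n & 0x3f, m = (n - j) / 64;
+;     double q  = r + r * r * (0.5 + r * (1.0 / 6.0));
+;     double f  = two_to_jby64[j];                /* 2^(j/64)           */
+;     return (float)scalbn(f + f * q, m);         /* 2^m * 2^(j/64)*e^r */
+;
+; Overflow, underflow and NaN/infinity inputs are peeled off before this
+; point, as in the code below.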
+ + +.const +ALIGN 16 + +__real_inf DD 7f800000h + DD 0 + DQ 0 + +__real_ninf DD 0ff800000h + DD 0 + DQ 0 + +__real_qnanbit DD 00400000h + DD 0 + DQ 0 + +__real_zero DD 00000000h + DD 0 + DQ 0 + +__real_p8192 DQ 40c0000000000000h + DQ 0 +__real_m9600 DQ 0c0c2c00000000000h + DQ 0 + +__real_64_by_log2 DQ 40571547652b82feh ; 64/ln(2) + DQ 0 +__real_log2_by_64 DQ 3f862e42fefa39efh ; log2_by_64 + DQ 0 + +__real_1_by_6 DQ 3fc5555555555555h ; 1/6 + DQ 0 +__real_1_by_2 DQ 3fe0000000000000h ; 1/2 + DQ 0 + +; these codes and the ones in the corresponding .c file have to match +__flag_x_nan DD 00000001 +__flag_y_zero DD 00000002 +__flag_y_inf DD 00000003 + +EXTRN __two_to_jby64_table:QWORD +EXTRN __use_fma3_lib:DWORD + +fname TEXTEQU +fname_special TEXTEQU <_expf_special> + +; define local variable storage offsets + +; make room for fname_special to save things +dummy_space EQU 020h +stack_size EQU 038h + +include fm.inc + +; external function +EXTERN fname_special:PROC + +.code + +ALIGN 16 +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + + ; Do this to avoid possible exceptions from a NaN argument. + movd edx, xmm0 + btr edx,31 + cmp edx, DWORD PTR __real_inf + jge Lexpf_x_is_inf_or_nan + + cmp DWORD PTR __use_fma3_lib, 0 + jne Lexpf_fma3 + +Lexpf_sse2: + + cvtss2sd xmm0, xmm0 + + ; x * (64/ln(2)) + movsd xmm3, QWORD PTR __real_64_by_log2 + mulsd xmm3, xmm0 + + ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128 + ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150) + comisd xmm3, QWORD PTR __real_p8192 + jae Lexpf_y_is_inf + + comisd xmm3, QWORD PTR __real_m9600 + jb Lexpf_y_is_zero + + ; n = int( x * (64/ln(2)) ) + cvtpd2dq xmm4, xmm3 + lea r10, __two_to_jby64_table + cvtdq2pd xmm1, xmm4 + + ; r = x - n * ln(2)/64 + movsd xmm2, QWORD PTR __real_log2_by_64 + mulsd xmm2, xmm1 + movd ecx, xmm4 + mov rax, 3fh + and eax, ecx + subsd xmm0, xmm2 + movsd xmm1, xmm0 + + ; m = (n - j) / 64 + sub ecx, eax + sar ecx, 6 + + ; q + movsd xmm3, QWORD PTR __real_1_by_6 + mulsd xmm3, xmm0 + mulsd xmm0, xmm0 + addsd xmm3, QWORD PTR __real_1_by_2 + mulsd xmm0, xmm3 + addsd xmm0, xmm1 + + add rcx, 1023 + shl rcx, 52 + + ; (f)*(1+q) + movsd xmm2, QWORD PTR [r10+rax*8] + mulsd xmm0, xmm2 + addsd xmm0, xmm2 + + movd xmm1, rcx + mulsd xmm0, xmm1 + cvtsd2ss xmm0, xmm0 + +Lexpf_final_check: + StackDeallocate stack_size + ret + +ALIGN 16 +Lexpf_y_is_zero: + + movss xmm1, DWORD PTR __real_zero + movd xmm0, edx + mov r8d, DWORD PTR __flag_y_zero + + call fname_special + jmp Lexpf_finish + +ALIGN 16 +Lexpf_y_is_inf: + + movss xmm1, DWORD PTR __real_inf + movd xmm0, edx + mov r8d, DWORD PTR __flag_y_inf + + call fname_special + jmp Lexpf_finish + +ALIGN 16 +Lexpf_x_is_inf_or_nan: + + cmp edx, DWORD PTR __real_inf + je Lexpf_finish + + cmp edx, DWORD PTR __real_ninf + je Lexpf_process_zero + + or edx, DWORD PTR __real_qnanbit + movd xmm1, edx + mov r8d, DWORD PTR __flag_x_nan + call fname_special + jmp Lexpf_finish + +ALIGN 16 +Lexpf_process_zero: + movss xmm0, DWORD PTR __real_zero + jmp Lexpf_final_check + +ALIGN 16 +Lexpf_finish: + StackDeallocate stack_size + ret + + +ALIGN 16 +Lexpf_fma3: + + vcvtss2sd xmm0, xmm0, xmm0 + + ; x * (64/ln(2)) + vmulsd xmm3, xmm0, QWORD PTR __real_64_by_log2 + + ; x <= 128*ln(2), ( x * (64/ln(2)) ) <= 64*128 + ; x > -150*ln(2), ( x * (64/ln(2)) ) > 64*(-150) + vcomisd xmm3, QWORD PTR __real_p8192 + jae Lexpf_fma3_y_is_inf + + vucomisd xmm3, QWORD PTR __real_m9600 + jb Lexpf_fma3_y_is_zero + + ; n = int( x * (64/ln(2)) ) + vcvtpd2dq xmm4, xmm3 + lea r10, __two_to_jby64_table + 
vcvtdq2pd xmm1, xmm4 + + ; r = x - n * ln(2)/64 + vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64 + vmovd ecx, xmm4 + mov rax, 3fh + and eax, ecx + vmovapd xmm1, xmm0 ; xmm1 <-- copy of r + + ; m = (n - j) / 64 + sub ecx, eax + sar ecx, 6 + + ; q + vmovsd xmm3, QWORD PTR __real_1_by_6 + vmulsd xmm0, xmm0, xmm0 ; xmm0 <-- r^2 + vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_by_2 ; xmm3 <-- r/6 + 1/2 + vfmadd213sd xmm0, xmm3, xmm1 ; xmm0 <-- q = r^2*(r/6 + 1/2) + r + + add rcx, 1023 + shl rcx, 52 + + ; (f)*(1+q) + vmovsd xmm2, QWORD PTR [r10+rax*8] + vfmadd213sd xmm0, xmm2, xmm2 + + vmovq xmm2,rcx + vmulsd xmm0, xmm0, xmm2 + vcvtsd2ss xmm0, xmm0, xmm0 + +Lexpf_fma3_final_check: + StackDeallocate stack_size + ret + +ALIGN 16 +Lexpf_fma3_y_is_zero: + + vmovss xmm1, DWORD PTR __real_zero + vmovd xmm0, edx + mov r8d, DWORD PTR __flag_y_zero + + call fname_special + jmp Lexpf_fma3_finish + +ALIGN 16 +Lexpf_fma3_y_is_inf: + + vmovss xmm1, DWORD PTR __real_inf + vmovd xmm0, edx + mov r8d, DWORD PTR __flag_y_inf + + call fname_special + jmp Lexpf_fma3_finish + +ALIGN 16 +Lexpf_fma3_process_zero: + vmovss xmm0, DWORD PTR __real_zero + jmp Lexpf_fma3_final_check + +ALIGN 16 +Lexpf_fma3_finish: + StackDeallocate stack_size + ret + +fname endp + +END diff --git a/sdk/lib/crt/math/libm_sse2/floor.c b/sdk/lib/crt/math/libm_sse2/floor.c new file mode 100644 index 00000000000..cf8e18f8576 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/floor.c @@ -0,0 +1,85 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#include "libm_errno.h" +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_HANDLE_ERROR + +#pragma function(floor) + +double FN_PROTOTYPE(floor)(double x) +{ + double r; + long rexp, xneg; + + + unsigned long ux, ax, ur, mask; + + GET_BITS_DP64(x, ux); + ax = ux & (~SIGNBIT_DP64); + xneg = (ux != ax); + + if (ax >= 0x4340000000000000) + { + /* abs(x) is either NaN, infinity, or >= 2^53 */ + if (ax > 0x7ff0000000000000) + /* x is NaN */ + return _handle_error("floor", OP_FLOOR, ux|0x0008000000000000, _DOMAIN, + 0, EDOM, x, 0.0, 1); + else + return x; + } + else if (ax < 0x3ff0000000000000) /* abs(x) < 1.0 */ + { + if (ax == 0x0000000000000000) + /* x is +zero or -zero; return the same zero */ + return x; + else if (xneg) /* x < 0.0 */ + return -1.0; + else + return 0.0; + } + else + { + r = x; + rexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + /* Mask out the bits of r that we don't want */ + mask = 1; + mask = (mask << (EXPSHIFTBITS_DP64 - rexp)) - 1; + ur = (ux & ~mask); + PUT_BITS_DP64(ur, r); + if (xneg && (ur != ux)) + /* We threw some bits away and x was negative */ + return r - 1.0; + else + return r; + } + +} diff --git a/sdk/lib/crt/math/libm_sse2/floorf.c b/sdk/lib/crt/math/libm_sse2/floorf.c new file mode 100644 index 00000000000..7920ea7d0ef --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/floorf.c @@ -0,0 +1,83 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#include "libm_errno.h" +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_HANDLE_ERRORF + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. 
+#pragma warning(disable:4163) +#pragma function(floorf) + +float FN_PROTOTYPE(floorf)(float x) +{ + float r; + int rexp, xneg; + unsigned int ux, ax, ur, mask; + + GET_BITS_SP32(x, ux); + ax = ux & (~SIGNBIT_SP32); + xneg = (ux != ax); + + if (ax >= 0x4b800000) + { + /* abs(x) is either NaN, infinity, or >= 2^24 */ + if (ax > 0x7f800000) + /* x is NaN */ + return _handle_errorf("floorf", OP_FLOOR, ux|0x00400000, _DOMAIN, + 0, EDOM, x, 0.0F, 1); + else + return x; + } + else if (ax < 0x3f800000) /* abs(x) < 1.0 */ + { + if (ax == 0x00000000) + /* x is +zero or -zero; return the same zero */ + return x; + else if (xneg) /* x < 0.0 */ + return -1.0F; + else + return 0.0F; + } + else + { + rexp = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + /* Mask out the bits of r that we don't want */ + mask = (1 << (EXPSHIFTBITS_SP32 - rexp)) - 1; + ur = (ux & ~mask); + PUT_BITS_SP32(ur, r); + if (xneg && (ux != ur)) + /* We threw some bits away and x was negative */ + return r - 1.0F; + else + return r; + } +} diff --git a/sdk/lib/crt/math/libm_sse2/fm.inc b/sdk/lib/crt/math/libm_sse2/fm.inc new file mode 100644 index 00000000000..533c70e9c96 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/fm.inc @@ -0,0 +1,39 @@ +StackAllocate MACRO size + if size ne 0 + sub rsp, size + .ALLOCSTACK size + endif +ENDM + +StackDeallocate MACRO size + if size ne 0 + add rsp, size + endif +ENDM + +SaveReg MACRO reg64, offset + mov QWORD PTR [rsp+offset], reg64 + .SAVEREG reg64, offset +ENDM + +RestoreReg MACRO reg64, offset + mov reg64, QWORD PTR [rsp+offset] +ENDM + +SaveXmm MACRO xmmreg, offset + movdqa XMMWORD PTR [offset+rsp], xmmreg + .SAVEXMM128 xmmreg, offset +ENDM + +RestoreXmm MACRO xmmreg, offset + movdqa xmmreg, XMMWORD PTR [offset+rsp] +ENDM + +AVXSaveXmm MACRO xmmreg, offset + vmovdqa XMMWORD PTR [offset+rsp], xmmreg + .SAVEXMM128 xmmreg, offset +ENDM + +AVXRestoreXmm MACRO xmmreg, offset + vmovdqa xmmreg, XMMWORD PTR [offset+rsp] +ENDM diff --git a/sdk/lib/crt/math/libm_sse2/fma3_available.c b/sdk/lib/crt/math/libm_sse2/fma3_available.c new file mode 100644 index 00000000000..123cba72194 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/fma3_available.c @@ -0,0 +1,66 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifdef TEST_STANDALONE +#include +#pragma section (".CRT$XIC",long,read) +typedef void (__cdecl *_PIFV)(void); +#else +#include +#include +#include +#include +#endif + +#define _CRTALLOC(x) __declspec(allocate(x)) + +int __fma3_is_available = 0; +int __use_fma3_lib = 0; + + +int __cdecl _set_FMA3_enable(int flag) +{ + if (__fma3_is_available) __use_fma3_lib = flag; + return __use_fma3_lib; +} + +int __fma3_lib_init(void); + +_CRTALLOC(".CRT$XIC") static _PIFV init_fma3 = __fma3_lib_init; + +int __fma3_lib_init(void) +{ + int CPUID[4]; // CPUID[2] is ECX; + + __fma3_is_available = 0; + __cpuid(CPUID, 1); + if (CPUID[2] & (1 << 12)) { + __fma3_is_available = 1; + } + + __use_fma3_lib = __fma3_is_available; + return 0; +} diff --git a/sdk/lib/crt/math/libm_sse2/fmod.asm b/sdk/lib/crt/math/libm_sse2/fmod.asm new file mode 100644 index 00000000000..11778f27bfb --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/fmod.asm @@ -0,0 +1,160 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; $Workfile: fmod.asm $ +; $Revision: 4 $ +; $Date: 9/15/04 16:43 $ +; +; +; This is an optimized version of fmod. +; +; Define _CRTBLD_C9X to make it compliant with C90 and on. +; +; If building the OS CRTL (_NTSUBSET_ defined), abort. 
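+; The reduction is done with the x87 fprem instruction. fprem produces only
+; a partial remainder when the exponents of x and y are far apart (it
+; reduces the exponent difference by at most 63 per pass) and reports this
+; by setting the C2 condition flag, so the instruction is simply retried
+; until C2 is clear. Unlike fprem1, fprem truncates the implied quotient
+; toward zero, which is the rounding C's fmod requires. A rough model of
+; the loop below (illustrative only; "partial" and "status_word" stand in
+; for ST(0) and the FPU status word):
+;
+;     do {
+;         partial = fprem(x, y);        /* one reduction step           */
+;     } while (status_word & C2);       /* C2 set: result still partial */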
+ +; .ERRDEF _NTSUBSET_, "x87 code cannot be used in kernel mode" + +DOMAIN EQU 1 ; _DOMAIN +EDOM EQU 33 ; EDOM +FPCODEFMOD EQU 22 ; _FpCodeFmod +INVALID EQU 8 ; AMD_F_INVALID + +FPIND EQU 0fff8000000000000h ; indefinite +FPSNAN EQU 07ff7ffffffffffffh ; SNAN +FPQNAN EQU 07fffffffffffffffh ; QNAN + +X87SW RECORD X87SW_B: 1, + X87SW_C3: 1, + X87SW_TOP: 3, + X87SW_C: 3, + X87SW_ES: 1, + X87SW_SF: 1, + X87SW_PE: 1, + X87SW_E: 5 + +X87XAM EQU MASK X87SW_C3 OR MASK X87SW_C AND NOT (1 SHL (X87SW_C + 1)) +X87XAM_INF EQU 5 SHL X87SW_C +X87XAM_NAN EQU 1 SHL X87SW_C +X87XAM_BAD EQU MASK X87SW_E AND NOT 2 + + EXTRN _handle_error: PROC ; float _handle_error (char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs) + + .const + +@fmodz DB "fmod", 0 + + .CODE + +; double fmod [double, double] ---------------------------------- + +fmod PROC FRAME + + sub rsp, 40 + 32 + + .ALLOCSTACK 40 + 32 + .ENDPROLOG + + movsd QWORD PTR 24 [rsp + 32], xmm1 ; Y + movsd QWORD PTR 16 [rsp + 32], xmm0 ; X + + DB 0ddh, 44h, 24h, 38h ; fld QWORD PTR 24 [rsp + 32] + DB 0ddh, 44h, 24h, 30h ; fld QWORD PTR 16 [rsp + 32] + + DB 0d9h, 0e5h ; fxam (X) + DB 09bh, 0ddh, 07ch, 024h, 010h ; fstsw 16 [rsp] + + movzx ecx, WORD PTR 16 [rsp] + and ecx, X87XAM + + fnclex ; clear exception flags + ; in preparation for fprem + +@@: + DB 0d9h, 0f8h ; fprem + + DB 09bh, 0dfh, 0e0h ; fstsw ax + test ax, 4 SHL X87SW_C + jnz @b ; do it again in case of partial result + + DB 0ddh, 01ch, 024h ; fstp QWORD PTR [rsp] + movlpd xmm0, QWORD PTR [rsp] ; result + + DB 0d9h, 0e5h ; fxam (Y) + DB 09bh, 0ddh, 07ch, 024h, 008h ; fstsw 8 [rsp] + + movzx edx, WORD PTR 8 [rsp] + and edx, X87XAM + + DB 0ddh, 0d8h ; fstp st(0) + + cmp edx, X87XAM_NAN ; fmod (x, NAN) = QNAN + je @error + + cmp ecx, X87XAM_NAN ; fmod (NAN, y) = QNAN + je @error + + and eax, X87XAM_BAD + jnz @raise ; handle error + + IFNDEF _CRTBLD_C9X ; Not C90 + cmp edx, X87XAM_INF ; fmod (x, infinity) = ??? + je @raise + ELSE ; C90 + ; fmod (x, infinity) = x (as x87 already does) + ENDIF + +@exit: + add rsp, 40 + 32 + ret + + ALIGN 16 + +@raise: + mov eax, INVALID ; raise exception + mov r8, FPIND + jmp @f + +@error: + xor eax, eax ; no exception + movd r8, xmm0 + jmp @f + +@@: + lea rcx, [@fmodz] ; fname + mov edx, FPCODEFMOD ; opcode +; mov r8, INDEF ; value + mov r9d, DOMAIN ; type + mov DWORD PTR 0 [rsp + 32], eax ; flags + mov DWORD PTR 8 [rsp + 32], EDOM ; error + mov DWORD PTR 32 [rsp + 32], 2 ; nargs + call _handle_error ; (char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs) + + DB 09bh, 0dbh, 0e2h ; fclex + jmp @exit + +fmod ENDP + +; --------------------------------------------------------------- + + END diff --git a/sdk/lib/crt/math/libm_sse2/fmodf.asm b/sdk/lib/crt/math/libm_sse2/fmodf.asm new file mode 100644 index 00000000000..e015ca66a6c --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/fmodf.asm @@ -0,0 +1,160 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
+; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; $Workfile: fmodf.asm $ +; $Revision: 4 $ +; $Date: 9/15/04 16:43 $ +; +; +; This is an optimized version of fmod. +; +; Define _CRTBLD_C9X to make it compliant with C90 and on. +; +; If building the OS CRTL (_NTSUBSET_ defined), abort. + + .ERRDEF _NTSUBSET_, "x87 code cannot be used in kernel mode" + +DOMAIN EQU 1 ; _DOMAIN +EDOM EQU 33 ; EDOM +FPCODEFMOD EQU 22 ; _FpCodeFmod +INVALID EQU 8 ; AMD_F_INVALID + +FPIND EQU 0ffc00000h ; indefinite +FPSNAN EQU 07fbfffffh ; SNAN +FPQNAN EQU 07fffffffh ; QNAN + +X87SW RECORD X87SW_B: 1, + X87SW_C3: 1, + X87SW_TOP: 3, + X87SW_C: 3, + X87SW_ES: 1, + X87SW_SF: 1, + X87SW_PE: 1, + X87SW_E: 5 + +X87XAM EQU MASK X87SW_C3 OR MASK X87SW_C AND NOT (1 SHL (X87SW_C + 1)) +X87XAM_INF EQU 5 SHL X87SW_C +X87XAM_NAN EQU 1 SHL X87SW_C +X87XAM_BAD EQU MASK X87SW_E AND NOT 2 + + EXTRN _handle_errorf: PROC ; float _handle_error (char *fname, int opcode, unsigned long value, int type, int flags, int error, float arg1, float arg2, int nargs) + + .CONST + +@fmodfz DB "fmodf", 0 + + .CODE + +; float fmodf [float, float] ------------------------------------ + +fmodf PROC FRAME + + sub rsp, 40 + 32 + + .ALLOCSTACK 40 + 32 + .ENDPROLOG + + movss DWORD PTR 24 [rsp + 32], xmm1 + movss DWORD PTR 16 [rsp + 32], xmm0 + + DB 0d9h, 44h, 24h, 38h ; fld DWORD PTR 24 [rsp + 32] + DB 0d9h, 44h, 24h, 30h ; fld DWORD PTR 16 [rsp + 32] + + DB 0d9h, 0e5h ; fxam (X) + DB 09bh, 0ddh, 07ch, 024h, 010h ; fstsw 16 [rsp] + + movzx ecx, WORD PTR 16 [rsp] + and ecx, X87XAM + + fnclex ; clear exception flags + ; in preparation for fprem + +@@: + DB 0d9h, 0f8h ; fprem + + DB 9bh, 0dfh, 0e0h ; fstsw ax + test ax, 00400h + jnz @b ; do it again in case of partial result + + DB 0d9h, 1ch, 24h ; fstp DWORD PTR [rsp] + movss xmm0, DWORD PTR [rsp] ; result + + DB 0d9h, 0e5h ; fxam (Y) + DB 09bh, 0ddh, 07ch, 024h, 008h ; fstsw 8 [rsp] + + movzx edx, WORD PTR 8 [rsp] + and edx, X87XAM + + DB 0ddh, 0d8h ; fstp st(0) + + cmp edx, X87XAM_NAN ; fmod (x, NAN) = QNAN + je @error + + cmp ecx, X87XAM_NAN ; fmod (NAN, y) = QNAN + je @error + + and eax, X87XAM_BAD + jnz @raise ; handle error + + IFNDEF _CRTBLD_C9X ; Not C90 + cmp edx, X87XAM_INF ; fmod (x, infinity) = ??? 
+ je @raise + ELSE ; C90 + ; fmod (x, infinity) = x (as x87 already does) + ENDIF + +@exit: + add rsp, 40 + 32 + ret + + ALIGN 16 + +@raise: + mov eax, INVALID ; raise exception + mov r8d, FPIND + jmp @f + +@error: + xor eax, eax ; no exception + movd r8d, xmm0 + jmp @f + +@@: + lea rcx, [@fmodfz] ; fname + mov edx, FPCODEFMOD ; opcode +; mov r8d, [rsp] ; value + mov r9d, DOMAIN ; type + mov DWORD PTR 0 [rsp + 32], eax ; flags + mov DWORD PTR 8 [rsp + 32], EDOM ; error + mov DWORD PTR 32 [rsp + 32], 2 ; nargs + call _handle_errorf ; (char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs) + + DB 9Bh, 0DBh, 0E2h ; fclex + jmp @exit + +fmodf ENDP + +; --------------------------------------------------------------- + + END diff --git a/sdk/lib/crt/math/libm_sse2/hypot.c b/sdk/lib/crt/math/libm_sse2/hypot.c new file mode 100644 index 00000000000..dabaae103a7 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/hypot.c @@ -0,0 +1,198 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define FAST_BUT_GREATER_THAN_ONE_ULP /* Helps speed by trading off a little + accuracy */ +#define USE_SCALEDOUBLE_1 +#define USE_INFINITY_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_SCALEDOUBLE_1 +#undef USE_INFINITY_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + + +double FN_PROTOTYPE(_hypot)(double x, double y) +{ + /* Returns sqrt(x*x + y*y) with no overflow or underflow unless + the result warrants it */ + + const double large = 1.79769313486231570815e+308; /* 0x7fefffffffffffff */ + +#ifdef FAST_BUT_GREATER_THAN_ONE_ULP + double r, retval; + unsigned long xexp, yexp, ux, uy; +#else + double u, r, retval, hx, tx, x2, hy, ty, y2, hs, ts; + unsigned long xexp, yexp, ux, uy, ut; +#endif + int dexp, expadjust; + + GET_BITS_DP64(x, ux); + ux &= ~SIGNBIT_DP64; + GET_BITS_DP64(y, uy); + uy &= ~SIGNBIT_DP64; + xexp = (ux >> EXPSHIFTBITS_DP64); + yexp = (uy >> EXPSHIFTBITS_DP64); + + if (xexp == BIASEDEMAX_DP64 + 1 || yexp == BIASEDEMAX_DP64 + 1) + { + /* One or both of the arguments are NaN or infinity. The + result will also be NaN or infinity. 
*/ + retval = x*x + y*y; + if (((xexp == BIASEDEMAX_DP64 + 1) && !(ux & MANTBITS_DP64)) || + ((yexp == BIASEDEMAX_DP64 + 1) && !(uy & MANTBITS_DP64))) + /* x or y is infinity. ISO C99 defines that we must + return +infinity, even if the other argument is NaN. + Note that the computation of x*x + y*y above will already + have raised invalid if either x or y is a signalling NaN. */ + return infinity_with_flags(0); + else + /* One or both of x or y is NaN, and neither is infinity. + Raise invalid if it's a signalling NaN */ + return retval; + } + + /* Set x = abs(x) and y = abs(y) */ + PUT_BITS_DP64(ux, x); + PUT_BITS_DP64(uy, y); + + /* The difference in exponents between x and y */ + dexp = (int)(xexp - yexp); + expadjust = 0; + + if (ux == 0) + /* x is zero */ + return y; + else if (uy == 0) + /* y is zero */ + return x; + else if (dexp > MANTLENGTH_DP64 + 1 || dexp < -MANTLENGTH_DP64 - 1) + /* One of x and y is insignificant compared to the other */ + return x + y; /* Raise inexact */ + else if (xexp > EXPBIAS_DP64 + 500 || yexp > EXPBIAS_DP64 + 500) + { + /* Danger of overflow; scale down by 2**600. */ + expadjust = 600; + ux -= 0x2580000000000000; + PUT_BITS_DP64(ux, x); + uy -= 0x2580000000000000; + PUT_BITS_DP64(uy, y); + } + else if (xexp < EXPBIAS_DP64 - 500 || yexp < EXPBIAS_DP64 - 500) + { + /* Danger of underflow; scale up by 2**600. */ + expadjust = -600; + if (xexp == 0) + { + /* x is denormal - handle by adding 601 to the exponent + and then subtracting a correction for the implicit bit */ + PUT_BITS_DP64(ux + 0x2590000000000000, x); + x -= 9.23297861778573578076e-128; /* 0x2590000000000000 */ + GET_BITS_DP64(x, ux); + } + else + { + /* x is normal - just increase the exponent by 600 */ + ux += 0x2580000000000000; + PUT_BITS_DP64(ux, x); + } + if (yexp == 0) + { + PUT_BITS_DP64(uy + 0x2590000000000000, y); + y -= 9.23297861778573578076e-128; /* 0x2590000000000000 */ + GET_BITS_DP64(y, uy); + } + else + { + uy += 0x2580000000000000; + PUT_BITS_DP64(uy, y); + } + } + + +#ifdef FAST_BUT_GREATER_THAN_ONE_ULP + /* Not awful, but results in accuracy loss larger than 1 ulp */ + r = x*x + y*y; +#else + /* Slower but more accurate */ + + /* Sort so that x is greater than y */ + if (x < y) + { + u = y; + y = x; + x = u; + ut = ux; + ux = uy; + uy = ut; + } + + /* Split x into hx and tx, head and tail */ + PUT_BITS_DP64(ux & 0xfffffffff8000000, hx); + tx = x - hx; + + PUT_BITS_DP64(uy & 0xfffffffff8000000, hy); + ty = y - hy; + + /* Compute r = x*x + y*y with extra precision */ + x2 = x*x; + y2 = y*y; + hs = x2 + y2; + + if (dexp == 0) + /* We take most care when x and y have equal exponents, + i.e. are almost the same size */ + ts = (((x2 - hs) + y2) + + ((hx * hx - x2) + 2 * hx * tx) + tx * tx) + + ((hy * hy - y2) + 2 * hy * ty) + ty * ty; + else + ts = (((x2 - hs) + y2) + + ((hx * hx - x2) + 2 * hx * tx) + tx * tx); + + r = hs + ts; +#endif + + /* The sqrt can introduce another half ulp error. */ + /* VC++ intrinsic call */ + _mm_store_sd(&retval, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r))); + + /* If necessary scale the result back. This may lead to + overflow but if so that's the correct result. */ + retval = scaleDouble_1(retval, expadjust); + + if (retval > large) + /* The result overflowed. Deal with errno. 
*/ + return _handle_error("_hypot", OP_HYPOT, PINFBITPATT_DP64, _OVERFLOW, + AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, y, 2); + + return retval; +} diff --git a/sdk/lib/crt/math/libm_sse2/hypotf.c b/sdk/lib/crt/math/libm_sse2/hypotf.c new file mode 100644 index 00000000000..72864564bf7 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/hypotf.c @@ -0,0 +1,99 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#ifdef USE_SOFTWARE_SQRT +#define USE_SQRTF_AMD_INLINE +#endif +#define USE_INFINITYF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#ifdef USE_SOFTWARE_SQRT +#undef USE_SQRTF_AMD_INLINE +#endif +#undef USE_INFINITYF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + + +float FN_PROTOTYPE(_hypotf)(float x, float y) +{ + /* Returns sqrt(x*x + y*y) with no overflow or underflow unless + the result warrants it */ + + /* Do intermediate computations in double precision + and use sqrt instruction from chip if available. */ + double dx = x, dy = y, dr, retval; + + /* The largest finite float, stored as a double */ + const double large = 3.40282346638528859812e+38; /* 0x47efffffe0000000 */ + + + unsigned long ux, uy, avx, avy; + + GET_BITS_DP64(x, avx); + avx &= ~SIGNBIT_DP64; + GET_BITS_DP64(y, avy); + avy &= ~SIGNBIT_DP64; + ux = (avx >> EXPSHIFTBITS_DP64); + uy = (avy >> EXPSHIFTBITS_DP64); + + if (ux == BIASEDEMAX_DP64 + 1 || uy == BIASEDEMAX_DP64 + 1) + { + retval = x*x + y*y; + /* One or both of the arguments are NaN or infinity. The + result will also be NaN or infinity. */ + if (((ux == BIASEDEMAX_DP64 + 1) && !(avx & MANTBITS_DP64)) || + ((uy == BIASEDEMAX_DP64 + 1) && !(avy & MANTBITS_DP64))) + /* x or y is infinity. ISO C99 defines that we must + return +infinity, even if the other argument is NaN. + Note that the computation of x*x + y*y above will already + have raised invalid if either x or y is a signalling NaN. */ + return infinityf_with_flags(0); + else + /* One or both of x or y is NaN, and neither is infinity. 
+ Raise invalid if it's a signalling NaN */ + return (float)retval; + } + + dr = (dx*dx + dy*dy); + +#if USE_SOFTWARE_SQRT + retval = sqrtf_amd_inline(r); +#else + /* VC++ intrinsic call */ + _mm_store_sd(&retval, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&dr))); +#endif + + if (retval > large) + return _handle_errorf("_hypotf", OP_HYPOT, PINFBITPATT_SP32, _OVERFLOW, + AMD_F_OVERFLOW | AMD_F_INEXACT, ERANGE, x, y, 2); + else + return (float)retval; +} diff --git a/sdk/lib/crt/math/libm_sse2/libm.h b/sdk/lib/crt/math/libm_sse2/libm.h new file mode 100644 index 00000000000..675b3eb8c57 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/libm.h @@ -0,0 +1,49 @@ +/***********************************************************************************/ +/** MIT License **/ +/** ----------- **/ +/** **/ +/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/ +/** **/ +/** Permission is hereby granted, free of charge, to any person obtaining a copy **/ +/** of this Software and associated documentaon files (the "Software"), to deal **/ +/** in the Software without restriction, including without limitation the rights **/ +/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ +/** copies of the Software, and to permit persons to whom the Software is **/ +/** furnished to do so, subject to the following conditions: **/ +/** **/ +/** The above copyright notice and this permission notice shall be included in **/ +/** all copies or substantial portions of the Software. **/ +/** **/ +/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ +/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ +/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/ +/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ +/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ +/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ +/** THE SOFTWARE. **/ +/***********************************************************************************/ + +#ifndef LIBM_AMD_H_INCLUDED +#define LIBM_AMD_H_INCLUDED 1 + +#define FN_PROTOTYPE(fname) fname + +#include +#include + +#ifndef IS_64BIT +#define IS_64BIT +#endif + +#ifndef _COMPLEX_DEFINED +struct _complex +{ + double x, y; /* real and imaginary parts */ +}; +#define _COMPLEX_DEFINED +#endif +#define COMPLEX struct _complex + +extern void __remainder_piby2(double x, double *r, double *rr, int *region); + +#endif /* LIBM_AMD_H_INCLUDED */ diff --git a/sdk/lib/crt/math/libm_sse2/libm_errno.h b/sdk/lib/crt/math/libm_sse2/libm_errno.h new file mode 100644 index 00000000000..3d18df802a1 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/libm_errno.h @@ -0,0 +1,35 @@ +/***********************************************************************************/ +/** MIT License **/ +/** ----------- **/ +/** **/ +/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
**/ +/** **/ +/** Permission is hereby granted, free of charge, to any person obtaining a copy **/ +/** of this Software and associated documentaon files (the "Software"), to deal **/ +/** in the Software without restriction, including without limitation the rights **/ +/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ +/** copies of the Software, and to permit persons to whom the Software is **/ +/** furnished to do so, subject to the following conditions: **/ +/** **/ +/** The above copyright notice and this permission notice shall be included in **/ +/** all copies or substantial portions of the Software. **/ +/** **/ +/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ +/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ +/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/ +/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ +/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ +/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ +/** THE SOFTWARE. **/ +/***********************************************************************************/ + +#ifndef LIBM_ERRNO_AMD_H_INCLUDED +#define LIBM_ERRNO_AMD_H_INCLUDED 1 + +#include +#include +#ifndef __set_errno +#define __set_errno(x) errno = (x) +#endif + +#endif /* LIBM_ERRNO_AMD_H_INCLUDED */ diff --git a/sdk/lib/crt/math/libm_sse2/libm_inlines.h b/sdk/lib/crt/math/libm_sse2/libm_inlines.h new file mode 100644 index 00000000000..5937701be6f --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/libm_inlines.h @@ -0,0 +1,2101 @@ +/***********************************************************************************/ +/** MIT License **/ +/** ----------- **/ +/** **/ +/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/ +/** **/ +/** Permission is hereby granted, free of charge, to any person obtaining a copy **/ +/** of this Software and associated documentaon files (the "Software"), to deal **/ +/** in the Software without restriction, including without limitation the rights **/ +/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ +/** copies of the Software, and to permit persons to whom the Software is **/ +/** furnished to do so, subject to the following conditions: **/ +/** **/ +/** The above copyright notice and this permission notice shall be included in **/ +/** all copies or substantial portions of the Software. **/ +/** **/ +/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ +/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ +/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/ +/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ +/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ +/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ +/** THE SOFTWARE. 
**/ +/***********************************************************************************/ + +#ifndef LIBM_INLINES_AMD_H_INCLUDED +#define LIBM_INLINES_AMD_H_INCLUDED 1 + +#include "libm_util.h" + +/* Set defines for inline functions calling other inlines */ +#if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \ + defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \ + defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \ + defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \ + defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \ + defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \ + (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)) +#undef USE_RAISE_FPSW_FLAGS +#define USE_RAISE_FPSW_FLAGS 1 +#endif + +#if defined(USE_SPLITDOUBLE) +/* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. + Assumes that x is not zero, denormal, infinity or NaN, but these conditions + are not checked */ +static inline void splitDouble(double x, int *e, double *m) +{ + unsigned long ux, uy; + GET_BITS_DP64(x, ux); + uy = ux; + ux &= EXPBITS_DP64; + ux >>= EXPSHIFTBITS_DP64; + *e = (int)ux - EXPBIAS_DP64 + 1; + uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64; + PUT_BITS_DP64(uy, x); + *m = x; +} +#endif /* USE_SPLITDOUBLE */ + + +#if defined(USE_SPLITDOUBLE_2) +/* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0. + Assumes that x is not zero, denormal, infinity or NaN, but these conditions + are not checked. Also assumes EXPBIAS_DP is odd. With this + assumption, e will be even on exit. */ +static inline void splitDouble_2(double x, int *e, double *m) +{ + unsigned long ux, vx; + GET_BITS_DP64(x, ux); + vx = ux; + ux &= EXPBITS_DP64; + ux >>= EXPSHIFTBITS_DP64; + if (ux & 1) + { + /* The exponent is odd */ + vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64; + PUT_BITS_DP64(vx, x); + *m = x; + *e = ux - EXPBIAS_DP64; + } + else + { + /* The exponent is even */ + vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64; + PUT_BITS_DP64(vx, x); + *m = x; + *e = ux - EXPBIAS_DP64 - 1; + } +} +#endif /* USE_SPLITDOUBLE_2 */ + + +#if defined(USE_SPLITFLOAT) +/* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. + Assumes that x is not zero, denormal, infinity or NaN, but these conditions + are not checked */ +static inline void splitFloat(float x, int *e, float *m) +{ + unsigned int ux, uy; + GET_BITS_SP32(x, ux); + uy = ux; + ux &= EXPBITS_SP32; + ux >>= EXPSHIFTBITS_SP32; + *e = (int)ux - EXPBIAS_SP32 + 1; + uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32; + PUT_BITS_SP32(uy, x); + *m = x; +} +#endif /* USE_SPLITFLOAT */ + + +#if defined(USE_SCALEDOUBLE_1) +/* Scales the double x by 2.0**n. + Assumes EMIN <= n <= EMAX, though this condition is not checked. */ +static inline double scaleDouble_1(double x, int n) +{ + double t; + /* Construct the number t = 2.0**n */ + PUT_BITS_DP64(((long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t); + return x*t; +} +#endif /* USE_SCALEDOUBLE_1 */ + + +#if defined(USE_SCALEDOUBLE_2) +/* Scales the double x by 2.0**n. + Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. 
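   Splitting n into n1 + n2 keeps each factor 2.0**n1 and 2.0**n2 inside
   the normal range even when 2.0**n itself would overflow or underflow;
   for example a scaling by 2**-1200 is applied as (2**-600) * (2**-600).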
*/ +static inline double scaleDouble_2(double x, int n) +{ + double t1, t2; + int n1, n2; + n1 = n / 2; + n2 = n - n1; + /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ + PUT_BITS_DP64(((long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); + PUT_BITS_DP64(((long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); + return (x*t1)*t2; +} +#endif /* USE_SCALEDOUBLE_2 */ + + +#if defined(USE_SCALEDOUBLE_3) +/* Scales the double x by 2.0**n. + Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ +static inline double scaleDouble_3(double x, int n) +{ + double t1, t2, t3; + int n1, n2, n3; + n1 = n / 3; + n2 = (n - n1) / 2; + n3 = n - n1 - n2; + /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ + PUT_BITS_DP64(((long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); + PUT_BITS_DP64(((long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); + PUT_BITS_DP64(((long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3); + return ((x*t1)*t2)*t3; +} +#endif /* USE_SCALEDOUBLE_3 */ + + +#if defined(USE_SCALEFLOAT_1) +/* Scales the float x by 2.0**n. + Assumes EMIN <= n <= EMAX, though this condition is not checked. */ +static inline float scaleFloat_1(float x, int n) +{ + float t; + /* Construct the number t = 2.0**n */ + PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t); + return x*t; +} +#endif /* USE_SCALEFLOAT_1 */ + + +#if defined(USE_SCALEFLOAT_2) +/* Scales the float x by 2.0**n. + Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ +static inline float scaleFloat_2(float x, int n) +{ + float t1, t2; + int n1, n2; + n1 = n / 2; + n2 = n - n1; + /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ + PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); + PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); + return (x*t1)*t2; +} +#endif /* USE_SCALEFLOAT_2 */ + + +#if defined(USE_SCALEFLOAT_3) +/* Scales the float x by 2.0**n. + Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ +static inline float scaleFloat_3(float x, int n) +{ + float t1, t2, t3; + int n1, n2, n3; + n1 = n / 3; + n2 = (n - n1) / 2; + n3 = n - n1 - n2; + /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ + PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); + PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); + PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3); + return ((x*t1)*t2)*t3; +} +#endif /* USE_SCALEFLOAT_3 */ + +#if defined(USE_SETPRECISIONDOUBLE) +unsigned int setPrecisionDouble(void) +{ + unsigned int cw, cwold = 0; + /* There is no precision control on Hammer */ + return cwold; +} +#endif /* USE_SETPRECISIONDOUBLE */ + +#if defined(USE_RESTOREPRECISION) +void restorePrecision(unsigned int cwold) +{ + /* There is no precision control on Hammer */ + return; +} +#endif /* USE_RESTOREPRECISION */ + + + + +#if defined(USE_RAISE_FPSW_FLAGS) +/* Raises floating-point status flags. The argument should be + the bitwise or of the flags to be raised, from the + list above, e.g. + raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID); + */ + +/* ISSUE - wat - 08182010 + * These AMD_ISW_* flags are duplicated from trans.h + * this is not clean; Mark S. 
did it for targeted fix of 855457 + * Eliminate all redundant flags in the next overhaul + */ + +#define AMD_ISW_INVALID 0x0001 +#define AMD_ISW_DENORMAL 0x0002 +#define AMD_ISW_ZERODIVIDE 0x0004 +#define AMD_ISW_OVERFLOW 0x0008 +#define AMD_ISW_UNDERFLOW 0x0010 +#define AMD_ISW_INEXACT 0x0020 + +/* use this function from fpctrl.c */ +void _set_statfp(uintptr_t); + +static inline void raise_fpsw_flags(int flags) +{ + unsigned int f = 0; + + if (flags & AMD_F_OVERFLOW) { f |= AMD_ISW_OVERFLOW; } + if (flags & AMD_F_UNDERFLOW) { f |= AMD_ISW_UNDERFLOW; } + if (flags & AMD_F_DIVBYZERO) { f |= AMD_ISW_ZERODIVIDE; } + if (flags & AMD_F_INVALID) { f |= AMD_ISW_INVALID; } + if (flags & AMD_F_INEXACT) { f |= AMD_ISW_INEXACT; } + + _set_statfp(f); +} + +#endif /* USE_RAISE_FPSW_FLAGS */ + + +#if defined(USE_GET_FPSW_INLINE) +/* Return the current floating-point status word */ +static inline unsigned int get_fpsw_inline(void) +{ + return _mm_getcsr(); +} +#endif /* USE_GET_FPSW_INLINE */ + +#if defined(USE_SET_FPSW_INLINE) +/* Set the floating-point status word */ +static inline void set_fpsw_inline(unsigned int sw) +{ + _mm_setcsr(sw); +} +#endif /* USE_SET_FPSW_INLINE */ + + + +#if defined(USE_VAL_WITH_FLAGS) +/* Returns a double value after raising the given flags, + e.g. val_with_flags(AMD_F_INEXACT); + */ +static inline double val_with_flags(double val, int flags) +{ + raise_fpsw_flags(flags); + return val; +} +#endif /* USE_VAL_WITH_FLAGS */ + +#if defined(USE_VALF_WITH_FLAGS) +/* Returns a float value after raising the given flags, + e.g. valf_with_flags(AMD_F_INEXACT); + */ +static inline float valf_with_flags(float val, int flags) +{ + raise_fpsw_flags(flags); + return val; +} +#endif /* USE_VALF_WITH_FLAGS */ + + +#if defined(USE_ZERO_WITH_FLAGS) +/* Returns a double +zero after raising the given flags, + e.g. zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID); + */ +static inline double zero_with_flags(int flags) +{ + raise_fpsw_flags(flags); + return 0.0; +} +#endif /* USE_ZERO_WITH_FLAGS */ + + +#if defined(USE_ZEROF_WITH_FLAGS) +/* Returns a float +zero after raising the given flags, + e.g. zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID); + */ +static inline float zerof_with_flags(int flags) +{ + raise_fpsw_flags(flags); + return 0.0F; +} +#endif /* USE_ZEROF_WITH_FLAGS */ + + +#if defined(USE_NAN_WITH_FLAGS) +/* Returns a double quiet +nan after raising the given flags, + e.g. nan_with_flags(AMD_F_INVALID); +*/ +static inline double nan_with_flags(int flags) +{ + double z; + raise_fpsw_flags(flags); + PUT_BITS_DP64(0x7ff8000000000000, z); + return z; +} +#endif /* USE_NAN_WITH_FLAGS */ + +#if defined(USE_NANF_WITH_FLAGS) +/* Returns a float quiet +nan after raising the given flags, + e.g. nanf_with_flags(AMD_F_INVALID); +*/ +static inline float nanf_with_flags(int flags) +{ + float z; + raise_fpsw_flags(flags); + PUT_BITS_SP32(0x7fc00000, z); + return z; +} +#endif /* USE_NANF_WITH_FLAGS */ + + +#if defined(USE_INDEFINITE_WITH_FLAGS) +/* Returns a double indefinite after raising the given flags, + e.g. indefinite_with_flags(AMD_F_INVALID); +*/ +static inline double indefinite_with_flags(int flags) +{ + double z; + raise_fpsw_flags(flags); + PUT_BITS_DP64(0xfff8000000000000, z); + return z; +} +#endif /* USE_INDEFINITE_WITH_FLAGS */ + +#if defined(USE_INDEFINITEF_WITH_FLAGS) +/* Returns a float quiet +indefinite after raising the given flags, + e.g. 
indefinitef_with_flags(AMD_F_INVALID); +*/ +static inline float indefinitef_with_flags(int flags) +{ + float z; + raise_fpsw_flags(flags); + PUT_BITS_SP32(0xffc00000, z); + return z; +} +#endif /* USE_INDEFINITEF_WITH_FLAGS */ + + +#ifdef USE_INFINITY_WITH_FLAGS +/* Returns a positive double infinity after raising the given flags, + e.g. infinity_with_flags(AMD_F_OVERFLOW); +*/ +static inline double infinity_with_flags(int flags) +{ + double z; + raise_fpsw_flags(flags); + PUT_BITS_DP64((unsigned long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z); + return z; +} +#endif /* USE_INFINITY_WITH_FLAGS */ + +#ifdef USE_INFINITYF_WITH_FLAGS +/* Returns a positive float infinity after raising the given flags, + e.g. infinityf_with_flags(AMD_F_OVERFLOW); +*/ +static inline float infinityf_with_flags(int flags) +{ + float z; + raise_fpsw_flags(flags); + PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z); + return z; +} +#endif /* USE_INFINITYF_WITH_FLAGS */ + +#if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF) +#include +#endif + +/* define the Microsoft specific error handling routine */ +double _handle_error( + char *fname, + int opcode, + unsigned long value, + int type, + int flags, + int error, + double arg1, + double arg2, + int nargs + ); +float _handle_errorf( + char *fname, + int opcode, + unsigned long value, + int type, + int flags, + int error, + float arg1, + float arg2, + int nargs + ); + +#if defined(USE_SPLITEXP) +/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). + Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments + abs(x) > large/(ln(base)) (where large is the largest representable + floating point number) should be handled separately instead of calling + this function. This function is called by exp, exp2, exp10, + cosh and sinh. */ +static inline void splitexp(double x, double logbase, + double thirtytwo_by_logbaseof2, + double logbaseof2_by_32_lead, + double logbaseof2_by_32_trail, + int *m, double *z1, double *z2) +{ + double q, r, r1, r2, f1, f2; + int n, j; + +/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain + leading and trailing parts respectively of precomputed + values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. + two_to_jby32_lead_table contains the first 25 bits of precision, + and two_to_jby32_trail_table contains a further 53 bits precision. 
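+   For illustration, entry j = 16 holds 2.0**(16/32) = sqrt(2): the lead
+   part 1.41421353816986083984 plus the trail part of about 2.4203e-08
+   reproduces sqrt(2) = 1.41421356237309... to well beyond double
+   precision, which is what keeps the reconstruction step below accurate.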
*/ + + static const double two_to_jby32_lead_table[32] = { + 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ + 1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */ + 1.04427373409271240234e+00, /* 0x3ff0b55860000000 */ + 1.06714040040969848633e+00, /* 0x3ff11301d0000000 */ + 1.09050768613815307617e+00, /* 0x3ff172b830000000 */ + 1.11438673734664916992e+00, /* 0x3ff1d48730000000 */ + 1.13878858089447021484e+00, /* 0x3ff2387a60000000 */ + 1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */ + 1.18920707702636718750e+00, /* 0x3ff306fe00000000 */ + 1.21524733304977416992e+00, /* 0x3ff371a730000000 */ + 1.24185776710510253906e+00, /* 0x3ff3dea640000000 */ + 1.26905095577239990234e+00, /* 0x3ff44e0860000000 */ + 1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */ + 1.32523661851882934570e+00, /* 0x3ff5342b50000000 */ + 1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */ + 1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */ + 1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */ + 1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */ + 1.47682613134384155273e+00, /* 0x3ff7a11470000000 */ + 1.50916439294815063477e+00, /* 0x3ff8258990000000 */ + 1.54221081733703613281e+00, /* 0x3ff8ace540000000 */ + 1.57598084211349487305e+00, /* 0x3ff93737b0000000 */ + 1.61049032211303710938e+00, /* 0x3ff9c49180000000 */ + 1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */ + 1.68179279565811157227e+00, /* 0x3ffae89f90000000 */ + 1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */ + 1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */ + 1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */ + 1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */ + 1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */ + 1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */ + 1.95714408159255981445e+00}; /* 0x3fff507650000000 */ + + static const double two_to_jby32_trail_table[32] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */ + 4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */ + 2.67125131841396124714e-10, /* 0x3df25b50a4ebbf1b */ + 4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */ + 5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */ + 5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */ + 1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */ + 3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */ + 2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */ + 4.49683815095311756138e-08, /* 0x3e682468446b6824 */ + 1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */ + 1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */ + 2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */ + 4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */ + 8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */ + 2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */ + 3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */ + 1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */ + 3.46452721050003920866e-08, /* 0x3e62999c25159f11 */ + 8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */ + 2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */ + 9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */ + 8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */ + 3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */ + 1.11084703472699692902e-08, /* 0x3e47daf237553d84 */ + 5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */ + 4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */ + 4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */ + 1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */ + 
9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */ + 4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */ + + /* + Step 1. Reduce the argument. + + To perform argument reduction, we find the integer n such that + x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. + n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and + remainder by x - n*logbaseof2/32. The calculation of n is + straightforward whereas the computation of x - n*logbaseof2/32 + must be carried out carefully. + logbaseof2/32 is so represented in two pieces that + (1) logbaseof2/32 is known to extra precision, (2) the product + of n and the leading piece is a model number and is hence + calculated without error, and (3) the subtraction of the value + obtained in (2) from x is a model number and is hence again + obtained without error. + */ + + r = x * thirtytwo_by_logbaseof2; + /* Set n = nearest integer to r */ + /* This is faster on Hammer */ + if (r > 0) + n = (int)(r + 0.5); + else + n = (int)(r - 0.5); + + r1 = x - n * logbaseof2_by_32_lead; + r2 = - n * logbaseof2_by_32_trail; + + /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ + /* j = n % 32; + if (j < 0) j += 32; */ + j = n & 0x0000001f; + + f1 = two_to_jby32_lead_table[j]; + f2 = two_to_jby32_trail_table[j]; + + *m = (n - j) / 32; + + /* Step 2. The following is the core approximation. We approximate + exp(r1+r2)-1 by a polynomial. */ + + r1 *= logbase; r2 *= logbase; + + r = r1 + r2; + q = r1 + (r2 + + r*r*( 5.00000000000000008883e-01 + + r*( 1.66666666665260878863e-01 + + r*( 4.16666666662260795726e-02 + + r*( 8.33336798434219616221e-03 + + r*( 1.38889490863777199667e-03 )))))); + + /* Step 3. Function value reconstruction. + We now reconstruct the exponential of the input argument + so that exp(x) = 2**m * (z1 + z2). + The order of the computation below must be strictly observed. */ + + *z1 = f1; + *z2 = f2 + ((f1 + f2) * q); +} +#endif /* USE_SPLITEXP */ + + +#if defined(USE_SPLITEXPF) +/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). + Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments + abs(x) > large/(ln(base)) (where large is the largest representable + floating point number) should be handled separately instead of calling + this function. This function is called by exp, exp2, exp10, + cosh and sinh. */ +static inline void splitexpf(float x, float logbase, + float thirtytwo_by_logbaseof2, + float logbaseof2_by_32_lead, + float logbaseof2_by_32_trail, + int *m, float *z1, float *z2) +{ + float q, r, r1, r2, f1, f2; + int n, j; + +/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain + leading and trailing parts respectively of precomputed + values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. + two_to_jby32_lead_table contains the first 10 bits of precision, + and two_to_jby32_trail_table contains a further 24 bits precision. 
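+   (The single-precision tables can be much shorter: 10 lead bits plus
+   24 trail bits already exceed float precision.  For example, for j = 16
+   the lead part 1.4140625 plus the trail part 1.5106e-04 gives
+   1.4142136, i.e. sqrt(2) to float accuracy.)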
*/ + + static const float two_to_jby32_lead_table[32] = { + 1.0000000000E+00F, /* 0x3F800000 */ + 1.0214843750E+00F, /* 0x3F82C000 */ + 1.0429687500E+00F, /* 0x3F858000 */ + 1.0664062500E+00F, /* 0x3F888000 */ + 1.0898437500E+00F, /* 0x3F8B8000 */ + 1.1132812500E+00F, /* 0x3F8E8000 */ + 1.1386718750E+00F, /* 0x3F91C000 */ + 1.1621093750E+00F, /* 0x3F94C000 */ + 1.1875000000E+00F, /* 0x3F980000 */ + 1.2148437500E+00F, /* 0x3F9B8000 */ + 1.2402343750E+00F, /* 0x3F9EC000 */ + 1.2675781250E+00F, /* 0x3FA24000 */ + 1.2949218750E+00F, /* 0x3FA5C000 */ + 1.3242187500E+00F, /* 0x3FA98000 */ + 1.3535156250E+00F, /* 0x3FAD4000 */ + 1.3828125000E+00F, /* 0x3FB10000 */ + 1.4140625000E+00F, /* 0x3FB50000 */ + 1.4433593750E+00F, /* 0x3FB8C000 */ + 1.4765625000E+00F, /* 0x3FBD0000 */ + 1.5078125000E+00F, /* 0x3FC10000 */ + 1.5410156250E+00F, /* 0x3FC54000 */ + 1.5742187500E+00F, /* 0x3FC98000 */ + 1.6093750000E+00F, /* 0x3FCE0000 */ + 1.6445312500E+00F, /* 0x3FD28000 */ + 1.6816406250E+00F, /* 0x3FD74000 */ + 1.7167968750E+00F, /* 0x3FDBC000 */ + 1.7558593750E+00F, /* 0x3FE0C000 */ + 1.7929687500E+00F, /* 0x3FE58000 */ + 1.8339843750E+00F, /* 0x3FEAC000 */ + 1.8730468750E+00F, /* 0x3FEFC000 */ + 1.9140625000E+00F, /* 0x3FF50000 */ + 1.9570312500E+00F}; /* 0x3FFA8000 */ + + static const float two_to_jby32_trail_table[32] = { + 0.0000000000E+00F, /* 0x00000000 */ + 4.1277357377E-04F, /* 0x39D86988 */ + 1.3050324051E-03F, /* 0x3AAB0D9F */ + 7.3415064253E-04F, /* 0x3A407404 */ + 6.6398258787E-04F, /* 0x3A2E0F1E */ + 1.1054925853E-03F, /* 0x3A90E62D */ + 1.1675967835E-04F, /* 0x38F4DCE0 */ + 1.6154836630E-03F, /* 0x3AD3BEA3 */ + 1.7071149778E-03F, /* 0x3ADFC146 */ + 4.0360994171E-04F, /* 0x39D39B9C */ + 1.6234370414E-03F, /* 0x3AD4C982 */ + 1.4728321694E-03F, /* 0x3AC10C0C */ + 1.9176795613E-03F, /* 0x3AFB5AA6 */ + 1.0178930825E-03F, /* 0x3A856AD3 */ + 7.3992193211E-04F, /* 0x3A41F752 */ + 1.0973819299E-03F, /* 0x3A8FD607 */ + 1.5106226783E-04F, /* 0x391E6678 */ + 1.8214319134E-03F, /* 0x3AEEBD1D */ + 2.6364589576E-04F, /* 0x398A39F4 */ + 1.3519275235E-03F, /* 0x3AB13329 */ + 1.1952003697E-03F, /* 0x3A9CA845 */ + 1.7620950239E-03F, /* 0x3AE6F619 */ + 1.1153318919E-03F, /* 0x3A923054 */ + 1.2242280645E-03F, /* 0x3AA07647 */ + 1.5220546629E-04F, /* 0x391F9958 */ + 1.8224230735E-03F, /* 0x3AEEDE5F */ + 3.9278529584E-04F, /* 0x39CDEEC0 */ + 1.7403248930E-03F, /* 0x3AE41B9D */ + 2.3711356334E-05F, /* 0x37C6E7C0 */ + 1.1207590578E-03F, /* 0x3A92E66F */ + 1.1440613307E-03F, /* 0x3A95F454 */ + 1.1287408415E-04F}; /* 0x38ECB6D0 */ + + /* + Step 1. Reduce the argument. + + To perform argument reduction, we find the integer n such that + x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. + n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and + remainder by x - n*logbaseof2/32. The calculation of n is + straightforward whereas the computation of x - n*logbaseof2/32 + must be carried out carefully. + logbaseof2/32 is so represented in two pieces that + (1) logbaseof2/32 is known to extra precision, (2) the product + of n and the leading piece is a model number and is hence + calculated without error, and (3) the subtraction of the value + obtained in (2) from x is a model number and is hence again + obtained without error. 
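+
+   As a worked example (taking the natural exponential, so that logbase
+   is 1 and logbaseof2 is ln(2)), for x = 1.0 we get
+   r = 32/ln(2) = 46.166..., hence n = 46, j = 46 mod 32 = 14, m = 1,
+   and a remainder of 1 - 46*ln(2)/32 = 0.003601...  The caller then
+   reconstructs exp(1) = 2**1 * 2**(14/32) * exp(0.003601) = 2.71828...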
+ */ + + r = x * thirtytwo_by_logbaseof2; + /* Set n = nearest integer to r */ + /* This is faster on Hammer */ + if (r > 0) + n = (int)(r + 0.5F); + else + n = (int)(r - 0.5F); + + r1 = x - n * logbaseof2_by_32_lead; + r2 = - n * logbaseof2_by_32_trail; + + /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ + /* j = n % 32; + if (j < 0) j += 32; */ + j = n & 0x0000001f; + + f1 = two_to_jby32_lead_table[j]; + f2 = two_to_jby32_trail_table[j]; + + *m = (n - j) / 32; + + /* Step 2. The following is the core approximation. We approximate + exp(r1+r2)-1 by a polynomial. */ + + r1 *= logbase; r2 *= logbase; + + r = r1 + r2; + q = r1 + (r2 + + r*r*( 5.00000000000000008883e-01F + + r*( 1.66666666665260878863e-01F ))); + + /* Step 3. Function value reconstruction. + We now reconstruct the exponential of the input argument + so that exp(x) = 2**m * (z1 + z2). + The order of the computation below must be strictly observed. */ + + *z1 = f1; + *z2 = f2 + ((f1 + f2) * q); +} +#endif /* SPLITEXPF */ + + +#if defined(USE_SCALEUPDOUBLE1024) +/* Scales up a double (normal or denormal) whose bit pattern is given + as ux by 2**1024. There are no checks that the input number is + scalable by that amount. */ +static inline void scaleUpDouble1024(unsigned long ux, unsigned long *ur) +{ + unsigned long uy; + double y; + + if ((ux & EXPBITS_DP64) == 0) + { + /* ux is denormalised */ + PUT_BITS_DP64(ux | 0x4010000000000000, y); + if (ux & SIGNBIT_DP64) + y += 4.0; + else + y -= 4.0; + GET_BITS_DP64(y, uy); + } + else + /* ux is normal */ + uy = ux + 0x4000000000000000; + + *ur = uy; + return; +} + +#endif /* SCALEUPDOUBLE1024 */ + + +#if defined(USE_SCALEDOWNDOUBLE) +/* Scales down a double whose bit pattern is given as ux by 2**k. + There are no checks that the input number is scalable by that amount. */ +static inline void scaleDownDouble(unsigned long ux, int k, + unsigned long *ur) +{ + unsigned long uy, uk, ax, xsign; + int n, shift; + xsign = ux & SIGNBIT_DP64; + ax = ux & ~SIGNBIT_DP64; + n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k; + if (n > 0) + { + uk = (unsigned long)n << EXPSHIFTBITS_DP64; + uy = (ax & ~EXPBITS_DP64) | uk; + } + else + { + uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000; + shift = (1 - n); + if (shift > MANTLENGTH_DP64 + 1) + /* Sigh. Shifting works mod 64 so be careful not to shift too much */ + uy = 0; + else + { + /* Make sure we round the result */ + uy >>= shift - 1; + uy = (uy >> 1) + (uy & 1); + } + } + *ur = uy | xsign; +} + +#endif /* SCALEDOWNDOUBLE */ + + +#if defined(USE_SCALEUPFLOAT128) +/* Scales up a float (normal or denormal) whose bit pattern is given + as ux by 2**128. There are no checks that the input number is + scalable by that amount. */ +static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur) +{ + unsigned int uy; + float y; + + if ((ux & EXPBITS_SP32) == 0) + { + /* ux is denormalised */ + PUT_BITS_SP32(ux | 0x40800000, y); + /* Compensate for the implicit bit just added */ + if (ux & SIGNBIT_SP32) + y += 4.0F; + else + y -= 4.0F; + GET_BITS_SP32(y, uy); + } + else + /* ux is normal */ + uy = ux + 0x40000000; + *ur = uy; +} +#endif /* SCALEUPFLOAT128 */ + + +#if defined(USE_SCALEDOWNFLOAT) +/* Scales down a float whose bit pattern is given as ux by 2**k. + There are no checks that the input number is scalable by that amount. 
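+   (For example, scaling 1.0f down by 2**130: the biased exponent would
+   become 127 - 130 = -3, so the code takes the denormal branch, shifts
+   the implicit-bit mantissa right by 4 with rounding, and returns the
+   denormal bit pattern for 2**-130.)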
*/ +static inline void scaleDownFloat(unsigned int ux, int k, + unsigned int *ur) +{ + unsigned int uy, uk, ax, xsign; + int n, shift; + + xsign = ux & SIGNBIT_SP32; + ax = ux & ~SIGNBIT_SP32; + n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k; + if (n > 0) + { + uk = (unsigned int)n << EXPSHIFTBITS_SP32; + uy = (ax & ~EXPBITS_SP32) | uk; + } + else + { + uy = (ax & ~EXPBITS_SP32) | 0x00800000; + shift = (1 - n); + if (shift > MANTLENGTH_SP32 + 1) + /* Sigh. Shifting works mod 32 so be careful not to shift too much */ + uy = 0; + else + { + /* Make sure we round the result */ + uy >>= shift - 1; + uy = (uy >> 1) + (uy & 1); + } + } + *ur = uy | xsign; +} +#endif /* SCALEDOWNFLOAT */ + + +#if defined(USE_SQRT_AMD_INLINE) +static inline double sqrt_amd_inline(double x) +{ + /* + Computes the square root of x. + + The calculation is carried out in three steps. + + Step 1. Reduction. + The input argument is scaled to the interval [1, 4) by + computing + x = 2^e * y, where y in [1,4). + Furthermore y is decomposed as y = c + t where + c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. + + Step 2. Approximation. + An approximation q = sqrt(1 + (t/c)) - 1 is obtained + from a basic series expansion using precomputed values + stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl. + + Step 3. Reconstruction. + The value of sqrt(x) is reconstructed via + sqrt(x) = 2^(e/2) * sqrt(y) + = 2^(e/2) * sqrt(c) * sqrt(y/c) + = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) + = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] + */ + + unsigned long ux, ax, u; + double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; + int e, denorm = 0, index; + +/* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain + leading and trailing parts respectively of precomputed + values of sqrt(j/32), for j = 32, 33, ..., 128. + rt_jby32_lead_table_dbl contains the first 21 bits of precision, + and rt_jby32_trail_table_dbl contains a further 53 bits precision. 
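+   The index computed below runs from 32 to 128, so entry (index - 32)
+   corresponds to c = index/32 in [1,4].  For instance index = 48 gives
+   c = 1.5, and the lead part 1.22474479675292968750 plus the trail part
+   of about 7.4639e-08 reproduces sqrt(1.5) = 1.22474487139...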
*/ + + static const double rt_jby32_lead_table_dbl[97] = { + 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ + 1.01550388336181640625e+00, /* 0x3ff03f8100000000 */ + 1.03077602386474609375e+00, /* 0x3ff07e0f00000000 */ + 1.04582500457763671875e+00, /* 0x3ff0bbb300000000 */ + 1.06065940856933593750e+00, /* 0x3ff0f87600000000 */ + 1.07528972625732421875e+00, /* 0x3ff1346300000000 */ + 1.08972454071044921875e+00, /* 0x3ff16f8300000000 */ + 1.10396957397460937500e+00, /* 0x3ff1a9dc00000000 */ + 1.11803340911865234375e+00, /* 0x3ff1e37700000000 */ + 1.13192272186279296875e+00, /* 0x3ff21c5b00000000 */ + 1.14564323425292968750e+00, /* 0x3ff2548e00000000 */ + 1.15920162200927734375e+00, /* 0x3ff28c1700000000 */ + 1.17260360717773437500e+00, /* 0x3ff2c2fc00000000 */ + 1.18585395812988281250e+00, /* 0x3ff2f94200000000 */ + 1.19895744323730468750e+00, /* 0x3ff32eee00000000 */ + 1.21191978454589843750e+00, /* 0x3ff3640600000000 */ + 1.22474479675292968750e+00, /* 0x3ff3988e00000000 */ + 1.23743629455566406250e+00, /* 0x3ff3cc8a00000000 */ + 1.25000000000000000000e+00, /* 0x3ff4000000000000 */ + 1.26243782043457031250e+00, /* 0x3ff432f200000000 */ + 1.27475452423095703125e+00, /* 0x3ff4656500000000 */ + 1.28695297241210937500e+00, /* 0x3ff4975c00000000 */ + 1.29903793334960937500e+00, /* 0x3ff4c8dc00000000 */ + 1.31101036071777343750e+00, /* 0x3ff4f9e600000000 */ + 1.32287502288818359375e+00, /* 0x3ff52a7f00000000 */ + 1.33463478088378906250e+00, /* 0x3ff55aaa00000000 */ + 1.34629058837890625000e+00, /* 0x3ff58a6800000000 */ + 1.35784721374511718750e+00, /* 0x3ff5b9be00000000 */ + 1.36930561065673828125e+00, /* 0x3ff5e8ad00000000 */ + 1.38066959381103515625e+00, /* 0x3ff6173900000000 */ + 1.39194107055664062500e+00, /* 0x3ff6456400000000 */ + 1.40312099456787109375e+00, /* 0x3ff6732f00000000 */ + 1.41421318054199218750e+00, /* 0x3ff6a09e00000000 */ + 1.42521858215332031250e+00, /* 0x3ff6cdb200000000 */ + 1.43614006042480468750e+00, /* 0x3ff6fa6e00000000 */ + 1.44697952270507812500e+00, /* 0x3ff726d400000000 */ + 1.45773792266845703125e+00, /* 0x3ff752e500000000 */ + 1.46841716766357421875e+00, /* 0x3ff77ea300000000 */ + 1.47901916503906250000e+00, /* 0x3ff7aa1000000000 */ + 1.48954677581787109375e+00, /* 0x3ff7d52f00000000 */ + 1.50000000000000000000e+00, /* 0x3ff8000000000000 */ + 1.51038074493408203125e+00, /* 0x3ff82a8500000000 */ + 1.52068996429443359375e+00, /* 0x3ff854bf00000000 */ + 1.53093051910400390625e+00, /* 0x3ff87eb100000000 */ + 1.54110336303710937500e+00, /* 0x3ff8a85c00000000 */ + 1.55120849609375000000e+00, /* 0x3ff8d1c000000000 */ + 1.56124877929687500000e+00, /* 0x3ff8fae000000000 */ + 1.57122516632080078125e+00, /* 0x3ff923bd00000000 */ + 1.58113861083984375000e+00, /* 0x3ff94c5800000000 */ + 1.59099006652832031250e+00, /* 0x3ff974b200000000 */ + 1.60078048706054687500e+00, /* 0x3ff99ccc00000000 */ + 1.61051177978515625000e+00, /* 0x3ff9c4a800000000 */ + 1.62018489837646484375e+00, /* 0x3ff9ec4700000000 */ + 1.62979984283447265625e+00, /* 0x3ffa13a900000000 */ + 1.63935947418212890625e+00, /* 0x3ffa3ad100000000 */ + 1.64886283874511718750e+00, /* 0x3ffa61be00000000 */ + 1.65831184387207031250e+00, /* 0x3ffa887200000000 */ + 1.66770744323730468750e+00, /* 0x3ffaaeee00000000 */ + 1.67705059051513671875e+00, /* 0x3ffad53300000000 */ + 1.68634128570556640625e+00, /* 0x3ffafb4100000000 */ + 1.69558238983154296875e+00, /* 0x3ffb211b00000000 */ + 1.70477199554443359375e+00, /* 0x3ffb46bf00000000 */ + 1.71391296386718750000e+00, /* 0x3ffb6c3000000000 */ + 1.72300529479980468750e+00, 
/* 0x3ffb916e00000000 */ + 1.73204994201660156250e+00, /* 0x3ffbb67a00000000 */ + 1.74104785919189453125e+00, /* 0x3ffbdb5500000000 */ + 1.75000000000000000000e+00, /* 0x3ffc000000000000 */ + 1.75890541076660156250e+00, /* 0x3ffc247a00000000 */ + 1.76776695251464843750e+00, /* 0x3ffc48c600000000 */ + 1.77658367156982421875e+00, /* 0x3ffc6ce300000000 */ + 1.78535652160644531250e+00, /* 0x3ffc90d200000000 */ + 1.79408740997314453125e+00, /* 0x3ffcb49500000000 */ + 1.80277538299560546875e+00, /* 0x3ffcd82b00000000 */ + 1.81142139434814453125e+00, /* 0x3ffcfb9500000000 */ + 1.82002735137939453125e+00, /* 0x3ffd1ed500000000 */ + 1.82859230041503906250e+00, /* 0x3ffd41ea00000000 */ + 1.83711719512939453125e+00, /* 0x3ffd64d500000000 */ + 1.84560203552246093750e+00, /* 0x3ffd879600000000 */ + 1.85404872894287109375e+00, /* 0x3ffdaa2f00000000 */ + 1.86245727539062500000e+00, /* 0x3ffdcca000000000 */ + 1.87082862854003906250e+00, /* 0x3ffdeeea00000000 */ + 1.87916183471679687500e+00, /* 0x3ffe110c00000000 */ + 1.88745784759521484375e+00, /* 0x3ffe330700000000 */ + 1.89571857452392578125e+00, /* 0x3ffe54dd00000000 */ + 1.90394306182861328125e+00, /* 0x3ffe768d00000000 */ + 1.91213226318359375000e+00, /* 0x3ffe981800000000 */ + 1.92028617858886718750e+00, /* 0x3ffeb97e00000000 */ + 1.92840576171875000000e+00, /* 0x3ffedac000000000 */ + 1.93649101257324218750e+00, /* 0x3ffefbde00000000 */ + 1.94454288482666015625e+00, /* 0x3fff1cd900000000 */ + 1.95256233215332031250e+00, /* 0x3fff3db200000000 */ + 1.96054744720458984375e+00, /* 0x3fff5e6700000000 */ + 1.96850109100341796875e+00, /* 0x3fff7efb00000000 */ + 1.97642326354980468750e+00, /* 0x3fff9f6e00000000 */ + 1.98431301116943359375e+00, /* 0x3fffbfbf00000000 */ + 1.99217128753662109375e+00, /* 0x3fffdfef00000000 */ + 2.00000000000000000000e+00}; /* 0x4000000000000000 */ + + static const double rt_jby32_trail_table_dbl[97] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 9.17217678638807524014e-07, /* 0x3eaec6d70177881c */ + 3.82539669043705364790e-07, /* 0x3e99abfb41bd6b24 */ + 2.85899577162227138140e-08, /* 0x3e5eb2bf6bab55a2 */ + 7.63210485349101216659e-07, /* 0x3ea99bed9b2d8d0c */ + 9.32123004127716212874e-07, /* 0x3eaf46e029c1b296 */ + 1.95174719169309219157e-07, /* 0x3e8a3226fc42f30c */ + 5.34316371481845492427e-07, /* 0x3ea1edbe20701d73 */ + 5.79631242504454563052e-07, /* 0x3ea372fe94f82be7 */ + 4.20404384109571705948e-07, /* 0x3e9c367e08e7bb06 */ + 6.89486030314147010716e-07, /* 0x3ea722a3d0a66608 */ + 6.89927685625314560328e-07, /* 0x3ea7266f067ca1d6 */ + 3.32778123013641425828e-07, /* 0x3e965515a9b34850 */ + 1.64433259436999584387e-07, /* 0x3e8611e23ef6c1bd */ + 4.37590875197899335723e-07, /* 0x3e9d5dc1059ed8e7 */ + 1.79808183816018617413e-07, /* 0x3e88222982d0e4f4 */ + 7.46386593615986477624e-08, /* 0x3e7409212e7d0322 */ + 5.72520794105201454728e-07, /* 0x3ea335ea8a5fcf39 */ + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 2.96860689431670420344e-07, /* 0x3e93ec071e938bfe */ + 3.54167239176257065345e-07, /* 0x3e97c48bfd9862c6 */ + 7.95211265664474710063e-07, /* 0x3eaaaed010f74671 */ + 1.72327048595145565621e-07, /* 0x3e87211cbfeb62e0 */ + 6.99494915996239297020e-07, /* 0x3ea7789d9660e72d */ + 6.32644111701500844315e-07, /* 0x3ea53a5f1d36f1cf */ + 6.20124838851440463844e-10, /* 0x3e054eacff2057dc */ + 6.13404719757812629969e-07, /* 0x3ea4951b3e6a83cc */ + 3.47654909777986407387e-07, /* 0x3e9754aa76884c66 */ + 7.83106177002392475763e-07, /* 0x3eaa46d4b1de1074 */ + 5.33337372440526357008e-07, /* 0x3ea1e55548f92635 */ + 
2.01508648555298681765e-08, /* 0x3e55a3070dd17788 */ + 5.25472356925843939587e-07, /* 0x3ea1a1c5eedb0801 */ + 3.81831102861301692797e-07, /* 0x3e999fcef32422cc */ + 6.99220602161420018738e-07, /* 0x3ea776425d6b0199 */ + 6.01209702477462624811e-07, /* 0x3ea42c5a1e0191a2 */ + 9.01437000591944740554e-08, /* 0x3e7832a0bdff1327 */ + 5.10428680864685379950e-08, /* 0x3e6b674743636676 */ + 3.47895267104621031421e-07, /* 0x3e9758cb90d2f714 */ + 7.80735841510641848628e-07, /* 0x3eaa3278459cde25 */ + 1.35158752025506517690e-07, /* 0x3e822404f4a103ee */ + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.76523947728535489812e-09, /* 0x3e1e539af6892ac5 */ + 6.68280121328499932183e-07, /* 0x3ea66c7b872c9cd0 */ + 5.70135482405123276616e-07, /* 0x3ea3216d2f43887d */ + 1.37705134737562525897e-07, /* 0x3e827b832cbedc0e */ + 7.09655107074516613672e-07, /* 0x3ea7cfe41579091d */ + 7.20302724551461693011e-07, /* 0x3ea82b5a713c490a */ + 4.69926266058212796694e-07, /* 0x3e9f8945932d872e */ + 2.19244345915999437026e-07, /* 0x3e8d6d2da9490251 */ + 1.91141411617401877927e-07, /* 0x3e89a791a3114e4a */ + 5.72297665296622053774e-07, /* 0x3ea333ffe005988d */ + 5.61055484436830560103e-07, /* 0x3ea2d36e0ed49ab1 */ + 2.76225500213991506100e-07, /* 0x3e92898498f55f9e */ + 7.58466189522395692908e-07, /* 0x3ea9732cca1032a3 */ + 1.56893371256836029827e-07, /* 0x3e850ed0b02a22d2 */ + 4.06038997708867066507e-07, /* 0x3e9b3fb265b1e40a */ + 5.51305629612057435809e-07, /* 0x3ea27fade682d1de */ + 5.64778487026561123207e-07, /* 0x3ea2f36906f707ba */ + 3.92609705553556897517e-07, /* 0x3e9a58fbbee883b6 */ + 9.09698438776943827802e-07, /* 0x3eae864005bca6d7 */ + 1.05949774066016139743e-07, /* 0x3e7c70d02300f263 */ + 7.16578798392844784244e-07, /* 0x3ea80b5d712d8e3e */ + 6.86233073531233972561e-07, /* 0x3ea706b27cc7d390 */ + 7.99211473033494452908e-07, /* 0x3eaad12c9d849a97 */ + 8.65552275731027456121e-07, /* 0x3ead0b09954e764b */ + 6.75456120386058448618e-07, /* 0x3ea6aa1fb7826cbd */ + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 4.99167184520462138743e-07, /* 0x3ea0bfd03f46763c */ + 4.51720373502110930296e-10, /* 0x3dff0abfb4adfb9e */ + 1.28874162718371367439e-07, /* 0x3e814c151f991b2e */ + 5.85529267186999798656e-07, /* 0x3ea3a5a879b09292 */ + 1.01827770937125531924e-07, /* 0x3e7b558d173f9796 */ + 2.54736389177809626508e-07, /* 0x3e9118567cd83fb8 */ + 6.98925535290464831294e-07, /* 0x3ea773b981896751 */ + 1.20940735036524314513e-07, /* 0x3e803b7df49f48a8 */ + 5.43759351196479689657e-08, /* 0x3e6d315f22491900 */ + 1.11957989042397958409e-07, /* 0x3e7e0db1c5bb84b2 */ + 8.47006714134442661218e-07, /* 0x3eac6bbb7644ff76 */ + 8.92831044643427836228e-07, /* 0x3eadf55c3afec01f */ + 7.77828292464916501663e-07, /* 0x3eaa197e81034da3 */ + 6.48469316302918797451e-08, /* 0x3e71683f4920555d */ + 2.12579816658859849140e-07, /* 0x3e8c882fd78bb0b0 */ + 7.61222472580559138435e-07, /* 0x3ea98ad9eb7b83ec */ + 2.86488961857314189607e-07, /* 0x3e9339d7c7777273 */ + 2.14637363790165363515e-07, /* 0x3e8ccee237cae6fe */ + 5.44137005612605847831e-08, /* 0x3e6d368fe324a146 */ + 2.58378284856442408413e-07, /* 0x3e9156e7b6d99b45 */ + 3.15848939061134843091e-07, /* 0x3e95323e5310b5c1 */ + 6.60530466255089632309e-07, /* 0x3ea629e9db362f5d */ + 7.63436345535852301127e-07, /* 0x3ea99dde4728d7ec */ + 8.68233432860324345268e-08, /* 0x3e774e746878544d */ + 9.45465175398023087082e-07, /* 0x3eafb97be873a87d */ + 8.77499534786171267246e-07, /* 0x3ead71a9e23c2f63 */ + 2.74055432394999316135e-07, /* 0x3e92643c89cda173 */ + 4.72129009349126213532e-07, /* 
0x3e9faf1d57a4d56c */ + 8.93777032327078947306e-07, /* 0x3eadfd7c7ab7b282 */ + 0.00000000000000000000e+00}; /* 0x0000000000000000 */ + + + /* Handle special arguments first */ + + GET_BITS_DP64(x, ux); + ax = ux & (~SIGNBIT_DP64); + + if(ax >= 0x7ff0000000000000) + { + /* x is either NaN or infinity */ + if (ux & MANTBITS_DP64) + /* x is NaN */ + return x + x; /* Raise invalid if it is a signalling NaN */ + else if (ux & SIGNBIT_DP64) + /* x is negative infinity */ + return nan_with_flags(AMD_F_INVALID); + else + /* x is positive infinity */ + return x; + } + else if (ux & SIGNBIT_DP64) + { + /* x is negative. */ + if (ux == SIGNBIT_DP64) + /* Handle negative zero first */ + return x; + else + return nan_with_flags(AMD_F_INVALID); + } + else if (ux <= 0x000fffffffffffff) + { + /* x is denormalised or zero */ + if (ux == 0) + /* x is zero */ + return x; + else + { + /* x is denormalised; scale it up */ + /* Normalize x by increasing the exponent by 60 + and subtracting a correction to account for the implicit + bit. This replaces a slow denormalized + multiplication by a fast normal subtraction. */ + static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ + denorm = 1; + GET_BITS_DP64(x, ux); + PUT_BITS_DP64(ux | 0x03d0000000000000, x); + x -= corr; + GET_BITS_DP64(x, ux); + } + } + + /* Main algorithm */ + + /* + Find y and e such that x = 2^e * y, where y in [1,4). + This is done using an in-lined variant of splitDouble, + which also ensures that e is even. + */ + y = x; + ux &= EXPBITS_DP64; + ux >>= EXPSHIFTBITS_DP64; + if (ux & 1) + { + GET_BITS_DP64(y, u); + u &= (SIGNBIT_DP64 | MANTBITS_DP64); + u |= ONEEXPBITS_DP64; + PUT_BITS_DP64(u, y); + e = ux - EXPBIAS_DP64; + } + else + { + GET_BITS_DP64(y, u); + u &= (SIGNBIT_DP64 | MANTBITS_DP64); + u |= TWOEXPBITS_DP64; + PUT_BITS_DP64(u, y); + e = ux - EXPBIAS_DP64 - 1; + } + + + /* Find the index of the sub-interval of [1,4) in which y lies. */ + + index = (int)(32.0*y+0.5); + + /* Look up the table values and compute c and r = c/t */ + + rtc_lead = rt_jby32_lead_table_dbl[index-32]; + rtc_trail = rt_jby32_trail_table_dbl[index-32]; + c = 0.03125*index; + r = (y - c)/c; + + /* + Find q = sqrt(1+r) - 1. + From one step of Newton on (q+1)^2 = 1+r + */ + + p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01)); + twop = p + p; + q = p - (p*p + (twop - r))/(twop + 2.0); + + /* Reconstruction */ + + rtc = rtc_lead + rtc_trail; + e >>= 1; /* e = e/2 */ + z = rtc_lead + (rtc*q+rtc_trail); + + if (denorm) + { + /* Scale by 2**(e-30) */ + PUT_BITS_DP64(((long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); + z *= r; + } + else + { + /* Scale by 2**e */ + PUT_BITS_DP64(((long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); + z *= r; + } + + return z; + +} +#endif /* SQRT_AMD_INLINE */ + +#if defined(USE_SQRTF_AMD_INLINE) + +static inline float sqrtf_amd_inline(float x) +{ + /* + Computes the square root of x. + + The calculation is carried out in three steps. + + Step 1. Reduction. + The input argument is scaled to the interval [1, 4) by + computing + x = 2^e * y, where y in [1,4). + Furthermore y is decomposed as y = c + t where + c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. + + Step 2. Approximation. + An approximation q = sqrt(1 + (t/c)) - 1 is obtained + from a basic series expansion using precomputed values + stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float. + + Step 3. Reconstruction. 
+ The value of sqrt(x) is reconstructed via + sqrt(x) = 2^(e/2) * sqrt(y) + = 2^(e/2) * sqrt(c) * sqrt(y/c) + = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) + = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] + */ + + unsigned int ux, ax, u; + float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; + int e, denorm = 0, index; + +/* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain + leading and trailing parts respectively of precomputed + values of sqrt(j/32), for j = 32, 33, ..., 128. + rt_jby32_lead_table_float contains the first 13 bits of precision, + and rt_jby32_trail_table_float contains a further 24 bits precision. */ + +static const float rt_jby32_lead_table_float[97] = { + 1.00000000000000000000e+00F, /* 0x3f800000 */ + 1.01538085937500000000e+00F, /* 0x3f81f800 */ + 1.03076171875000000000e+00F, /* 0x3f83f000 */ + 1.04565429687500000000e+00F, /* 0x3f85d800 */ + 1.06054687500000000000e+00F, /* 0x3f87c000 */ + 1.07519531250000000000e+00F, /* 0x3f89a000 */ + 1.08959960937500000000e+00F, /* 0x3f8b7800 */ + 1.10375976562500000000e+00F, /* 0x3f8d4800 */ + 1.11791992187500000000e+00F, /* 0x3f8f1800 */ + 1.13183593750000000000e+00F, /* 0x3f90e000 */ + 1.14550781250000000000e+00F, /* 0x3f92a000 */ + 1.15917968750000000000e+00F, /* 0x3f946000 */ + 1.17236328125000000000e+00F, /* 0x3f961000 */ + 1.18579101562500000000e+00F, /* 0x3f97c800 */ + 1.19873046875000000000e+00F, /* 0x3f997000 */ + 1.21191406250000000000e+00F, /* 0x3f9b2000 */ + 1.22460937500000000000e+00F, /* 0x3f9cc000 */ + 1.23730468750000000000e+00F, /* 0x3f9e6000 */ + 1.25000000000000000000e+00F, /* 0x3fa00000 */ + 1.26220703125000000000e+00F, /* 0x3fa19000 */ + 1.27465820312500000000e+00F, /* 0x3fa32800 */ + 1.28686523437500000000e+00F, /* 0x3fa4b800 */ + 1.29882812500000000000e+00F, /* 0x3fa64000 */ + 1.31079101562500000000e+00F, /* 0x3fa7c800 */ + 1.32275390625000000000e+00F, /* 0x3fa95000 */ + 1.33447265625000000000e+00F, /* 0x3faad000 */ + 1.34619140625000000000e+00F, /* 0x3fac5000 */ + 1.35766601562500000000e+00F, /* 0x3fadc800 */ + 1.36914062500000000000e+00F, /* 0x3faf4000 */ + 1.38061523437500000000e+00F, /* 0x3fb0b800 */ + 1.39184570312500000000e+00F, /* 0x3fb22800 */ + 1.40307617187500000000e+00F, /* 0x3fb39800 */ + 1.41406250000000000000e+00F, /* 0x3fb50000 */ + 1.42504882812500000000e+00F, /* 0x3fb66800 */ + 1.43603515625000000000e+00F, /* 0x3fb7d000 */ + 1.44677734375000000000e+00F, /* 0x3fb93000 */ + 1.45751953125000000000e+00F, /* 0x3fba9000 */ + 1.46826171875000000000e+00F, /* 0x3fbbf000 */ + 1.47900390625000000000e+00F, /* 0x3fbd5000 */ + 1.48950195312500000000e+00F, /* 0x3fbea800 */ + 1.50000000000000000000e+00F, /* 0x3fc00000 */ + 1.51025390625000000000e+00F, /* 0x3fc15000 */ + 1.52050781250000000000e+00F, /* 0x3fc2a000 */ + 1.53076171875000000000e+00F, /* 0x3fc3f000 */ + 1.54101562500000000000e+00F, /* 0x3fc54000 */ + 1.55102539062500000000e+00F, /* 0x3fc68800 */ + 1.56103515625000000000e+00F, /* 0x3fc7d000 */ + 1.57104492187500000000e+00F, /* 0x3fc91800 */ + 1.58105468750000000000e+00F, /* 0x3fca6000 */ + 1.59082031250000000000e+00F, /* 0x3fcba000 */ + 1.60058593750000000000e+00F, /* 0x3fcce000 */ + 1.61035156250000000000e+00F, /* 0x3fce2000 */ + 1.62011718750000000000e+00F, /* 0x3fcf6000 */ + 1.62963867187500000000e+00F, /* 0x3fd09800 */ + 1.63916015625000000000e+00F, /* 0x3fd1d000 */ + 1.64868164062500000000e+00F, /* 0x3fd30800 */ + 1.65820312500000000000e+00F, /* 0x3fd44000 */ + 1.66748046875000000000e+00F, /* 0x3fd57000 */ + 1.67700195312500000000e+00F, /* 0x3fd6a800 */ + 
1.68627929687500000000e+00F, /* 0x3fd7d800 */ + 1.69555664062500000000e+00F, /* 0x3fd90800 */ + 1.70458984375000000000e+00F, /* 0x3fda3000 */ + 1.71386718750000000000e+00F, /* 0x3fdb6000 */ + 1.72290039062500000000e+00F, /* 0x3fdc8800 */ + 1.73193359375000000000e+00F, /* 0x3fddb000 */ + 1.74096679687500000000e+00F, /* 0x3fded800 */ + 1.75000000000000000000e+00F, /* 0x3fe00000 */ + 1.75878906250000000000e+00F, /* 0x3fe12000 */ + 1.76757812500000000000e+00F, /* 0x3fe24000 */ + 1.77636718750000000000e+00F, /* 0x3fe36000 */ + 1.78515625000000000000e+00F, /* 0x3fe48000 */ + 1.79394531250000000000e+00F, /* 0x3fe5a000 */ + 1.80273437500000000000e+00F, /* 0x3fe6c000 */ + 1.81127929687500000000e+00F, /* 0x3fe7d800 */ + 1.81982421875000000000e+00F, /* 0x3fe8f000 */ + 1.82836914062500000000e+00F, /* 0x3fea0800 */ + 1.83691406250000000000e+00F, /* 0x3feb2000 */ + 1.84545898437500000000e+00F, /* 0x3fec3800 */ + 1.85400390625000000000e+00F, /* 0x3fed5000 */ + 1.86230468750000000000e+00F, /* 0x3fee6000 */ + 1.87060546875000000000e+00F, /* 0x3fef7000 */ + 1.87915039062500000000e+00F, /* 0x3ff08800 */ + 1.88745117187500000000e+00F, /* 0x3ff19800 */ + 1.89550781250000000000e+00F, /* 0x3ff2a000 */ + 1.90380859375000000000e+00F, /* 0x3ff3b000 */ + 1.91210937500000000000e+00F, /* 0x3ff4c000 */ + 1.92016601562500000000e+00F, /* 0x3ff5c800 */ + 1.92822265625000000000e+00F, /* 0x3ff6d000 */ + 1.93627929687500000000e+00F, /* 0x3ff7d800 */ + 1.94433593750000000000e+00F, /* 0x3ff8e000 */ + 1.95239257812500000000e+00F, /* 0x3ff9e800 */ + 1.96044921875000000000e+00F, /* 0x3ffaf000 */ + 1.96826171875000000000e+00F, /* 0x3ffbf000 */ + 1.97631835937500000000e+00F, /* 0x3ffcf800 */ + 1.98413085937500000000e+00F, /* 0x3ffdf800 */ + 1.99194335937500000000e+00F, /* 0x3ffef800 */ + 2.00000000000000000000e+00F}; /* 0x40000000 */ + +static const float rt_jby32_trail_table_float[97] = { + 0.00000000000000000000e+00F, /* 0x00000000 */ + 1.23941208585165441036e-04F, /* 0x3901f637 */ + 1.46876545841223560274e-05F, /* 0x37766aff */ + 1.70736297150142490864e-04F, /* 0x393307ad */ + 1.13296780909877270460e-04F, /* 0x38ed99bf */ + 9.53458802541717886925e-05F, /* 0x38c7f46e */ + 1.25126505736261606216e-04F, /* 0x39033464 */ + 2.10342666832730174065e-04F, /* 0x395c8f6e */ + 1.14066875539720058441e-04F, /* 0x38ef3730 */ + 8.72047676239162683487e-05F, /* 0x38b6e1b4 */ + 1.36111237225122749805e-04F, /* 0x390eb915 */ + 2.26244374061934649944e-05F, /* 0x37bdc99c */ + 2.40658700931817293167e-04F, /* 0x397c5954 */ + 6.31069415248930454254e-05F, /* 0x38845848 */ + 2.27412077947519719601e-04F, /* 0x396e7577 */ + 5.90185391047270968556e-06F, /* 0x36c6088a */ + 1.35496389702893793583e-04F, /* 0x390e1409 */ + 1.32179571664892137051e-04F, /* 0x390a99af */ + 0.00000000000000000000e+00F, /* 0x00000000 */ + 2.31086043640971183777e-04F, /* 0x39724fb0 */ + 9.66752704698592424393e-05F, /* 0x38cabe24 */ + 8.85332483449019491673e-05F, /* 0x38b9aaed */ + 2.09980673389509320259e-04F, /* 0x395c2e42 */ + 2.20044588786549866199e-04F, /* 0x3966bbc5 */ + 1.21749282698146998882e-04F, /* 0x38ff53a6 */ + 1.62125259521417319775e-04F, /* 0x392a002b */ + 9.97955357888713479042e-05F, /* 0x38d14952 */ + 1.81545779923908412457e-04F, /* 0x393e5d53 */ + 1.65768768056295812130e-04F, /* 0x392dd237 */ + 5.48927710042335093021e-05F, /* 0x38663caa */ + 9.53875860432162880898e-05F, /* 0x38c80ad2 */ + 4.53481625299900770187e-05F, /* 0x383e3438 */ + 1.51062369695864617825e-04F, /* 0x391e667f */ + 1.70453247847035527229e-04F, /* 0x3932bbb2 */ + 1.05505387182347476482e-04F, /* 
0x38dd42c6 */ + 2.02269104192964732647e-04F, /* 0x39541833 */ + 2.18442466575652360916e-04F, /* 0x39650db4 */ + 1.55796806211583316326e-04F, /* 0x39235d63 */ + 1.60395247803535312414e-05F, /* 0x37868c9e */ + 4.49578510597348213196e-05F, /* 0x383c9120 */ + 0.00000000000000000000e+00F, /* 0x00000000 */ + 1.26840444863773882389e-04F, /* 0x39050079 */ + 1.82820076588541269302e-04F, /* 0x393fb364 */ + 1.69370483490638434887e-04F, /* 0x3931990b */ + 8.78757418831810355186e-05F, /* 0x38b849ee */ + 1.83815121999941766262e-04F, /* 0x3940be7f */ + 2.14343352126888930798e-04F, /* 0x3960c15b */ + 1.80714370799250900745e-04F, /* 0x393d7e25 */ + 8.41425862745381891727e-05F, /* 0x38b075b5 */ + 1.69945167726837098598e-04F, /* 0x3932334f */ + 1.95121858268976211548e-04F, /* 0x394c99a0 */ + 1.60778334247879683971e-04F, /* 0x3928969b */ + 6.79871009197086095810e-05F, /* 0x388e944c */ + 1.61929419846273958683e-04F, /* 0x3929cb99 */ + 1.99474830878898501396e-04F, /* 0x39512a1e */ + 1.81604162207804620266e-04F, /* 0x393e6cff */ + 1.09270178654696792364e-04F, /* 0x38e527fb */ + 2.27539261686615645885e-04F, /* 0x396e979b */ + 4.90300008095800876617e-05F, /* 0x384da590 */ + 6.28985289949923753738e-05F, /* 0x3883e864 */ + 2.58551553997676819563e-05F, /* 0x37d8e386 */ + 1.82868374395184218884e-04F, /* 0x393fc05b */ + 4.64625991298817098141e-05F, /* 0x3842e0d6 */ + 1.05703387816902250051e-04F, /* 0x38ddad13 */ + 1.17213814519345760345e-04F, /* 0x38f5d0b0 */ + 8.17377731436863541603e-05F, /* 0x38ab6aa2 */ + 0.00000000000000000000e+00F, /* 0x00000000 */ + 1.16847433673683553934e-04F, /* 0x38f50bfd */ + 1.88827965757809579372e-04F, /* 0x3946001f */ + 2.16612941585481166840e-04F, /* 0x39632298 */ + 2.00857131858356297016e-04F, /* 0x39529d2d */ + 1.42199307447299361229e-04F, /* 0x39151b56 */ + 4.12627305195201188326e-05F, /* 0x382d1185 */ + 1.42796401632949709892e-04F, /* 0x3915bb9e */ + 2.03253570361994206905e-04F, /* 0x39552077 */ + 2.23214170546270906925e-04F, /* 0x396a0e99 */ + 2.03244591830298304558e-04F, /* 0x39551e0e */ + 1.43898156238719820976e-04F, /* 0x3916e35e */ + 4.57155256299301981926e-05F, /* 0x383fbeac */ + 1.53365719597786664963e-04F, /* 0x3920d0cc */ + 2.23224633373320102692e-04F, /* 0x396a1168 */ + 1.16566716314991936088e-05F, /* 0x37439106 */ + 7.43694272387074306607e-06F, /* 0x36f98ada */ + 2.11048507480882108212e-04F, /* 0x395d4ce7 */ + 1.34682719362899661064e-04F, /* 0x390d399e */ + 2.29425968427676707506e-05F, /* 0x37c074da */ + 1.20421340398024767637e-04F, /* 0x38fc8ab7 */ + 1.83421318070031702518e-04F, /* 0x394054c9 */ + 2.12376224226318299770e-04F, /* 0x395eb14f */ + 2.07710763788782060146e-04F, /* 0x3959ccef */ + 1.69840845046564936638e-04F, /* 0x3932174e */ + 9.91739216260612010956e-05F, /* 0x38cffb98 */ + 2.40249748458154499531e-04F, /* 0x397beb8d */ + 1.05178231024183332920e-04F, /* 0x38dc9322 */ + 1.82623916771262884140e-04F, /* 0x393f7ebc */ + 2.28821940254420042038e-04F, /* 0x396fefec */ + 0.00000000000000000000e+00F}; /* 0x00000000 */ + + +/* Handle special arguments first */ + + GET_BITS_SP32(x, ux); + ax = ux & (~SIGNBIT_SP32); + + if(ax >= 0x7f800000) + { + /* x is either NaN or infinity */ + if (ux & MANTBITS_SP32) + /* x is NaN */ + return x + x; /* Raise invalid if it is a signalling NaN */ + else if (ux & SIGNBIT_SP32) + return nanf_with_flags(AMD_F_INVALID); + else + /* x is positive infinity */ + return x; + } + else if (ux & SIGNBIT_SP32) + { + /* x is negative. 
*/ + if (x == 0.0F) + /* Handle negative zero first */ + return x; + else + return nanf_with_flags(AMD_F_INVALID); + } + else if (ux <= 0x007fffff) + { + /* x is denormalised or zero */ + if (ux == 0) + /* x is zero */ + return x; + else + { + /* x is denormalised; scale it up */ + /* Normalize x by increasing the exponent by 26 + and subtracting a correction to account for the implicit + bit. This replaces a slow denormalized + multiplication by a fast normal subtraction. */ + static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */ + denorm = 1; + GET_BITS_SP32(x, ux); + PUT_BITS_SP32(ux | 0x0d800000, x); + x -= corr; + GET_BITS_SP32(x, ux); + } + } + + /* Main algorithm */ + + /* + Find y and e such that x = 2^e * y, where y in [1,4). + This is done using an in-lined variant of splitFloat, + which also ensures that e is even. + */ + y = x; + ux &= EXPBITS_SP32; + ux >>= EXPSHIFTBITS_SP32; + if (ux & 1) + { + GET_BITS_SP32(y, u); + u &= (SIGNBIT_SP32 | MANTBITS_SP32); + u |= ONEEXPBITS_SP32; + PUT_BITS_SP32(u, y); + e = ux - EXPBIAS_SP32; + } + else + { + GET_BITS_SP32(y, u); + u &= (SIGNBIT_SP32 | MANTBITS_SP32); + u |= TWOEXPBITS_SP32; + PUT_BITS_SP32(u, y); + e = ux - EXPBIAS_SP32 - 1; + } + + /* Find the index of the sub-interval of [1,4) in which y lies. */ + + index = (int)(32.0F*y+0.5); + + /* Look up the table values and compute c and r = c/t */ + + rtc_lead = rt_jby32_lead_table_float[index-32]; + rtc_trail = rt_jby32_trail_table_float[index-32]; + c = 0.03125F*index; + r = (y - c)/c; + + /* + Find q = sqrt(1+r) - 1. + From one step of Newton on (q+1)^2 = 1+r + */ + + p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F)); + twop = p + p; + q = p - (p*p + (twop - r))/(twop + 2.0); + + /* Reconstruction */ + + rtc = rtc_lead + rtc_trail; + e >>= 1; /* e = e/2 */ + z = rtc_lead + (rtc*q+rtc_trail); + + if (denorm) + { + /* Scale by 2**(e-13) */ + PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); + z *= r; + } + else + { + /* Scale by 2**e */ + PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); + z *= r; + } + + return z; + +} +#endif /* SQRTF_AMD_INLINE */ + +#ifdef USE_LOG_KERNEL_AMD +static inline void log_kernel_amd64(double x, unsigned long ux, int *xexp, double *r1, double *r2) +{ + + int expadjust; + double r, z1, z2, correction, f, f1, f2, q, u, v, poly; + int index; + + /* + Computes natural log(x). Algorithm based on: + Ping-Tak Peter Tang + "Table-driven implementation of the logarithm function in IEEE + floating-point arithmetic" + ACM Transactions on Mathematical Software (TOMS) + Volume 16, Issue 4 (December 1990) + */ + +/* Arrays ln_lead_table and ln_tail_table contain + leading and trailing parts respectively of precomputed + values of natural log(1+i/64), for i = 0, 1, ..., 64. + ln_lead_table contains the first 24 bits of precision, + and ln_tail_table contains a further 53 bits precision. 
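+   For example, the last entry (i = 64) splits ln(2): the lead part
+   0.693147122859954833984 plus the tail part of about 5.7700e-08 gives
+   ln(2) = 0.69314718055994530... to roughly 77 bits.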
*/ + + static const double ln_lead_table[65] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */ + 3.07716131210327148438e-02, /* 0x3f9f829800000000 */ + 4.58095073699951171875e-02, /* 0x3fa7745800000000 */ + 6.06245994567871093750e-02, /* 0x3faf0a3000000000 */ + 7.52233862876892089844e-02, /* 0x3fb341d700000000 */ + 8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */ + 1.03796780109405517578e-01, /* 0x3fba926d00000000 */ + 1.17783010005950927734e-01, /* 0x3fbe270700000000 */ + 1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */ + 1.45181953907012939453e-01, /* 0x3fc2955280000000 */ + 1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */ + 1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */ + 1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */ + 1.97825729846954345703e-01, /* 0x3fc9525a80000000 */ + 2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */ + 2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */ + 2.35566020011901855469e-01, /* 0x3fce270700000000 */ + 2.47836112976074218750e-01, /* 0x3fcfb91800000000 */ + 2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */ + 2.71933674812316894531e-01, /* 0x3fd1675c80000000 */ + 2.83768117427825927734e-01, /* 0x3fd22941c0000000 */ + 2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */ + 3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */ + 3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */ + 3.29753279685974121094e-01, /* 0x3fd51aad80000000 */ + 3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */ + 3.51976394653320312500e-01, /* 0x3fd686c800000000 */ + 3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */ + 3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */ + 3.84411692619323730469e-01, /* 0x3fd89a3380000000 */ + 3.94993782043457031250e-01, /* 0x3fd9479400000000 */ + 4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */ + 4.15827870368957519531e-01, /* 0x3fda9cec80000000 */ + 4.26084339618682861328e-01, /* 0x3fdb44f740000000 */ + 4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */ + 4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */ + 4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */ + 4.66089725494384765625e-01, /* 0x3fddd46a00000000 */ + 4.75845873355865478516e-01, /* 0x3fde744240000000 */ + 4.85507786273956298828e-01, /* 0x3fdf128f40000000 */ + 4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */ + 5.04556000232696533203e-01, /* 0x3fe02552a0000000 */ + 5.13945698738098144531e-01, /* 0x3fe0723e40000000 */ + 5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */ + 5.32464742660522460938e-01, /* 0x3fe109f380000000 */ + 5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */ + 5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */ + 5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */ + 5.68504691123962402344e-01, /* 0x3fe23130c0000000 */ + 5.77315330505371093750e-01, /* 0x3fe2795e00000000 */ + 5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */ + 5.94707071781158447266e-01, /* 0x3fe307d720000000 */ + 6.03290796279907226562e-01, /* 0x3fe34e2880000000 */ + 6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */ + 6.20240390300750732422e-01, /* 0x3fe3d90260000000 */ + 6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */ + 6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */ + 6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */ + 6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */ + 6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */ + 6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */ + 6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */ + 6.85303986072540283203e-01, /* 
0x3fe5ee02a0000000 */ + 6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */ + + static const double ln_tail_table[65] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 5.15092497094772879206e-09, /* 0x3e361f807c79f3db */ + 4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */ + 2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */ + 2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */ + 3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */ + 3.23392843005887000414e-08, /* 0x3e615cad69737c93 */ + 1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */ + 2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */ + 5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */ + 5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */ + 5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */ + 5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */ + 9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */ + 1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */ + 3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */ + 3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */ + 5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */ + 5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */ + 3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */ + 4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */ + 5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */ + 5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */ + 1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */ + 1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */ + 6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */ + 5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */ + 2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */ + 5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */ + 5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */ + 6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */ + 2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */ + 4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */ + 2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */ + 5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */ + 2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */ + 7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */ + 5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */ + 4.43021445893361960146e-09, /* 0x3e33071282fb989b */ + 3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */ + 2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */ + 1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */ + 1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */ + 5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */ + 7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */ + 5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */ + 3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */ + 3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */ + 5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */ + 4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */ + 3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */ + 2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */ + 3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */ + 5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */ + 3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */ + 1.94511067964296180547e-08, /* 0x3e54e2adf548084c */ + 1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */ + 3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */ + 5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */ + 3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */ + 1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */ + 4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */ + 
2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */ + 1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */ + 5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */ + + /* Approximating polynomial coefficients for x near 1.0 */ + static const double + ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */ + ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */ + ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */ + ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */ + + /* Approximating polynomial coefficients for other x */ + static const double + cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */ + cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */ + cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */ + + static const unsigned long + log_thresh1 = 0x3fee0faa00000000, + log_thresh2 = 0x3ff1082c00000000; + + /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000 + log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */ + if (ux >= log_thresh1 && ux <= log_thresh2) + { + /* Arguments close to 1.0 are handled separately to maintain + accuracy. + + The approximation in this region exploits the identity + log( 1 + r ) = log( 1 + u/2 ) / log( 1 - u/2 ), where + u = 2r / (2+r). + Note that the right hand side has an odd Taylor series expansion + which converges much faster than the Taylor series expansion of + log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by + u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1). + + One subtlety is that since u cannot be calculated from + r exactly, the rounding error in the first u should be + avoided if possible. To accomplish this, we observe that + u = r - r*r/(2+r). + Since x (=1+r) is the input argument, and thus presumed exact, + the formula above approximates u accurately because + u = r - correction, + and the magnitude of "correction" (of the order of r*r) + is small. + With these observations, we will approximate log( 1 + r ) by + r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ). + + We approximate log(1+r) by an odd polynomial in u, where + u = 2r/(2+r) = r - r*r/(2+r). + */ + r = x - 1.0; + u = r / (2.0 + r); + correction = r * u; + u = u + u; + v = u * u; + z1 = r; + z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); + *r1 = z1; + *r2 = z2; + *xexp = 0; + } + else + { + /* + First, we decompose the argument x to the form + x = 2**M * (F1 + F2), + where 1 <= F1+F2 < 2, M has the value of an integer, + F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128. + + Second, we approximate log( 1 + F2/F1 ) by an odd polynomial + in U, where U = 2 F2 / (2 F2 + F1). + Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ). + The core approximation calculates + Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1. + Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ), + thus, Poly = 2 arctanh( U/2 ) / U - 1. + + It is not hard to see that + log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ). + Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1). + The values of log(F1) are calculated beforehand and stored + in the program. + */ + + f = x; + if (ux < IMPBIT_DP64) + { + /* The input argument x is denormalized */ + /* Normalize f by increasing the exponent by 60 + and subtracting a correction to account for the implicit + bit. This replaces a slow denormalized + multiplication by a fast normal subtraction. 
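+   (Why this works: OR-ing in 0x03d0000000000000 reinterprets the
+   denormal mantissa with biased exponent 0x3d, i.e. as the value
+   2**(-962) + x * 2**60, and corr below is exactly 2**(-962), so the
+   subtraction leaves f = x * 2**60; expadjust = 60 then undoes this
+   scaling when the exponent is extracted.)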
*/ + static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ + GET_BITS_DP64(f, ux); + ux |= 0x03d0000000000000; + PUT_BITS_DP64(ux, f); + f -= corr; + GET_BITS_DP64(f, ux); + expadjust = 60; + } + else + expadjust = 0; + + /* Store the exponent of x in xexp and put + f into the range [0.5,1) */ + *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust; + PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f); + + /* Now x = 2**xexp * f, 1/2 <= f < 1. */ + + /* Set index to be the nearest integer to 128*f */ + r = 128.0 * f; + index = (int)(r + 0.5); + + z1 = ln_lead_table[index-64]; + q = ln_tail_table[index-64]; + f1 = index * 0.0078125; /* 0.0078125 = 1/128 */ + f2 = f - f1; + /* At this point, x = 2**xexp * ( f1 + f2 ) where + f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */ + + /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */ + /* u = f2 / (f1 + 0.5 * f2); */ + u = f2 / (f1 + 0.5 * f2); + + /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1). + The core approximation calculates + poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */ + v = u * u; + poly = (v * (cb_1 + v * (cb_2 + v * cb_3))); + z2 = q + (u + u * poly); + *r1 = z1; + *r2 = z2; + } + return; +} +#endif /* USE_LOG_KERNEL_AMD */ + +#if defined(USE_REMAINDER_PIBY2F_INLINE) +/* Define this to get debugging print statements activated */ +#define DEBUGGING_PRINT +#undef DEBUGGING_PRINT + + +#ifdef DEBUGGING_PRINT +#include +char *d2b(long d, int bitsper, int point) +{ + static char buff[200]; + int i, j; + j = bitsper; + if (point >= 0 && point <= bitsper) + j++; + buff[j] = '\0'; + for (i = bitsper - 1; i >= 0; i--) + { + j--; + if (d % 2 == 1) + buff[j] = '1'; + else + buff[j] = '0'; + if (i == point) + { + j--; + buff[j] = '.'; + } + d /= 2; + } + return buff; +} +#endif + +/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using + extra precision, and return the result in r. + Return value "region" tells how many lots of pi/2 were subtracted + from x to put it in the range [-pi/4,pi/4], mod 4. 
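+   For example, x = 100.0: x*2/pi = 63.66..., which rounds to 64
+   quarter-turns, so the routine returns region = 64 mod 4 = 0 and
+   r = 100 - 64*(pi/2) = -0.53096...  A hypothetical caller computing
+   sinf(100.0f) would then simply evaluate sin(r) = -0.50637..., which
+   is indeed sin(100).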
*/ +static inline void __remainder_piby2f_inline(unsigned long ux, double *r, int *region) +{ + + /* This method simulates multi-precision floating-point + arithmetic and is accurate for all 1 <= x < infinity */ +#if 0 + const int bitsper = 36; +#else +#define bitsper 36 +#endif + unsigned long res[10]; + unsigned long u, carry, mask, mant, nextbits; + int first, last, i, rexp, xexp, resexp, ltb, determ, bc; + double dx; + static const double + piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ + static unsigned long pibits[] = + { + 0LL, + 5215LL, 13000023176LL, 11362338026LL, 67174558139LL, + 34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL, + 19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL + }; + + xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); + ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29; + + + /* Now ux is the mantissa bit pattern of x as a long integer */ + mask = 1; + mask = (mask << bitsper) - 1; + + /* Set first and last to the positions of the first + and last chunks of 2/pi that we need */ + first = xexp / bitsper; + resexp = xexp - first * bitsper; + /* 120 is the theoretical maximum number of bits (actually + 115 for IEEE single precision) that we need to extract + from the middle of 2/pi to compute the reduced argument + accurately enough for our purposes */ + last = first + 120 / bitsper; + +#ifdef DEBUGGING_PRINT + printf("first = %d, last = %d\n", first, last); +#endif + + /* Do a long multiplication of the bits of 2/pi by the + integer mantissa */ +#if 0 + for (i = last; i >= first; i--) + { + u = pibits[i] * ux + carry; + res[i - first] = u & mask; + carry = u >> bitsper; + } + res[last - first + 1] = 0; +#else + /* Unroll the loop. This is only correct because we know + that bitsper is fixed as 36. */ + res[4] = 0; + u = pibits[last] * ux; + res[3] = u & mask; + carry = u >> bitsper; + u = pibits[last - 1] * ux + carry; + res[2] = u & mask; + carry = u >> bitsper; + u = pibits[last - 2] * ux + carry; + res[1] = u & mask; + carry = u >> bitsper; + u = pibits[first] * ux + carry; + res[0] = u & mask; +#endif + +#ifdef DEBUGGING_PRINT + printf("resexp = %d\n", resexp); + printf("Significant part of x * 2/pi with binary" + " point in correct place:\n"); + for (i = 0; i <= last - first; i++) + { + if (i > 0 && i % 5 == 0) + printf("\n "); + if (i == 1) + printf("%s ", d2b(res[i], bitsper, resexp)); + else + printf("%s ", d2b(res[i], bitsper, -1)); + } + printf("\n"); +#endif + + /* Reconstruct the result */ + ltb = (int)((((res[0] << bitsper) | res[1]) + >> (bitsper - 1 - resexp)) & 7); + + /* determ says whether the fractional part is >= 0.5 */ + determ = ltb & 1; + +#ifdef DEBUGGING_PRINT + printf("ltb = %d (last two bits before binary point" + " and first bit after)\n", ltb); + printf("determ = %d (1 means need to negate because the fractional\n" + " part of x * 2/pi is greater than 0.5)\n", determ); +#endif + + i = 1; + if (determ) + { + /* The mantissa is >= 0.5. 
We want to subtract it + from 1.0 by negating all the bits */ + *region = ((ltb >> 1) + 1) & 3; + mant = 1; + mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1); + while (mant < 0x0000000000010000) + { + i++; + mant = (mant << bitsper) | (~(res[i]) & mask); + } + nextbits = (~(res[i+1]) & mask); + } + else + { + *region = (ltb >> 1); + mant = 1; + mant = res[1] & ((mant << (bitsper - resexp)) - 1); + while (mant < 0x0000000000010000) + { + i++; + mant = (mant << bitsper) | res[i]; + } + nextbits = res[i+1]; + } + +#ifdef DEBUGGING_PRINT + printf("First bits of mant = %s\n", d2b(mant, bitsper, -1)); +#endif + + /* Normalize the mantissa. The shift value 6 here, determined by + trial and error, seems to give optimal speed. */ + bc = 0; + while (mant < 0x0000400000000000) + { + bc += 6; + mant <<= 6; + } + while (mant < 0x0010000000000000) + { + bc++; + mant <<= 1; + } + mant |= nextbits >> (bitsper - bc); + + rexp = 52 + resexp - bc - i * bitsper; + +#ifdef DEBUGGING_PRINT + printf("Normalised mantissa = 0x%016lx\n", mant); + printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp); +#endif + + /* Put the result exponent rexp onto the mantissa pattern */ + u = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64; + ux = (mant & MANTBITS_DP64) | u; + if (determ) + /* If we negated the mantissa we negate x too */ + ux |= SIGNBIT_DP64; + PUT_BITS_DP64(ux, dx); + +#ifdef DEBUGGING_PRINT + printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx)); +#endif + + /* x is a double precision version of the fractional part of + x * 2 / pi. Multiply x by pi/2 in double precision + to get the reduced argument r. */ + *r = dx * piby2; + +#ifdef DEBUGGING_PRINT + printf(" r = frac(x*2/pi) * pi/2:\n"); + printf(" r = %25.20e = %s\n", *r, double2hex(r)); + printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n", + *region); +#endif +} +#endif /* USE_REMAINDER_PIBY2F_INLINE */ + +#endif /* LIBM_INLINES_AMD_H_INCLUDED */ diff --git a/sdk/lib/crt/math/libm_sse2/libm_new.h b/sdk/lib/crt/math/libm_sse2/libm_new.h new file mode 100644 index 00000000000..d6d54f9d36a --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/libm_new.h @@ -0,0 +1,122 @@ + +/***********************************************************************************/ +/** MIT License **/ +/** ----------- **/ +/** **/ +/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/ +/** **/ +/** Permission is hereby granted, free of charge, to any person obtaining a copy **/ +/** of this Software and associated documentaon files (the "Software"), to deal **/ +/** in the Software without restriction, including without limitation the rights **/ +/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ +/** copies of the Software, and to permit persons to whom the Software is **/ +/** furnished to do so, subject to the following conditions: **/ +/** **/ +/** The above copyright notice and this permission notice shall be included in **/ +/** all copies or substantial portions of the Software. **/ +/** **/ +/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ +/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ +/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
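Since __remainder_piby2f_inline only delivers a reduced argument and a quadrant, a caller still has to choose the core function and sign from region. The fragment below is a hypothetical caller, not code from this patch: it uses the C library's sin/cos purely as stand-ins for the single-precision core polynomials, and it assumes 1 <= x < infinity, the range for which the routine above is stated to be accurate.

#include <math.h>   /* sin/cos used only as stand-ins for the core kernels */

static float sinf_large_sketch(float x)      /* assumes 1 <= x < infinity */
{
    unsigned long ux;   /* 64 bits wide: libm_util.h redefines long as __int64 */
    double dx = (double)x, r;
    int region;

    GET_BITS_DP64(dx, ux);
    __remainder_piby2f_inline(ux, &r, &region);

    /* x = N*pi/2 + r with |r| <= pi/4 and region = N mod 4 */
    switch (region & 3)
    {
    case 0:  return (float)sin(r);
    case 1:  return (float)cos(r);
    case 2:  return (float)-sin(r);
    default: return (float)-cos(r);
    }
}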
IN NO EVENT SHALL THE **/ +/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ +/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ +/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ +/** THE SOFTWARE. **/ +/***********************************************************************************/ + +#ifndef __LIBM_NEW_H__ +#define __LIBM_NEW_H__ + +// Defines, protos, etc for *new* math funcs updated by AMD 11/2008 +// Old files will continue to include libm_util.h, libm.h, libm_inlines.h +// until such time as these have all been refreshed w/ new versions. + +typedef float F32; +typedef unsigned int U32; + +typedef double F64; +typedef unsigned long long U64; + +union UT32_ +{ + F32 f32; + U32 u32; +}; + +union UT64_ +{ + F64 f64; + U64 u64; + + F32 f32[2]; + U32 u32[2]; +}; + +typedef union UT32_ UT32; +typedef union UT64_ UT64; + +#define SIGN_MASK_32 0x80000000 +#define MANTISSA_MASK_32 0x007fffff +#define EXPONENT_MASK_32 0x7f800000 +#define QNAN_MASK_32 0x00400000 + +#define INF_POS_32 0x7f800000 +#define INF_NEG_32 0xff800000 +#define QNAN_POS_32 0x7fc00000 +#define QNAN_NEG_32 0xffc00000 +#define IND_32 0xffc00000 + +#define EXPONENT_FULL_32 0x7f800000 +#define SIGN_SET_32 0x80000000 +#define QNAN_SET_32 0x00400000 + +#define INF_POS_64 0x7ff0000000000000 +#define INF_NEG_64 0xfff0000000000000 + +#define MANTISSA_MASK_64 0x000fffffffffffff +#define SIGN_MASK_64 0x8000000000000000 +#define IND_64 0xfff8000000000000 +#define QNAN_MASK_64 0x0008000000000000 + +// constants for 'flags' argument of _handle_error and _handle_errorf +#define AMD_F_INEXACT 0x00000010 +#define AMD_F_OVERFLOW 0x00000001 +#define AMD_F_UNDERFLOW 0x00000002 +#define AMD_F_DIVBYZERO 0x00000004 +#define AMD_F_INVALID 0x00000008 + +// define the Microsoft specific error handling routine + +// Note to mainainers: +// These prototypes may appear, at first glance, to differ from the versions +// declared in libm_inlines.h and defined in libm_error.c. The third +// parameter appears to have changed type from unsigned long to unsigned long +// long. In fact they are the same because in both of the aforementioned +// files, long has been #defined to __int64 in a most cowardly fashion. This +// disgusts me. The buck stops here. - MAS + +double _handle_error( + char *fname, + int opcode, + unsigned long long value, + int type, + int flags, + int error, + double arg1, + double arg2, + int nargs + ); +float _handle_errorf( + char *fname, + int opcode, + unsigned long long value, + int type, + int flags, + int error, + float arg1, + float arg2, + int nargs + ); + +#endif // __LIBM_NEW_H + diff --git a/sdk/lib/crt/math/libm_sse2/libm_util.h b/sdk/lib/crt/math/libm_sse2/libm_util.h new file mode 100644 index 00000000000..e62d32d155f --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/libm_util.h @@ -0,0 +1,150 @@ +/***********************************************************************************/ +/** MIT License **/ +/** ----------- **/ +/** **/ +/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
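The UT32/UT64 unions and the bit masks above are the building blocks the "new-style" float routines work with. As a small illustration of how they compose (both helpers are hypothetical, shown only to demonstrate the masks, and are not part of the header):

static int is_nan_f(F32 x)
{
    UT32 val;
    val.f32 = x;
    /* NaN: exponent field all ones and a non-zero mantissa */
    return ((val.u32 & EXPONENT_MASK_32) == EXPONENT_MASK_32) &&
           ((val.u32 & MANTISSA_MASK_32) != 0);
}

static F32 quiet_nan_f(F32 x)
{
    UT32 val;
    val.f32 = x;
    val.u32 |= QNAN_SET_32;   /* set the quiet bit, e.g. when propagating a NaN input */
    return val.f32;
}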
**/ +/** **/ +/** Permission is hereby granted, free of charge, to any person obtaining a copy **/ +/** of this Software and associated documentaon files (the "Software"), to deal **/ +/** in the Software without restriction, including without limitation the rights **/ +/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ +/** copies of the Software, and to permit persons to whom the Software is **/ +/** furnished to do so, subject to the following conditions: **/ +/** **/ +/** The above copyright notice and this permission notice shall be included in **/ +/** all copies or substantial portions of the Software. **/ +/** **/ +/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ +/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ +/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/ +/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ +/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ +/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ +/** THE SOFTWARE. **/ +/***********************************************************************************/ + +#ifndef LIBM_UTIL_AMD_H_INCLUDED +#define LIBM_UTIL_AMD_H_INCLUDED 1 + +#define inline __inline +#undef long +#define long __int64 + +#include "emmintrin.h" +#include "float.h" + + + +/* Compile-time verification that type long is the same size + as type double (i.e. we are really on a 64-bit machine) */ +void check_long_against_double_size(int machine_is_64_bit[(sizeof(long) == sizeof(double))?1:-1]); + + +/* Definitions for double functions on 64 bit machines */ +#define SIGNBIT_DP64 0x8000000000000000 +#define EXPBITS_DP64 0x7ff0000000000000 +#define MANTBITS_DP64 0x000fffffffffffff +#define ONEEXPBITS_DP64 0x3ff0000000000000 +#define TWOEXPBITS_DP64 0x4000000000000000 +#define HALFEXPBITS_DP64 0x3fe0000000000000 +#define IMPBIT_DP64 0x0010000000000000 +#define QNANBITPATT_DP64 0x7ff8000000000000 +#define INDEFBITPATT_DP64 0xfff8000000000000 +#define PINFBITPATT_DP64 0x7ff0000000000000 +#define NINFBITPATT_DP64 0xfff0000000000000 +#define EXPBIAS_DP64 1023 +#define EXPSHIFTBITS_DP64 52 +#define BIASEDEMIN_DP64 1 +#define EMIN_DP64 -1022 +#define BIASEDEMAX_DP64 2046 +#define EMAX_DP64 1023 +#define LAMBDA_DP64 1.0e300 +#define MANTLENGTH_DP64 53 +#define BASEDIGITS_DP64 15 + + +/* These definitions, used by float functions, + are for both 32 and 64 bit machines */ +#define SIGNBIT_SP32 0x80000000 +#define EXPBITS_SP32 0x7f800000 +#define MANTBITS_SP32 0x007fffff +#define ONEEXPBITS_SP32 0x3f800000 +#define TWOEXPBITS_SP32 0x40000000 +#define HALFEXPBITS_SP32 0x3f000000 +#define IMPBIT_SP32 0x00800000 +#define QNANBITPATT_SP32 0x7fc00000 +#define INDEFBITPATT_SP32 0xffc00000 +#define PINFBITPATT_SP32 0x7f800000 +#define NINFBITPATT_SP32 0xff800000 +#define EXPBIAS_SP32 127 +#define EXPSHIFTBITS_SP32 23 +#define BIASEDEMIN_SP32 1 +#define EMIN_SP32 -126 +#define BIASEDEMAX_SP32 254 +#define EMAX_SP32 127 +#define LAMBDA_SP32 1.0e30 +#define MANTLENGTH_SP32 24 +#define BASEDIGITS_SP32 7 + +#define CLASS_SIGNALLING_NAN 1 +#define CLASS_QUIET_NAN 2 +#define CLASS_NEGATIVE_INFINITY 3 +#define CLASS_NEGATIVE_NORMAL_NONZERO 4 +#define CLASS_NEGATIVE_DENORMAL 5 +#define CLASS_NEGATIVE_ZERO 6 +#define CLASS_POSITIVE_ZERO 7 +#define CLASS_POSITIVE_DENORMAL 8 +#define CLASS_POSITIVE_NORMAL_NONZERO 9 +#define CLASS_POSITIVE_INFINITY 10 + +#define 
OLD_BITS_SP32(x) (*((unsigned int *)&x)) +#define OLD_BITS_DP64(x) (*((unsigned long *)&x)) + +/* Alternatives to the above functions which don't have + problems when using high optimization levels on gcc */ +#define GET_BITS_SP32(x, ux) \ + { \ + volatile union {float f; unsigned int i;} _bitsy; \ + _bitsy.f = (x); \ + ux = _bitsy.i; \ + } +#define PUT_BITS_SP32(ux, x) \ + { \ + volatile union {float f; unsigned int i;} _bitsy; \ + _bitsy.i = (ux); \ + x = _bitsy.f; \ + } + +#define GET_BITS_DP64(x, ux) \ + { \ + volatile union {double d; unsigned long i;} _bitsy; \ + _bitsy.d = (x); \ + ux = _bitsy.i; \ + } +#define PUT_BITS_DP64(ux, x) \ + { \ + volatile union {double d; unsigned long i;} _bitsy; \ + _bitsy.i = (ux); \ + x = _bitsy.d; \ + } + + +/* Processor-dependent floating-point status flags */ +#define AMD_F_OVERFLOW 0x00000001 +#define AMD_F_UNDERFLOW 0x00000002 +#define AMD_F_DIVBYZERO 0x00000004 +#define AMD_F_INVALID 0x00000008 +#define AMD_F_INEXACT 0x00000010 + +/* Processor-dependent floating-point precision-control flags */ +#define AMD_F_EXTENDED 0x00000300 +#define AMD_F_DOUBLE 0x00000200 +#define AMD_F_SINGLE 0x00000000 + +/* Processor-dependent floating-point rounding-control flags */ +#define AMD_F_RC_NEAREST 0x00000000 +#define AMD_F_RC_DOWN 0x00002000 +#define AMD_F_RC_UP 0x00004000 +#define AMD_F_RC_ZERO 0x00006000 + +#endif /* LIBM_UTIL_AMD_H_INCLUDED */ diff --git a/sdk/lib/crt/math/libm_sse2/log.asm b/sdk/lib/crt/math/libm_sse2/log.asm new file mode 100644 index 00000000000..585f70d3a31 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log.asm @@ -0,0 +1,557 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; log.asm +; +; An implementation of the log libm function. 
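The GET_BITS/PUT_BITS macros are the sanctioned way to move between a floating-point value and its bit pattern here; as the comment notes, the OLD_BITS_* casts can misbehave at high optimization levels, which is why the volatile-union forms exist. Two illustrative helpers, not part of the header, showing the idiom the kernels above rely on:

static int unbiased_exponent(double x)   /* for a normal, non-zero x */
{
    unsigned long ux;   /* 64 bits wide: this header redefines long as __int64 */
    GET_BITS_DP64(x, ux);
    return (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
}

static double mantissa_in_half_to_one(double x)
{
    unsigned long ux;
    double f;
    GET_BITS_DP64(x, ux);
    PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);   /* f in [0.5, 1) */
    return f;
}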
+; +; Prototype: +; +; double log(double x); +; + +; +; Algorithm: +; +; Based on: +; Ping-Tak Peter Tang +; "Table-driven implementation of the logarithm function in IEEE +; floating-point arithmetic" +; ACM Transactions on Mathematical Software (TOMS) +; Volume 16, Issue 4 (December 1990) +; +; +; x very close to 1.0 is handled differently, for x everywhere else +; a brief explanation is given below +; +; x = (2^m)*A +; x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9)) +; x = (2^m)*2*(G/2+g/2) +; x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10)) +; +; Y = (2^(-1))*(2^(-m))*(2^m)*A +; Now, range of Y is: 0.5 <= Y < 1 +; +; F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit) +; Now, range of F is: 256 <= F <= 512 +; F = F / 512 +; Now, range of F is: 0.5 <= F <= 1 +; +; f = -(Y-F), with (f <= 2^(-10)) +; +; log(x) = m*log(2) + log(2) + log(F-f) +; log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F)) +; log(x) = m*log(2) + log(2*F) + log(1-r) +; +; r = (f/F), with (r <= 2^(-9)) +; r = f*(1/F) with (1/F) precomputed to avoid division +; +; log(x) = m*log(2) + log(G) - poly +; +; log(G) is precomputed +; poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5) + (r^6)/6)) +; +; log(2) and log(G) need to be maintained in extra precision +; to avoid losing precision in the calculations +; + +.const +ALIGN 16 + +__real_ninf DQ 0fff0000000000000h ; -inf + DQ 0000000000000000h +__real_inf DQ 7ff0000000000000h ; +inf + DQ 0000000000000000h +__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN + DQ 0000000000000000h +__real_qnanbit DQ 0008000000000000h + DQ 0000000000000000h +__real_min_norm DQ 0010000000000000h + DQ 0000000000000000h +__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits + DQ 0000000000000000h +__mask_1023 DQ 00000000000003ffh + DQ 0000000000000000h +__mask_001 DQ 0000000000000001h + DQ 0000000000000000h + +__mask_mant_all8 DQ 000ff00000000000h + DQ 0000000000000000h +__mask_mant9 DQ 0000080000000000h + DQ 0000000000000000h + +__real_two DQ 4000000000000000h ; 2 + DQ 0000000000000000h + +__real_one DQ 3ff0000000000000h ; 1 + DQ 0000000000000000h + +__real_near_one_lt DQ 3fee000000000000h ; .9375 + DQ 0000000000000000h + +__real_near_one_gt DQ 3ff1000000000000h ; 1.0625 + DQ 0000000000000000h + +__real_half DQ 3fe0000000000000h ; 1/2 + DQ 0000000000000000h + +__mask_100 DQ 0000000000000100h + DQ 0000000000000000h + +__real_1_over_512 DQ 3f60000000000000h + DQ 0000000000000000h + +__real_1_over_2 DQ 3fe0000000000000h + DQ 0000000000000000h +__real_1_over_3 DQ 3fd5555555555555h + DQ 0000000000000000h +__real_1_over_4 DQ 3fd0000000000000h + DQ 0000000000000000h +__real_1_over_5 DQ 3fc999999999999ah + DQ 0000000000000000h +__real_1_over_6 DQ 3fc5555555555555h + DQ 0000000000000000h + +__mask_1023_f DQ 0c08ff80000000000h + DQ 0000000000000000h + +__mask_2045 DQ 00000000000007fdh + DQ 0000000000000000h + +__real_threshold DQ 3fb0000000000000h ; .0625 + DQ 0000000000000000h + +__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit + DQ 0000000000000000h + +__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02 + DQ 0000000000000000h +__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02 + DQ 0000000000000000h +__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03 + DQ 0000000000000000h +__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04 + DQ 0000000000000000h +__real_log2_lead DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01 + DQ 00000000000000000h +__real_log2_tail DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08 + DQ 00000000000000000h + +; these codes and the 
ones in the corresponding .c file have to match +__flag_x_zero DD 00000001 +__flag_x_neg DD 00000002 +__flag_x_nan DD 00000003 + + +EXTRN __log_256_lead:QWORD +EXTRN __log_256_tail:QWORD +EXTRN __log_F_inv_qword:QWORD +EXTRN __use_fma3_lib:DWORD + + +fname TEXTEQU +fname_special TEXTEQU <_log_special> + +; define local variable storage offsets + +save_xmm6 EQU 20h +dummy_space EQU 40h + +stack_size EQU 58h + +include fm.inc + +; external function +EXTERN fname_special:PROC + +.code +ALIGN 16 +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + SaveXmm xmm6, save_xmm6 + .ENDPROLOG + + cmp DWORD PTR __use_fma3_lib, 0 + jne Llog_fma3 + +Llog_sse2: + + ; compute exponent part + movdqa xmm3, xmm0 + movapd xmm4, xmm0 + psrlq xmm3, 52 + movd rax, xmm0 + psubq xmm3, XMMWORD PTR __mask_1023 + + ; NaN or inf + mov rcx, rax + btr rcx, 63 + cmp rcx, QWORD PTR __real_inf + jae __x_is_inf_or_nan + + movdqa xmm2, xmm0 + cvtdq2pd xmm6, xmm3 ; xexp + + + pand xmm2, XMMWORD PTR __real_mant + subsd xmm4, QWORD PTR __real_one + + comisd xmm6, QWORD PTR __mask_1023_f + je __denormal_adjust + +__continue_common: + + andpd xmm4, XMMWORD PTR __real_notsign + ; compute index into the log tables + mov r9, rax + and rax, QWORD PTR __mask_mant_all8 + and r9, QWORD PTR __mask_mant9 + shl r9, 1 + add rax, r9 + movd xmm1, rax + + ; near one codepath + comisd xmm4, QWORD PTR __real_threshold + jb __near_one + + ; F, Y + shr rax, 44 + por xmm2, XMMWORD PTR __real_half + por xmm1, XMMWORD PTR __real_half + lea r9, __log_F_inv_qword + + ; check for negative numbers or zero + xorpd xmm5, xmm5 + comisd xmm0, xmm5 + jbe __x_is_zero_or_neg + + ; f = F - Y, r = f * inv + subsd xmm1, xmm2 ; xmm1 <-- f = F - Y + mulsd xmm1, QWORD PTR [r9+rax*8] ; xmm1 <-- r = f * inv + + movapd xmm2, xmm1 ; xmm2 <-- copy of r + movapd xmm0, xmm1 ; xmm0 <-- copy of r + lea r9, QWORD PTR __log_256_lead + + ; poly + movsd xmm3, QWORD PTR __real_1_over_6 + movsd xmm1, QWORD PTR __real_1_over_3 + mulsd xmm3, xmm2 ; xmm3 <-- r/6 + mulsd xmm1, xmm2 ; xmm1 <-- r/3 + mulsd xmm0, xmm2 ; xmm0 <-- r*r + movapd xmm4, xmm0 ; xmm4 <-- copy of r*r + addsd xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5 + addsd xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2 + mulsd xmm4, xmm0 ; xmm4 <-- r^4 + mulsd xmm3, xmm2 ; xmm3 <-- (r/6 + 1/5)*r + mulsd xmm1, xmm0 ; xmm1 <-- (r/3 + 1/2)*r^2 + addsd xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4 + addsd xmm1, xmm2 ; xmm1 <-- (r/3 + 1/2)*r^2 + r + mulsd xmm3, xmm4 ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4 + addsd xmm1, xmm3 ; xmm1 <-- poly + + ; m*log(2)_tail + log(G)_tail - poly + movsd xmm5, QWORD PTR __real_log2_tail + mulsd xmm5, xmm6 ; xmm5 <-- m*log2_tail + subsd xmm5, xmm1 ; xmm5 <-- m*log2_tail - poly + + movsd xmm0, QWORD PTR [r9+rax*8] ; xmm0 <-- log(G)_lead + lea rdx, QWORD PTR __log_256_tail + movsd xmm2, QWORD PTR [rdx+rax*8] ; xmm2 <-- log(G)_tail + addsd xmm2, xmm5 ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail + + movsd xmm4, QWORD PTR __real_log2_lead + mulsd xmm4, xmm6 ; xmm4 <-- m*log2_lead + addsd xmm0, xmm4 ; xmm0 <-- m*log2_lead + log(G)_lead + + addsd xmm0, xmm2 ; xmm0 <-- m*log(2)_tail + log(G)_tail - poly + + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +__near_one: + + ; r = x - 1.0 + movsd xmm2, QWORD PTR __real_two + subsd xmm0, QWORD PTR __real_one ; r + + addsd xmm2, xmm0 + movsd xmm1, xmm0 + divsd xmm1, xmm2 ; r/(2+r) = u/2 + + movsd xmm4, QWORD PTR __real_ca2 + movsd xmm5, QWORD PTR __real_ca4 + + movsd xmm6, xmm0 + mulsd xmm6, xmm1 ; 
correction + + addsd xmm1, xmm1 ; u + movsd xmm2, xmm1 + + mulsd xmm2, xmm1 ; u^2 + + mulsd xmm4, xmm2 + mulsd xmm5, xmm2 + + addsd xmm4, __real_ca1 + addsd xmm5, __real_ca3 + + mulsd xmm2, xmm1 ; u^3 + mulsd xmm4, xmm2 + + mulsd xmm2, xmm2 + mulsd xmm2, xmm1 ; u^7 + mulsd xmm5, xmm2 + + addsd xmm4, xmm5 + subsd xmm4, xmm6 + addsd xmm0, xmm4 + + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +__denormal_adjust: + por xmm2, XMMWORD PTR __real_one + subsd xmm2, QWORD PTR __real_one + movsd xmm5, xmm2 + pand xmm2, XMMWORD PTR __real_mant + movd rax, xmm2 + psrlq xmm5, 52 + psubd xmm5, XMMWORD PTR __mask_2045 + cvtdq2pd xmm6, xmm5 + jmp __continue_common + +ALIGN 16 +__x_is_zero_or_neg: + jne __x_is_neg + + movsd xmm1, QWORD PTR __real_ninf + mov r8d, DWORD PTR __flag_x_zero + call fname_special + jmp __finish + +ALIGN 16 +__x_is_neg: + + movsd xmm1, QWORD PTR __real_neg_qnan + mov r8d, DWORD PTR __flag_x_neg + call fname_special + jmp __finish + +ALIGN 16 +__x_is_inf_or_nan: + + cmp rax, QWORD PTR __real_inf + je __finish + + cmp rax, QWORD PTR __real_ninf + je __x_is_neg + + or rax, QWORD PTR __real_qnanbit + movd xmm1, rax + mov r8d, DWORD PTR __flag_x_nan + call fname_special + jmp __finish + +ALIGN 16 +__finish: + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Llog_fma3: + ; compute exponent part + xor rax,rax + vpsrlq xmm3,xmm0,52 + vmovq rax,xmm0 + vpsubq xmm3,xmm3,QWORD PTR __mask_1023 + vcvtdq2pd xmm6,xmm3 ; xexp + + ; NaN or inf + vpand xmm5,xmm0,QWORD PTR __real_inf + vcomisd xmm5,QWORD PTR __real_inf + je Llog_fma3_x_is_inf_or_nan + + ; check for negative numbers or zero + vpxor xmm5,xmm5,xmm5 + vcomisd xmm0,xmm5 + jbe Llog_fma3_x_is_zero_or_neg + + vpand xmm2,xmm0,QWORD PTR __real_mant + vsubsd xmm4,xmm0,QWORD PTR __real_one + + vcomisd xmm6,QWORD PTR __mask_1023_f + je Llog_fma3_denormal_adjust + +Llog_fma3_continue_common: + ; compute index into the log tables + vpand xmm1,xmm0,QWORD PTR __mask_mant_all8 + vpand xmm3,xmm0,QWORD PTR __mask_mant9 + vpsllq xmm3,xmm3,1 + vpaddq xmm1,xmm3,xmm1 + vmovq rax,xmm1 + + ; near one codepath + vpand xmm4,xmm4,QWORD PTR __real_notsign + vcomisd xmm4,QWORD PTR __real_threshold + jb Llog_fma3_near_one + + ; F,Y + shr rax,44 + vpor xmm2,xmm2,QWORD PTR __real_half + vpor xmm1,xmm1,QWORD PTR __real_half + lea r9,QWORD PTR __log_F_inv_qword + + ; f = F - Y,r = f * inv + vsubsd xmm1,xmm1,xmm2 + vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8] + + lea r9,QWORD PTR __log_256_lead + + ; poly + vmulsd xmm0,xmm1,xmm1 ; r*r + vmovsd xmm3,QWORD PTR __real_1_over_6 + vmovsd xmm5,QWORD PTR __real_1_over_3 + vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5 + vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3 + vmovsd xmm4,xmm0,xmm0 + vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6) + + vmulsd xmm4,xmm0,xmm0 ; r*r*r*r + vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r + vfmadd231sd xmm1,xmm3,xmm4 + + ; m*log(2) + log(G) - poly + vmovsd xmm5,QWORD PTR __real_log2_tail + vfmsub213sd xmm5,xmm6,xmm1 + + vmovsd xmm0,QWORD PTR[r9 + rax * 8] + lea rdx,QWORD PTR __log_256_tail + vmovsd xmm1,QWORD PTR[rdx + rax * 8] + vaddsd xmm1,xmm1,xmm5 + + vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead + + vaddsd xmm0,xmm0,xmm1 + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + + +ALIGN 16 +Llog_fma3_near_one: + + ; r = x - 1.0 + vmovsd xmm3,QWORD PTR __real_two + vsubsd xmm0,xmm0,QWORD PTR __real_one ; r + + vaddsd xmm3,xmm3,xmm0 + vdivsd xmm1,xmm0,xmm3 ; r/(2+r) = u/2 + + 
vmovsd xmm4,QWORD PTR __real_ca2 + vmovsd xmm5,QWORD PTR __real_ca4 + + vmulsd xmm3,xmm0,xmm1 ; correction + vaddsd xmm1,xmm1,xmm1 ; u + + vmulsd xmm2,xmm1,xmm1 ; u^2 + vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 + vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 + + vmulsd xmm2,xmm2,xmm1 ; u^3 + vmulsd xmm4,xmm4,xmm2 + + vmulsd xmm2,xmm2,xmm2 + vmulsd xmm2,xmm2,xmm1 ; u^7 + + vfmadd231sd xmm4,xmm5,xmm2 + vsubsd xmm4,xmm4,xmm3 + vaddsd xmm0,xmm0,xmm4 + + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + + +Llog_fma3_denormal_adjust: + vpor xmm2,xmm2,QWORD PTR __real_one + vsubsd xmm2,xmm2,QWORD PTR __real_one + vpsrlq xmm5,xmm2,52 + vpand xmm2,xmm2,QWORD PTR __real_mant + vmovapd xmm0,xmm2 + vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045 + vcvtdq2pd xmm6,xmm5 + jmp Llog_fma3_continue_common + +ALIGN 16 +Llog_fma3_x_is_zero_or_neg: + jne Llog_fma3_x_is_neg + vmovsd xmm1,QWORD PTR __real_ninf + mov r8d,DWORD PTR __flag_x_zero + call fname_special + + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Llog_fma3_x_is_neg: + + vmovsd xmm1,QWORD PTR __real_neg_qnan + mov r8d,DWORD PTR __flag_x_neg + call fname_special + + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Llog_fma3_x_is_inf_or_nan: + + cmp rax,QWORD PTR __real_inf + je Llog_fma3_finish + + cmp rax,QWORD PTR __real_ninf + je Llog_fma3_x_is_neg + + or rax,QWORD PTR __real_qnanbit + vmovq xmm1,rax + mov r8d,DWORD PTR __flag_x_nan + call fname_special + +ALIGN 16 +Llog_fma3_finish: + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret +fname endp + +END + diff --git a/sdk/lib/crt/math/libm_sse2/log10.asm b/sdk/lib/crt/math/libm_sse2/log10.asm new file mode 100644 index 00000000000..790b535d0c7 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log10.asm @@ -0,0 +1,565 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; log10.asm +; +; An implementation of the log10 libm function. 
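Before the main path, log.asm filters out the inputs that never reach the table lookup. A C restatement of that dispatch may be easier to read than the branch labels; it is illustrative only: the real code routes each case through _log_special so that the appropriate flags and errno handling happen, which is omitted here, and the function and parameter names are hypothetical.

static double log_special_cases_sketch(double x, int *handled)
{
    unsigned long ux;   /* 64 bits wide */
    GET_BITS_DP64(x, ux);
    *handled = 1;
    if ((ux & ~SIGNBIT_DP64) > PINFBITPATT_DP64)   /* NaN in: return it quieted */
    {
        PUT_BITS_DP64(ux | QNANBITPATT_DP64, x);
        return x;
    }
    if (ux == PINFBITPATT_DP64)                    /* log(+inf) = +inf */
        return x;
    if ((ux & ~SIGNBIT_DP64) == 0)                 /* log(+/-0) = -inf */
    {
        PUT_BITS_DP64(NINFBITPATT_DP64, x);
        return x;
    }
    if (ux & SIGNBIT_DP64)                         /* negative (including -inf): qNaN */
    {
        PUT_BITS_DP64(INDEFBITPATT_DP64, x);
        return x;
    }
    *handled = 0;                                  /* fall through to the main path */
    return 0.0;
}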
+; +; Prototype: +; +; double log10(double x); +; + +; +; Algorithm: +; Similar to one presnted in log.asm +; + +.const + +ALIGN 16 + +__real_ninf DQ 0fff0000000000000h ; -inf + DQ 0000000000000000h +__real_inf DQ 7ff0000000000000h ; +inf + DQ 0000000000000000h +__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN + DQ 0000000000000000h +__real_qnanbit DQ 0008000000000000h + DQ 0000000000000000h +__int_1023 DQ 00000000000003ffh + DQ 0000000000000000h +__mask_001 DQ 0000000000000001h + DQ 0000000000000000h + +__mask_mant DQ 000FFFFFFFFFFFFFh ; mask for mantissa bits + DQ 0000000000000000h + +__mask_mant_top8 DQ 000ff00000000000h ; mask for top 8 mantissa bits + DQ 0000000000000000h + +__mask_mant9 DQ 0000080000000000h ; mask for 9th mantissa bit + DQ 0000000000000000h + +__real_log10_e DQ 3fdbcb7b1526e50eh + DQ 0000000000000000h + +__real_log10_e_lead DQ 3fdbcb7800000000h ; log10e_lead 4.34293746948242187500e-01 + DQ 0000000000000000h +__real_log10_e_tail DQ 3ea8a93728719535h ; log10e_tail 7.3495500964015109100644e-7 + DQ 0000000000000000h + +__real_log10_2_lead DQ 3fd3441350000000h + DQ 0000000000000000h +__real_log10_2_tail DQ 3e03ef3fde623e25h + DQ 0000000000000000h + +__real_two DQ 4000000000000000h ; 2 + DQ 0000000000000000h + +__real_one DQ 3ff0000000000000h ; 1 + DQ 0000000000000000h + +__real_half DQ 3fe0000000000000h ; 1/2 + DQ 0000000000000000h + +__mask_100 DQ 0000000000000100h + DQ 0000000000000000h +__real_1_over_512 DQ 3f60000000000000h + DQ 0000000000000000h + +__real_1_over_2 DQ 3fe0000000000000h + DQ 0000000000000000h +__real_1_over_3 DQ 3fd5555555555555h + DQ 0000000000000000h +__real_1_over_4 DQ 3fd0000000000000h + DQ 0000000000000000h +__real_1_over_5 DQ 3fc999999999999ah + DQ 0000000000000000h +__real_1_over_6 DQ 3fc5555555555555h + DQ 0000000000000000h + +__real_neg_1023 DQ 0c08ff80000000000h + DQ 0000000000000000h + +__mask_2045 DQ 00000000000007fdh + DQ 0000000000000000h + +__real_threshold DQ 3fb0000000000000h ; .0625 + DQ 0000000000000000h + +__real_near_one_lt DQ 3fee000000000000h ; .9375 + DQ 0000000000000000h + +__real_near_one_gt DQ 3ff1000000000000h ; 1.0625 + DQ 0000000000000000h + +__real_min_norm DQ 0010000000000000h + DQ 0000000000000000h + +__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit + DQ 0000000000000000h + +__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02 + DQ 0000000000000000h +__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02 + DQ 0000000000000000h +__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03 + DQ 0000000000000000h +__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04 + DQ 0000000000000000h + +__mask_lower DQ 0ffffffff00000000h + DQ 0000000000000000h + +; these codes and the ones in the corresponding .c file have to match +__flag_x_zero DD 00000001 +__flag_x_neg DD 00000002 +__flag_x_nan DD 00000003 + + +EXTRN __log10_256_lead:QWORD +EXTRN __log10_256_tail:QWORD +EXTRN __log_F_inv_qword:QWORD +EXTRN __use_fma3_lib:DWORD + + +; local variable storage offsets +save_xmm6 EQU 20h +dummy_space EQU 30h +stack_size EQU 058h + +include fm.inc + +fname TEXTEQU +fname_special TEXTEQU <_log10_special> + +EXTERN fname_special:PROC + +.code +ALIGN 16 +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + SaveXmm xmm6, save_xmm6 + .ENDPROLOG + + cmp DWORD PTR __use_fma3_lib, 0 + jne Llog10_fma3 + +Llog10_sse2: + + ; compute exponent part + movapd xmm3, xmm0 + movapd xmm4, xmm0 + psrlq xmm3, 52 + movd rax, xmm0 + psubq xmm3, XMMWORD PTR __int_1023 ; xmm3 <-- unbiased exponent + + ; NaN or inf + movapd xmm5, xmm0 
+ andpd xmm5, XMMWORD PTR __real_inf + comisd xmm5, QWORD PTR __real_inf + je Llog10_sse2_x_is_inf_or_nan + + movapd xmm2, xmm0 + cvtdq2pd xmm6, xmm3 ; xmm6 <-- unbiased exp as double + + + pand xmm2, XMMWORD PTR __mask_mant + subsd xmm4, QWORD PTR __real_one + + comisd xmm6, QWORD PTR __real_neg_1023 + je Llog10_sse2_denormal_adjust + +Llog10_sse2_continue_common: + + andpd xmm4, XMMWORD PTR __real_notsign + ; compute index into the log tables + mov r9, rax + and rax, QWORD PTR __mask_mant_top8 + and r9, QWORD PTR __mask_mant9 + shl r9, 1 + add rax, r9 + movd xmm1, rax + + ; near one codepath + comisd xmm4, QWORD PTR __real_threshold + jb Llog10_sse2_near_one + + ; F, Y + shr rax, 44 + por xmm2, XMMWORD PTR __real_half + por xmm1, XMMWORD PTR __real_half + lea r9, QWORD PTR __log_F_inv_qword + + ; check for negative numbers or zero + xorpd xmm5, xmm5 + comisd xmm0, xmm5 + jbe Llog10_sse2_x_is_zero_or_neg + + ; f = F - Y, r = f * inv + subsd xmm1, xmm2 + mulsd xmm1, QWORD PTR [r9+rax*8] + + movapd xmm2, xmm1 + movapd xmm0, xmm1 + lea r9, QWORD PTR __log10_256_lead + + ; poly + movsd xmm3, QWORD PTR __real_1_over_6 + movsd xmm1, QWORD PTR __real_1_over_3 + mulsd xmm3, xmm2 + mulsd xmm1, xmm2 + mulsd xmm0, xmm2 + movapd xmm4, xmm0 + addsd xmm3, QWORD PTR __real_1_over_5 + addsd xmm1, QWORD PTR __real_1_over_2 + mulsd xmm4, xmm0 + mulsd xmm3, xmm2 + mulsd xmm1, xmm0 + addsd xmm3, QWORD PTR __real_1_over_4 + addsd xmm1, xmm2 + mulsd xmm3, xmm4 + addsd xmm1, xmm3 + + movsd xmm5, QWORD PTR __real_log10_2_tail + mulsd xmm1, QWORD PTR __real_log10_e + + ; m*log(10) + log10(G) - poly + mulsd xmm5, xmm6 + subsd xmm5, xmm1 + + movsd xmm0, QWORD PTR [r9+rax*8] + lea rdx, QWORD PTR __log10_256_tail + movsd xmm2, QWORD PTR [rdx+rax*8] + + movsd xmm4, QWORD PTR __real_log10_2_lead + mulsd xmm4, xmm6 + addsd xmm0, xmm4 + addsd xmm2, xmm5 + + addsd xmm0, xmm2 + + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Llog10_sse2_near_one: + + ; r = x - 1.0 + movsd xmm2, QWORD PTR __real_two + subsd xmm0, QWORD PTR __real_one ; r + + addsd xmm2, xmm0 + movapd xmm1, xmm0 + divsd xmm1, xmm2 ; r/(2+r) = u/2 + + movsd xmm4, QWORD PTR __real_ca2 + movsd xmm5, QWORD PTR __real_ca4 + + movapd xmm6, xmm0 + mulsd xmm6, xmm1 ; correction + + addsd xmm1, xmm1 ; u + movapd xmm2, xmm1 + + mulsd xmm2, xmm1 ; u^2 + + mulsd xmm4, xmm2 + mulsd xmm5, xmm2 + + addsd xmm4, QWORD PTR __real_ca1 + addsd xmm5, QWORD PTR __real_ca3 + + mulsd xmm2, xmm1 ; u^3 + mulsd xmm4, xmm2 + + mulsd xmm2, xmm2 + mulsd xmm2, xmm1 ; u^7 + mulsd xmm5, xmm2 + + movsd xmm2, QWORD PTR __real_log10_e_tail + addsd xmm4, xmm5 + subsd xmm4, xmm6 + movsd xmm6, QWORD PTR __real_log10_e_lead + + movapd xmm3, xmm0 + pand xmm3, XMMWORD PTR __mask_lower + subsd xmm0, xmm3 + addsd xmm4, xmm0 + + movapd xmm0, xmm3 + movapd xmm1, xmm4 + + mulsd xmm4, xmm2 + mulsd xmm0, xmm2 + mulsd xmm1, xmm6 + mulsd xmm3, xmm6 + + addsd xmm0, xmm4 + addsd xmm0, xmm1 + addsd xmm0, xmm3 + + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +Llog10_sse2_denormal_adjust: + por xmm2, XMMWORD PTR __real_one + subsd xmm2, QWORD PTR __real_one + movsd xmm5, xmm2 + pand xmm2, XMMWORD PTR __mask_mant + movd rax, xmm2 + psrlq xmm5, 52 + psubd xmm5, XMMWORD PTR __mask_2045 + cvtdq2pd xmm6, xmm5 + jmp Llog10_sse2_continue_common + +ALIGN 16 +Llog10_sse2_x_is_zero_or_neg: + jne Llog10_sse2_x_is_neg + + movsd xmm1, QWORD PTR __real_ninf + mov r8d, DWORD PTR __flag_x_zero + call fname_special + jmp Llog10_sse2_finish + +ALIGN 16 +Llog10_sse2_x_is_neg: + + 
movsd xmm1, QWORD PTR __real_neg_qnan + mov r8d, DWORD PTR __flag_x_neg + call fname_special + jmp Llog10_sse2_finish + +ALIGN 16 +Llog10_sse2_x_is_inf_or_nan: + + cmp rax, QWORD PTR __real_inf + je Llog10_sse2_finish + + cmp rax, QWORD PTR __real_ninf + je Llog10_sse2_x_is_neg + + or rax, QWORD PTR __real_qnanbit + movd xmm1, rax + mov r8d, DWORD PTR __flag_x_nan + call fname_special + jmp Llog10_sse2_finish + +ALIGN 16 +Llog10_sse2_finish: + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Llog10_fma3: + ; compute exponent part + xor rax,rax + vpsrlq xmm3,xmm0,52 + vmovq rax,xmm0 + vpsubq xmm3,xmm3,QWORD PTR __int_1023 + vcvtdq2pd xmm6,xmm3 ; xmm6 <-- (double)xexp + + ; NaN or Inf? + vpand xmm5,xmm0,__real_inf + vcomisd xmm5,QWORD PTR __real_inf + je Llog10_fma3_x_is_inf_or_nan + + ; negative number or zero? + vpxor xmm5,xmm5,xmm5 + vcomisd xmm0,xmm5 + jbe Llog10_fma3_x_is_zero_or_neg + + vpand xmm2,xmm0,__mask_mant + vsubsd xmm4,xmm0,QWORD PTR __real_one + + ; Subnormal? + vcomisd xmm6,QWORD PTR __real_neg_1023 + je Llog10_fma3_denormal_adjust + +Llog10_fma3_continue_common: + ; compute index into the log tables + vpand xmm1,xmm0,DWORD PTR __mask_mant_top8 + vpand xmm3,xmm0,DWORD PTR __mask_mant9 + vpsllq xmm3,xmm3,1 + vpaddq xmm1,xmm3,xmm1 + vmovq rax,xmm1 + + ; near one codepath + vpand xmm4,xmm4,DWORD PTR __real_notsign + vcomisd xmm4,QWORD PTR __real_threshold + jb Llog10_fma3_near_one + + ; F,Y + shr rax,44 + vpor xmm2,xmm2,DWORD PTR __real_half + vpor xmm1,xmm1,DWORD PTR __real_half + lea r9,DWORD PTR __log_F_inv_qword + + ; f = F - Y,r = f * inv + vsubsd xmm1,xmm1,xmm2 + vmulsd xmm1,xmm1,QWORD PTR [r9 + rax * 8] + + lea r9,DWORD PTR __log10_256_lead + + ; poly + vmulsd xmm0,xmm1,xmm1 ; r*r + vmovsd xmm3,QWORD PTR __real_1_over_6 + vmovsd xmm5,QWORD PTR __real_1_over_3 + vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5 + vfmadd213sd xmm5,xmm1,QWORD PTR __real_half ; 1/2+r*1/3 + movsd xmm4,xmm0 ; r*r + vfmadd213sd xmm3 ,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6) + + vmulsd xmm4,xmm0,xmm0 ; r*r*r*r + vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r + vfmadd231sd xmm1,xmm3,xmm4 + + vmulsd xmm1,xmm1,QWORD PTR __real_log10_e + ; m*log(2) + log(G) - poly*log10_e + vmovsd xmm5,QWORD PTR __real_log10_2_tail + vfmsub213sd xmm5,xmm6,xmm1 + + movsd xmm0,QWORD PTR [r9 + rax * 8] + lea rdx,DWORD PTR __log10_256_tail + movsd xmm2,QWORD PTR [rdx + rax * 8] + vaddsd xmm2,xmm2,xmm5 + + vfmadd231sd xmm0,xmm6,QWORD PTR __real_log10_2_lead + + vaddsd xmm0,xmm0,xmm2 + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + + +ALIGN 16 +Llog10_fma3_near_one: + ; r = x - 1.0 + vmovsd xmm2,QWORD PTR __real_two + vsubsd xmm0,xmm0,QWORD PTR __real_one ; r + + vaddsd xmm2,xmm2,xmm0 + vdivsd xmm1,xmm0,xmm2 ; r/(2+r) = u/2 + + vmovsd xmm4,QWORD PTR __real_ca2 + vmovsd xmm5,QWORD PTR __real_ca4 + + vmulsd xmm6,xmm0,xmm1 ; correction + vaddsd xmm1,xmm1,xmm1 ; u + + vmulsd xmm2,xmm1,xmm1 ; u^2 + vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 + vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 + + vmulsd xmm2,xmm2,xmm1 ; u^3 + vmulsd xmm4,xmm4,xmm2 + + vmulsd xmm2,xmm2,xmm2 + vmulsd xmm2,xmm2,xmm1 ; u^7 + + vmulsd xmm5,xmm5,xmm2 + vaddsd xmm4,xmm4,xmm5 + vsubsd xmm4,xmm4,xmm6 + vpand xmm3,xmm0,QWORD PTR __mask_lower + vsubsd xmm0,xmm0,xmm3 + vaddsd xmm4,xmm4,xmm0 + + vmulsd xmm1,xmm4,QWORD PTR __real_log10_e_lead + vmulsd xmm4,xmm4,QWORD PTR __real_log10_e_tail + vmulsd xmm0,xmm3,QWORD PTR __real_log10_e_tail + vmulsd xmm3,xmm3,QWORD PTR __real_log10_e_lead + + 
vaddsd xmm0,xmm0,xmm4 + vaddsd xmm0,xmm0,xmm1 + vaddsd xmm0,xmm0,xmm3 + + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + + +Llog10_fma3_denormal_adjust: + vpor xmm2,xmm2,QWORD PTR __real_one + vsubsd xmm2,xmm2,QWORD PTR __real_one + vpsrlq xmm5,xmm2,52 + vpand xmm2,xmm2,QWORD PTR __mask_mant + vmovapd xmm0,xmm2 + vpsubd xmm5,xmm5,DWORD PTR __mask_2045 + vcvtdq2pd xmm6,xmm5 + jmp Llog10_fma3_continue_common + +ALIGN 16 +Llog10_fma3_x_is_zero_or_neg: + jne Llog10_fma3_x_is_neg + vmovsd xmm1,QWORD PTR __real_ninf + mov r8d,DWORD PTR __flag_x_zero + call fname_special + + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + + +ALIGN 16 +Llog10_fma3_x_is_neg: + + vmovsd xmm1,QWORD PTR __real_neg_qnan + mov r8d,DWORD PTR __flag_x_neg + call fname_special + + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + + +ALIGN 16 +Llog10_fma3_x_is_inf_or_nan: + + cmp rax,QWORD PTR __real_inf + je Llog10_fma3_finish + + cmp rax,QWORD PTR __real_ninf + je Llog10_fma3_x_is_neg + + or rax,QWORD PTR __real_qnanbit + movd xmm1,rax + mov r8d,DWORD PTR __flag_x_nan + call fname_special + jmp Llog10_fma3_finish + +ALIGN 16 +Llog10_fma3_finish: + + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret +fname endp + +END + diff --git a/sdk/lib/crt/math/libm_sse2/log10_128_lead_tail_table.asm b/sdk/lib/crt/math/libm_sse2/log10_128_lead_tail_table.asm new file mode 100644 index 00000000000..bf5ee104f10 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log10_128_lead_tail_table.asm @@ -0,0 +1,297 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
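One detail worth calling out in the log10 near-one path above is how the multiplication by log10(e) is carried out: both the constant (__real_log10_e_lead/__real_log10_e_tail) and the operand (split with __mask_lower into a high part whose low 32 bits are zero and an exact low remainder) are split, and the partial products are summed smallest-first. A minimal sketch of that trick, with the constant values copied from the table above and a hypothetical function name:

static double times_log10e_sketch(double v)
{
    static const double log10e_lead = 4.34293746948242187500e-01;   /* __real_log10_e_lead */
    static const double log10e_tail = 7.3495500964015109100644e-07; /* __real_log10_e_tail */
    unsigned long uv;   /* 64 bits wide */
    double v_hi, v_lo;

    GET_BITS_DP64(v, uv);
    PUT_BITS_DP64(uv & 0xffffffff00000000, v_hi);   /* __mask_lower: clear the low 32 bits */
    v_lo = v - v_hi;                                /* exact remainder */

    /* accumulate the small products first, the dominant one last */
    return ((v_lo * log10e_tail + v_hi * log10e_tail) + v_lo * log10e_lead)
           + v_hi * log10e_lead;
}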
+; +;; +;; Defines __log_128_lead and __log_128_tail tables +;; Used by log and pow +;; + +.const + +ALIGN 16 +PUBLIC __log10_128_lead +__log10_128_lead: + DD 00000000h + DD 3b5d4000h + DD 3bdc8000h + DD 3c24c000h + DD 3c5ac000h + DD 3c884000h + DD 3ca2c000h + DD 3cbd4000h + DD 3cd78000h + DD 3cf1c000h + DD 3d05c000h + DD 3d128000h + DD 3d1f4000h + DD 3d2c0000h + DD 3d388000h + DD 3d450000h + DD 3d518000h + DD 3d5dc000h + DD 3d6a0000h + DD 3d760000h + DD 3d810000h + DD 3d870000h + DD 3d8d0000h + DD 3d92c000h + DD 3d98c000h + DD 3d9e8000h + DD 3da44000h + DD 3daa0000h + DD 3dafc000h + DD 3db58000h + DD 3dbb4000h + DD 3dc0c000h + DD 3dc64000h + DD 3dcc0000h + DD 3dd18000h + DD 3dd6c000h + DD 3ddc4000h + DD 3de1c000h + DD 3de70000h + DD 3dec8000h + DD 3df1c000h + DD 3df70000h + DD 3dfc4000h + DD 3e00c000h + DD 3e034000h + DD 3e05c000h + DD 3e088000h + DD 3e0b0000h + DD 3e0d8000h + DD 3e100000h + DD 3e128000h + DD 3e150000h + DD 3e178000h + DD 3e1a0000h + DD 3e1c8000h + DD 3e1ec000h + DD 3e214000h + DD 3e23c000h + DD 3e260000h + DD 3e288000h + DD 3e2ac000h + DD 3e2d4000h + DD 3e2f8000h + DD 3e31c000h + DD 3e344000h + DD 3e368000h + DD 3e38c000h + DD 3e3b0000h + DD 3e3d4000h + DD 3e3fc000h + DD 3e420000h + DD 3e440000h + DD 3e464000h + DD 3e488000h + DD 3e4ac000h + DD 3e4d0000h + DD 3e4f4000h + DD 3e514000h + DD 3e538000h + DD 3e55c000h + DD 3e57c000h + DD 3e5a0000h + DD 3e5c0000h + DD 3e5e4000h + DD 3e604000h + DD 3e624000h + DD 3e648000h + DD 3e668000h + DD 3e688000h + DD 3e6ac000h + DD 3e6cc000h + DD 3e6ec000h + DD 3e70c000h + DD 3e72c000h + DD 3e74c000h + DD 3e76c000h + DD 3e78c000h + DD 3e7ac000h + DD 3e7cc000h + DD 3e7ec000h + DD 3e804000h + DD 3e814000h + DD 3e824000h + DD 3e834000h + DD 3e840000h + DD 3e850000h + DD 3e860000h + DD 3e870000h + DD 3e880000h + DD 3e88c000h + DD 3e89c000h + DD 3e8ac000h + DD 3e8bc000h + DD 3e8c8000h + DD 3e8d8000h + DD 3e8e8000h + DD 3e8f4000h + DD 3e904000h + DD 3e914000h + DD 3e920000h + DD 3e930000h + DD 3e93c000h + DD 3e94c000h + DD 3e958000h + DD 3e968000h + DD 3e978000h + DD 3e984000h + DD 3e994000h + DD 3e9a0000h + +ALIGN 16 +PUBLIC __log10_128_tail +__log10_128_tail: + DD 00000000h + DD 367a8e44h + DD 368ed49fh + DD 36c21451h + DD 375211d6h + DD 3720ea11h + DD 37e9eb59h + DD 37b87be7h + DD 37bf2560h + DD 33d597a0h + DD 37806a05h + DD 3820581fh + DD 38223334h + DD 378e3bach + DD 3810684fh + DD 37feb7aeh + DD 36a9d609h + DD 37a68163h + DD 376a8b27h + DD 384c8fd6h + DD 3885183eh + DD 3874a760h + DD 380d1154h + DD 38ea42bdh + DD 384c1571h + DD 38ba66b8h + DD 38e7da3bh + DD 38eee632h + DD 38d00911h + DD 388bbedeh + DD 378a0512h + DD 3894c7a0h + DD 38e30710h + DD 36db2829h + DD 3729d609h + DD 38fa0e82h + DD 38bc9a75h + DD 383a9297h + DD 38dc83c8h + DD 37eac335h + DD 38706ac3h + DD 389574c2h + DD 3892d068h + DD 38615032h + DD 3917acf4h + DD 3967a126h + DD 38217840h + DD 38b420abh + DD 38f9c7b2h + DD 391103bdh + DD 39169a6bh + DD 390dd194h + DD 38eda471h + DD 38a38950h + DD 37f6844ah + DD 395e1cdbh + DD 390fcffch + DD 38503e9dh + DD 394b00fdh + DD 38a9910ah + DD 39518a31h + DD 3882d2c2h + DD 392488e4h + DD 397b0affh + DD 388a22d8h + DD 3902bd5eh + DD 39342f85h + DD 39598811h + DD 3972e6b1h + DD 34d53654h + DD 360ca25eh + DD 39785cc0h + DD 39630710h + DD 39424ed7h + DD 39165101h + DD 38be5421h + DD 37e7b0c0h + DD 394fd0c3h + DD 38efaaaah + DD 37a8f566h + DD 3927c744h + DD 383fa4d5h + DD 392d9e39h + DD 3803feaeh + DD 390a268ch + DD 39692b80h + DD 38789b4fh + DD 3909307dh + DD 394a601ch + DD 35e67edch + DD 383e386dh + DD 38a7743dh + DD 38dccec3h + DD 38ff57e0h 
+ DD 39079d8bh + DD 390651a6h + DD 38f7bad9h + DD 38d0ab82h + DD 38979e7dh + DD 381978eeh + DD 397816c8h + DD 39410cb2h + DD 39015384h + DD 3863fa28h + DD 39f41065h + DD 39c7668ah + DD 39968afah + DD 39430db9h + DD 38a18cf3h + DD 39eb2907h + DD 39a9e10ch + DD 39492800h + DD 385a53d1h + DD 39ce0cf7h + DD 3979c7b2h + DD 389f5d99h + DD 39ceefcbh + DD 39646a39h + DD 380d7a9bh + DD 39ad6650h + DD 390ac3b8h + DD 39d9a9a8h + DD 39548a99h + DD 39f73c4bh + DD 3980960eh + DD 374b3d5ah + DD 39888f1eh + DD 37679a07h + DD 39826a13h +END diff --git a/sdk/lib/crt/math/libm_sse2/log10_256_lead_tail_table.asm b/sdk/lib/crt/math/libm_sse2/log10_256_lead_tail_table.asm new file mode 100644 index 00000000000..6f451cfc7c2 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log10_256_lead_tail_table.asm @@ -0,0 +1,552 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
+; +;; +;; Defines __log_256_lead and __log_256_tail tables +;; Used by log and pow +;; + +.const + +ALIGN 16 +PUBLIC __log10_256_lead +__log10_256_lead: + DQ 0000000000000000h + DQ 3f5bbd9e90000000h + DQ 3f6bafd470000000h + DQ 3f74b99560000000h + DQ 3f7b9476a0000000h + DQ 3f81344da0000000h + DQ 3f849b0850000000h + DQ 3f87fe71c0000000h + DQ 3f8b5e9080000000h + DQ 3f8ebb6af0000000h + DQ 3f910a83a0000000h + DQ 3f92b5b5e0000000h + DQ 3f945f4f50000000h + DQ 3f96075300000000h + DQ 3f97adc3d0000000h + DQ 3f9952a4f0000000h + DQ 3f9af5f920000000h + DQ 3f9c97c370000000h + DQ 3f9e3806a0000000h + DQ 3f9fd6c5b0000000h + DQ 3fa0ba01a0000000h + DQ 3fa187e120000000h + DQ 3fa25502c0000000h + DQ 3fa32167c0000000h + DQ 3fa3ed1190000000h + DQ 3fa4b80180000000h + DQ 3fa58238e0000000h + DQ 3fa64bb910000000h + DQ 3fa7148340000000h + DQ 3fa7dc98c0000000h + DQ 3fa8a3fad0000000h + DQ 3fa96aaac0000000h + DQ 3faa30a9d0000000h + DQ 3faaf5f920000000h + DQ 3fabba9a00000000h + DQ 3fac7e8d90000000h + DQ 3fad41d510000000h + DQ 3fae0471a0000000h + DQ 3faec66470000000h + DQ 3faf87aeb0000000h + DQ 3fb02428c0000000h + DQ 3fb08426f0000000h + DQ 3fb0e3d290000000h + DQ 3fb1432c30000000h + DQ 3fb1a23440000000h + DQ 3fb200eb60000000h + DQ 3fb25f5210000000h + DQ 3fb2bd68e0000000h + DQ 3fb31b3050000000h + DQ 3fb378a8e0000000h + DQ 3fb3d5d330000000h + DQ 3fb432afa0000000h + DQ 3fb48f3ed0000000h + DQ 3fb4eb8120000000h + DQ 3fb5477730000000h + DQ 3fb5a32160000000h + DQ 3fb5fe8040000000h + DQ 3fb6599440000000h + DQ 3fb6b45df0000000h + DQ 3fb70eddb0000000h + DQ 3fb7691400000000h + DQ 3fb7c30160000000h + DQ 3fb81ca630000000h + DQ 3fb8760300000000h + DQ 3fb8cf1830000000h + DQ 3fb927e640000000h + DQ 3fb9806d90000000h + DQ 3fb9d8aea0000000h + DQ 3fba30a9d0000000h + DQ 3fba885fa0000000h + DQ 3fbadfd070000000h + DQ 3fbb36fcb0000000h + DQ 3fbb8de4d0000000h + DQ 3fbbe48930000000h + DQ 3fbc3aea40000000h + DQ 3fbc910870000000h + DQ 3fbce6e410000000h + DQ 3fbd3c7da0000000h + DQ 3fbd91d580000000h + DQ 3fbde6ec00000000h + DQ 3fbe3bc1a0000000h + DQ 3fbe9056b0000000h + DQ 3fbee4aba0000000h + DQ 3fbf38c0c0000000h + DQ 3fbf8c9680000000h + DQ 3fbfe02d30000000h + DQ 3fc019c2a0000000h + DQ 3fc0434f70000000h + DQ 3fc06cbd60000000h + DQ 3fc0960c80000000h + DQ 3fc0bf3d00000000h + DQ 3fc0e84f10000000h + DQ 3fc11142f0000000h + DQ 3fc13a18a0000000h + DQ 3fc162d080000000h + DQ 3fc18b6a90000000h + DQ 3fc1b3e710000000h + DQ 3fc1dc4630000000h + DQ 3fc2048810000000h + DQ 3fc22cace0000000h + DQ 3fc254b4d0000000h + DQ 3fc27c9ff0000000h + DQ 3fc2a46e80000000h + DQ 3fc2cc20b0000000h + DQ 3fc2f3b690000000h + DQ 3fc31b3050000000h + DQ 3fc3428e20000000h + DQ 3fc369d020000000h + DQ 3fc390f680000000h + DQ 3fc3b80160000000h + DQ 3fc3def0e0000000h + DQ 3fc405c530000000h + DQ 3fc42c7e70000000h + DQ 3fc4531cd0000000h + DQ 3fc479a070000000h + DQ 3fc4a00970000000h + DQ 3fc4c65800000000h + DQ 3fc4ec8c30000000h + DQ 3fc512a640000000h + DQ 3fc538a630000000h + DQ 3fc55e8c50000000h + DQ 3fc5845890000000h + DQ 3fc5aa0b40000000h + DQ 3fc5cfa470000000h + DQ 3fc5f52440000000h + DQ 3fc61a8ad0000000h + DQ 3fc63fd850000000h + DQ 3fc6650cd0000000h + DQ 3fc68a2880000000h + DQ 3fc6af2b80000000h + DQ 3fc6d415e0000000h + DQ 3fc6f8e7d0000000h + DQ 3fc71da170000000h + DQ 3fc74242e0000000h + DQ 3fc766cc40000000h + DQ 3fc78b3da0000000h + DQ 3fc7af9730000000h + DQ 3fc7d3d910000000h + DQ 3fc7f80350000000h + DQ 3fc81c1620000000h + DQ 3fc8401190000000h + DQ 3fc863f5c0000000h + DQ 3fc887c2e0000000h + DQ 3fc8ab7900000000h + DQ 3fc8cf1830000000h + DQ 3fc8f2a0a0000000h + DQ 3fc9161270000000h + DQ 
3fc9396db0000000h + DQ 3fc95cb280000000h + DQ 3fc97fe100000000h + DQ 3fc9a2f950000000h + DQ 3fc9c5fb70000000h + DQ 3fc9e8e7b0000000h + DQ 3fca0bbdf0000000h + DQ 3fca2e7e80000000h + DQ 3fca512960000000h + DQ 3fca73bea0000000h + DQ 3fca963e70000000h + DQ 3fcab8a8f0000000h + DQ 3fcadafe20000000h + DQ 3fcafd3e30000000h + DQ 3fcb1f6930000000h + DQ 3fcb417f40000000h + DQ 3fcb638070000000h + DQ 3fcb856cf0000000h + DQ 3fcba744b0000000h + DQ 3fcbc907f0000000h + DQ 3fcbeab6c0000000h + DQ 3fcc0c5130000000h + DQ 3fcc2dd750000000h + DQ 3fcc4f4950000000h + DQ 3fcc70a740000000h + DQ 3fcc91f130000000h + DQ 3fccb32740000000h + DQ 3fccd44980000000h + DQ 3fccf55810000000h + DQ 3fcd165300000000h + DQ 3fcd373a60000000h + DQ 3fcd580e60000000h + DQ 3fcd78cf00000000h + DQ 3fcd997c70000000h + DQ 3fcdba16a0000000h + DQ 3fcdda9dd0000000h + DQ 3fcdfb11f0000000h + DQ 3fce1b7330000000h + DQ 3fce3bc1a0000000h + DQ 3fce5bfd50000000h + DQ 3fce7c2660000000h + DQ 3fce9c3ce0000000h + DQ 3fcebc40e0000000h + DQ 3fcedc3280000000h + DQ 3fcefc11d0000000h + DQ 3fcf1bdee0000000h + DQ 3fcf3b99d0000000h + DQ 3fcf5b42a0000000h + DQ 3fcf7ad980000000h + DQ 3fcf9a5e70000000h + DQ 3fcfb9d190000000h + DQ 3fcfd932f0000000h + DQ 3fcff882a0000000h + DQ 3fd00be050000000h + DQ 3fd01b76a0000000h + DQ 3fd02b0430000000h + DQ 3fd03a8910000000h + DQ 3fd04a0540000000h + DQ 3fd05978e0000000h + DQ 3fd068e3f0000000h + DQ 3fd0784670000000h + DQ 3fd087a080000000h + DQ 3fd096f210000000h + DQ 3fd0a63b30000000h + DQ 3fd0b57bf0000000h + DQ 3fd0c4b450000000h + DQ 3fd0d3e460000000h + DQ 3fd0e30c30000000h + DQ 3fd0f22bc0000000h + DQ 3fd1014310000000h + DQ 3fd1105240000000h + DQ 3fd11f5940000000h + DQ 3fd12e5830000000h + DQ 3fd13d4f00000000h + DQ 3fd14c3dd0000000h + DQ 3fd15b24a0000000h + DQ 3fd16a0370000000h + DQ 3fd178da50000000h + DQ 3fd187a940000000h + DQ 3fd1967060000000h + DQ 3fd1a52fa0000000h + DQ 3fd1b3e710000000h + DQ 3fd1c296c0000000h + DQ 3fd1d13eb0000000h + DQ 3fd1dfdef0000000h + DQ 3fd1ee7770000000h + DQ 3fd1fd0860000000h + DQ 3fd20b91a0000000h + DQ 3fd21a1350000000h + DQ 3fd2288d70000000h + DQ 3fd2370010000000h + DQ 3fd2456b30000000h + DQ 3fd253ced0000000h + DQ 3fd2622b00000000h + DQ 3fd2707fd0000000h + DQ 3fd27ecd40000000h + DQ 3fd28d1360000000h + DQ 3fd29b5220000000h + DQ 3fd2a989a0000000h + DQ 3fd2b7b9e0000000h + DQ 3fd2c5e2e0000000h + DQ 3fd2d404b0000000h + DQ 3fd2e21f50000000h + DQ 3fd2f032c0000000h + DQ 3fd2fe3f20000000h + DQ 3fd30c4470000000h + DQ 3fd31a42b0000000h + DQ 3fd32839e0000000h + DQ 3fd3362a10000000h + DQ 3fd3441350000000h +ALIGN 16 +PUBLIC __log10_256_tail +__log10_256_tail: + DQ 0000000000000000h + DQ 3db20abc22b2208fh + DQ 3db10f69332e0dd4h + DQ 3dce950de87ed257h + DQ 3dd3f3443b626d69h + DQ 3df45aeaa5363e57h + DQ 3dc443683ce1bf0bh + DQ 3df989cd60c6a511h + DQ 3dfd626f201f2e9fh + DQ 3de94f8bb8dabdcdh + DQ 3e0088d8ef423015h + DQ 3e080413a62b79adh + DQ 3e059717c0eed3c4h + DQ 3dad4a77add44902h + DQ 3e0e763ff037300eh + DQ 3de162d74706f6c3h + DQ 3e0601cc1f4dbc14h + DQ 3deaf3e051f6e5bfh + DQ 3e097a0b1e1af3ebh + DQ 3dc0a38970c002c7h + DQ 3e102e000057c751h + DQ 3e155b00eecd6e0eh + DQ 3ddf86297003b5afh + DQ 3e1057b9b336a36dh + DQ 3e134bc84a06ea4fh + DQ 3e1643da9ea1bcadh + DQ 3e1d66a7b4f7ea2ah + DQ 3df6b2e038f7fcefh + DQ 3df3e954c670f088h + DQ 3e047209093acab3h + DQ 3e1d708fe7275da7h + DQ 3e1fdf9e7771b9e7h + DQ 3e0827bfa70a0660h + DQ 3e1601cc1f4dbc14h + DQ 3e0637f6106a5e5bh + DQ 3e126a13f17c624bh + DQ 3e093eb2ce80623ah + DQ 3e1430d1e91594deh + DQ 3e1d6b10108fa031h + DQ 3e16879c0bbaf241h + DQ 3dff08015ea6bc2bh + DQ 3e29b63dcdc6676ch + DQ 
3e2b022cbcc4ab2ch + DQ 3df917d07ddd6544h + DQ 3e1540605703379eh + DQ 3e0cd18b947a1b60h + DQ 3e17ad65277ca97eh + DQ 3e11884dc59f5fa9h + DQ 3e1711c46006d082h + DQ 3e2f092e3c3108f8h + DQ 3e1714c5e32be13ah + DQ 3e26bba7fd734f9ah + DQ 3dfdf48fb5e08483h + DQ 3e232f9bc74d0b95h + DQ 3df973e848790c13h + DQ 3e1eccbc08c6586eh + DQ 3e2115e9f9524a98h + DQ 3e2f1740593131b8h + DQ 3e1bcf8b25643835h + DQ 3e1f5fa81d8bed80h + DQ 3e244a4df929d9e4h + DQ 3e129820d8220c94h + DQ 3e2a0b489304e309h + DQ 3e1f4d56aba665feh + DQ 3e210c9019365163h + DQ 3df80f78fe592736h + DQ 3e10528825c81ccah + DQ 3de095537d6d746ah + DQ 3e1827bfa70a0660h + DQ 3e06b0a8ec45933ch + DQ 3e105af81bf5dba9h + DQ 3e17e2fa2655d515h + DQ 3e0d59ecbfaee4bfh + DQ 3e1d8b2fda683fa3h + DQ 3e24b8ddfd3a3737h + DQ 3e13827e61ae1204h + DQ 3e2c8c7b49e90f9fh + DQ 3e29eaf01597591dh + DQ 3e19aaa66e317b36h + DQ 3e2e725609720655h + DQ 3e261c33fc7aac54h + DQ 3e29662bcf61a252h + DQ 3e1843c811c42730h + DQ 3e2064bb0b5acb36h + DQ 3e0a340c842701a4h + DQ 3e1a8e55b58f79d6h + DQ 3de92d219c5e9d9ah + DQ 3e3f63e60d7ffd6ah + DQ 3e2e9b0ed9516314h + DQ 3e2923901962350ch + DQ 3e326f8838785e81h + DQ 3e3b5b6a4caba6afh + DQ 3df0226adc8e761ch + DQ 3e3c4ad7313a1aedh + DQ 3e1564e87c738d17h + DQ 3e338fecf18a6618h + DQ 3e3d929ef5777666h + DQ 3e39483bf08da0b8h + DQ 3e3bdd0eeeaa5826h + DQ 3e39c4dd590237bah + DQ 3e1af3e9e0ebcac7h + DQ 3e35ce5382270dach + DQ 3e394f74532ab9bah + DQ 3e07342795888654h + DQ 3e0c5a000be34bf0h + DQ 3e2711c46006d082h + DQ 3e250025b4ed8cf8h + DQ 3e2ed18bcef2d2a0h + DQ 3e21282e0c0a7554h + DQ 3e0d70f33359a7cah + DQ 3e2b7f7e13a84025h + DQ 3e33306ec321891eh + DQ 3e3fc7f8038b7550h + DQ 3e3eb0358cd71d64h + DQ 3e3a76c822859474h + DQ 3e3d0ec652de86e3h + DQ 3e2fa4cce08658afh + DQ 3e3b84a2d2c00a9eh + DQ 3e20a5b0f2c25bd1h + DQ 3e3dd660225bf699h + DQ 3e08b10f859bf037h + DQ 3e3e8823b590cbe1h + DQ 3e361311f31e96f6h + DQ 3e2e1f875ca20f9ah + DQ 3e2c95724939b9a5h + DQ 3e3805957a3e58e2h + DQ 3e2ff126ea9f0334h + DQ 3e3953f5598e5609h + DQ 3e36c16ff856c448h + DQ 3e24cb220ff261f4h + DQ 3e35e120d53d53a2h + DQ 3e3a527f6189f256h + DQ 3e3856fcffd49c0fh + DQ 3e300c2e8228d7dah + DQ 3df113d09444dfe0h + DQ 3e2510630eea59a6h + DQ 3e262e780f32d711h + DQ 3ded3ed91a10f8cfh + DQ 3e23654a7e4bcd85h + DQ 3e055b784980ad21h + DQ 3e212f2dd4b16e64h + DQ 3e37c4add939f50ch + DQ 3e281784627180fch + DQ 3dea5162c7e14961h + DQ 3e310c9019365163h + DQ 3e373c4d2ba17688h + DQ 3e2ae8a5e0e93d81h + DQ 3e2ab0c6f01621afh + DQ 3e301e8b74dd5b66h + DQ 3e2d206fecbb5494h + DQ 3df0b48b724fcc00h + DQ 3e3f831f0b61e229h + DQ 3df81a97c407bcafh + DQ 3e3e286c1ccbb7aah + DQ 3e28630b49220a93h + DQ 3dff0b15c1a22c5ch + DQ 3e355445e71c0946h + DQ 3e3be630f8066d85h + DQ 3e2599dff0d96c39h + DQ 3e36cc85b18fb081h + DQ 3e34476d001ea8c8h + DQ 3e373f889e16d31fh + DQ 3e3357100d792a87h + DQ 3e3bd179ae6101f6h + DQ 3e0ca31056c3f6e2h + DQ 3e3d2870629c08fbh + DQ 3e3aba3880d2673fh + DQ 3e2c3633cb297da6h + DQ 3e21843899efea02h + DQ 3e3bccc99d2008e6h + DQ 3e38000544bdd350h + DQ 3e2b91c226606ae1h + DQ 3e2a7adf26b62bdfh + DQ 3e18764fc8826ec9h + DQ 3e1f4f3de50f68f0h + DQ 3df760ca757995e3h + DQ 3dfc667ed3805147h + DQ 3e3733f6196adf6fh + DQ 3e2fb710f33e836bh + DQ 3e39886eba641013h + DQ 3dfb5368d0af8c1ah + DQ 3e358c691b8d2971h + DQ 3dfe9465226d08fbh + DQ 3e33587e063f0097h + DQ 3e3618e702129f18h + DQ 3e361c33fc7aac54h + DQ 3e3f07a68408604ah + DQ 3e3c34bfe4945421h + DQ 3e38b1f00e41300bh + DQ 3e3f434284d61b63h + DQ 3e3a63095e397436h + DQ 3e34428656b919deh + DQ 3e36ca9201b2d9a6h + DQ 3e2738823a2a931ch + DQ 3e3c11880e179230h + DQ 3e313ddc8d6d52feh + DQ 
3e33eed58922e917h + DQ 3e295992846bdd50h + DQ 3e0ddb4d5f2e278bh + DQ 3df1a5f12a0635c4h + DQ 3e4642f0882c3c34h + DQ 3e2aee9ba7f6475eh + DQ 3e264b7f834a60e4h + DQ 3e290d42e243792eh + DQ 3e4c272008134f01h + DQ 3e4a782e16d6cf5bh + DQ 3e44505c79da6648h + DQ 3e4ca9d4ea4dcd21h + DQ 3e297d3d627cd5bch + DQ 3e20b15cf9bcaa13h + DQ 3e315b2063cf76ddh + DQ 3e2983e6f3aa2748h + DQ 3e3f4c64f4ffe994h + DQ 3e46beba7ce85a0fh + DQ 3e3b9c69fd4ea6b8h + DQ 3e2b6aa5835fa4abh + DQ 3e43ccc3790fedd1h + DQ 3e29c04cc4404fe0h + DQ 3e40734b7a75d89dh + DQ 3e1b4404c4e01612h + DQ 3e40c565c2ce4894h + DQ 3e33c71441d935cdh + DQ 3d72a492556b3b4eh + DQ 3e20fa090341dc43h + DQ 3e2e8f7009e3d9f4h + DQ 3e4b1bf68b048a45h + DQ 3e3eee52dffaa956h + DQ 3e456b0900e465bdh + DQ 3e4d929ef5777666h + DQ 3e486ea28637e260h + DQ 3e4665aff10ca2f0h + DQ 3e2f11fdaf48ec74h + DQ 3e4cbe1b86a4d1c7h + DQ 3e25b05bfea87665h + DQ 3e41cec20a1a4a1dh + DQ 3e41cd5f0a409b9fh + DQ 3e453656c8265070h + DQ 3e377ed835282260h + DQ 3e2417bc3040b9d2h + DQ 3e408eef7b79eff2h + DQ 3e4dc76f39dc57e9h + DQ 3e4c0493a70cf457h + DQ 3e4a83d6cea5a60ch + DQ 3e30d6700dc557bah + DQ 3e44c96c12e8bd0ah + DQ 3e3d2c1993e32315h + DQ 3e22c721135f8242h + DQ 3e279a3e4dda747dh + DQ 3dfcf89f6941a72bh + DQ 3e2149a702f10831h + DQ 3e4ead4b7c8175dbh + DQ 3e4e6930fe63e70ah + DQ 3e41e106bed9ee2fh + DQ 3e2d682b82f11c92h + DQ 3e3a07f188dba47ch + DQ 3e40f9342dc172f6h + DQ 3e03ef3fde623e25h +END diff --git a/sdk/lib/crt/math/libm_sse2/log_128_lead_tail_table.asm b/sdk/lib/crt/math/libm_sse2/log_128_lead_tail_table.asm new file mode 100644 index 00000000000..b968b1ed87a --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log_128_lead_tail_table.asm @@ -0,0 +1,294 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
+; +;; Defines __log_128_lead and __log_128_tail tables +;; Used by log and pow +;; + +.const + +ALIGN 16 +PUBLIC __log_128_lead +__log_128_lead DD 000000000h + DD 03bff0000h + DD 03c7e0000h + DD 03cbdc000h + DD 03cfc1000h + DD 03d1cf000h + DD 03d3ba000h + DD 03d5a1000h + DD 03d785000h + DD 03d8b2000h + DD 03d9a0000h + DD 03da8d000h + DD 03db78000h + DD 03dc61000h + DD 03dd49000h + DD 03de2f000h + DD 03df13000h + DD 03dff6000h + DD 03e06b000h + DD 03e0db000h + DD 03e14a000h + DD 03e1b8000h + DD 03e226000h + DD 03e293000h + DD 03e2ff000h + DD 03e36b000h + DD 03e3d5000h + DD 03e43f000h + DD 03e4a9000h + DD 03e511000h + DD 03e579000h + DD 03e5e1000h + DD 03e647000h + DD 03e6ae000h + DD 03e713000h + DD 03e778000h + DD 03e7dc000h + DD 03e820000h + DD 03e851000h + DD 03e882000h + DD 03e8b3000h + DD 03e8e4000h + DD 03e914000h + DD 03e944000h + DD 03e974000h + DD 03e9a3000h + DD 03e9d3000h + DD 03ea02000h + DD 03ea30000h + DD 03ea5f000h + DD 03ea8d000h + DD 03eabb000h + DD 03eae8000h + DD 03eb16000h + DD 03eb43000h + DD 03eb70000h + DD 03eb9c000h + DD 03ebc9000h + DD 03ebf5000h + DD 03ec21000h + DD 03ec4d000h + DD 03ec78000h + DD 03eca3000h + DD 03ecce000h + DD 03ecf9000h + DD 03ed24000h + DD 03ed4e000h + DD 03ed78000h + DD 03eda2000h + DD 03edcc000h + DD 03edf5000h + DD 03ee1e000h + DD 03ee47000h + DD 03ee70000h + DD 03ee99000h + DD 03eec1000h + DD 03eeea000h + DD 03ef12000h + DD 03ef3a000h + DD 03ef61000h + DD 03ef89000h + DD 03efb0000h + DD 03efd7000h + DD 03effe000h + DD 03f012000h + DD 03f025000h + DD 03f039000h + DD 03f04c000h + DD 03f05f000h + DD 03f072000h + DD 03f084000h + DD 03f097000h + DD 03f0aa000h + DD 03f0bc000h + DD 03f0cf000h + DD 03f0e1000h + DD 03f0f4000h + DD 03f106000h + DD 03f118000h + DD 03f12a000h + DD 03f13c000h + DD 03f14e000h + DD 03f160000h + DD 03f172000h + DD 03f183000h + DD 03f195000h + DD 03f1a7000h + DD 03f1b8000h + DD 03f1c9000h + DD 03f1db000h + DD 03f1ec000h + DD 03f1fd000h + DD 03f20e000h + DD 03f21f000h + DD 03f230000h + DD 03f241000h + DD 03f252000h + DD 03f263000h + DD 03f273000h + DD 03f284000h + DD 03f295000h + DD 03f2a5000h + DD 03f2b5000h + DD 03f2c6000h + DD 03f2d6000h + DD 03f2e6000h + DD 03f2f7000h + DD 03f307000h + DD 03f317000h + +ALIGN 16 +PUBLIC __log_128_tail +__log_128_tail DD 000000000h + DD 03429ac41h + DD 035a8b0fch + DD 0368d83eah + DD 0361b0e78h + DD 03687b9feh + DD 03631ec65h + DD 036dd7119h + DD 035c30045h + DD 0379b7751h + DD 037ebcb0dh + DD 037839f83h + DD 037528ae5h + DD 037a2eb18h + DD 036da7495h + DD 036a91eb7h + DD 03783b715h + DD 0371131dbh + DD 0383f3e68h + DD 038156a97h + DD 038297c0fh + DD 0387e100fh + DD 03815b665h + DD 037e5e3a1h + DD 038183853h + DD 035fe719dh + DD 038448108h + DD 038503290h + DD 0373539e8h + DD 0385e0ff1h + DD 03864a740h + DD 03786742dh + DD 0387be3cdh + DD 03685ad3eh + DD 03803b715h + DD 037adcbdch + DD 0380c36afh + DD 0371652d3h + DD 038927139h + DD 038c5fcd7h + DD 038ae55d5h + DD 03818c169h + DD 038a0fde7h + DD 038ad09efh + DD 03862bae1h + DD 038eecd4ch + DD 03798aad2h + DD 037421a1ah + DD 038c5e10eh + DD 037bf2aeeh + DD 0382d872dh + DD 037ee2e8ah + DD 038dedfach + DD 03802f2b9h + DD 038481e9bh + DD 0380eaa2bh + DD 038ebfb5dh + DD 038255fddh + DD 038783b82h + DD 03851da1eh + DD 0374e1b05h + DD 0388f439bh + DD 038ca0e10h + DD 038cac08bh + DD 03891f65fh + DD 0378121cbh + DD 0386c9a9ah + DD 038949923h + DD 038777bcch + DD 037b12d26h + DD 038a6ced3h + DD 038ebd3e6h + DD 038fbe3cdh + DD 038d785c2h + DD 0387e7e00h + DD 038f392c5h + DD 037d40983h + DD 038081a7ch + DD 03784c3adh + DD 038cce923h + DD 0380f5fafh + 
DD 03891fd38h + DD 038ac47bch + DD 03897042bh + DD 0392952d2h + DD 0396fced4h + DD 037f97073h + DD 0385e9eaeh + DD 03865c84ah + DD 038130ba3h + DD 03979cf16h + DD 03938cac9h + DD 038c3d2f4h + DD 039755dech + DD 038e6b467h + DD 0395c0fb8h + DD 0383ebce0h + DD 038dcd192h + DD 039186bdfh + DD 0392de74ch + DD 0392f0944h + DD 0391bff61h + DD 038e9ed44h + DD 038686dc8h + DD 0396b99a7h + DD 039099c89h + DD 037a27673h + DD 0390bdaa3h + DD 0397069abh + DD 0388449ffh + DD 039013538h + DD 0392dc268h + DD 03947f423h + DD 0394ff17ch + DD 03945e10eh + DD 03929e8f5h + DD 038f85db0h + DD 038735f99h + DD 0396c08dbh + DD 03909e600h + DD 037b4996fh + DD 0391233cch + DD 0397cead9h + DD 038adb5cdh + DD 03920261ah + DD 03958ee36h + DD 035aa4905h + DD 037cbd11eh + DD 03805fdf4h +END diff --git a/sdk/lib/crt/math/libm_sse2/log_256_lead_tail_table.asm b/sdk/lib/crt/math/libm_sse2/log_256_lead_tail_table.asm new file mode 100644 index 00000000000..a6fef64ac08 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log_256_lead_tail_table.asm @@ -0,0 +1,554 @@ +;; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
+; +;; Defines __log_256_lead and __log_256_tail tables +;; Used by log and pow +;; + +.const + +ALIGN 16 +PUBLIC __log_256_lead +__log_256_lead DQ 0000000000000000h + DQ 3f6ff00aa0000000h + DQ 3f7fe02a60000000h + DQ 3f87dc4750000000h + DQ 3f8fc0a8b0000000h + DQ 3f93cea440000000h + DQ 3f97b91b00000000h + DQ 3f9b9fc020000000h + DQ 3f9f829b00000000h + DQ 3fa1b0d980000000h + DQ 3fa39e87b0000000h + DQ 3fa58a5ba0000000h + DQ 3fa77458f0000000h + DQ 3fa95c8300000000h + DQ 3fab42dd70000000h + DQ 3fad276b80000000h + DQ 3faf0a30c0000000h + DQ 3fb0759830000000h + DQ 3fb16536e0000000h + DQ 3fb253f620000000h + DQ 3fb341d790000000h + DQ 3fb42edcb0000000h + DQ 3fb51b0730000000h + DQ 3fb60658a0000000h + DQ 3fb6f0d280000000h + DQ 3fb7da7660000000h + DQ 3fb8c345d0000000h + DQ 3fb9ab4240000000h + DQ 3fba926d30000000h + DQ 3fbb78c820000000h + DQ 3fbc5e5480000000h + DQ 3fbd4313d0000000h + DQ 3fbe270760000000h + DQ 3fbf0a30c0000000h + DQ 3fbfec9130000000h + DQ 3fc0671510000000h + DQ 3fc0d77e70000000h + DQ 3fc1478580000000h + DQ 3fc1b72ad0000000h + DQ 3fc2266f10000000h + DQ 3fc29552f0000000h + DQ 3fc303d710000000h + DQ 3fc371fc20000000h + DQ 3fc3dfc2b0000000h + DQ 3fc44d2b60000000h + DQ 3fc4ba36f0000000h + DQ 3fc526e5e0000000h + DQ 3fc59338d0000000h + DQ 3fc5ff3070000000h + DQ 3fc66acd40000000h + DQ 3fc6d60fe0000000h + DQ 3fc740f8f0000000h + DQ 3fc7ab8900000000h + DQ 3fc815c0a0000000h + DQ 3fc87fa060000000h + DQ 3fc8e928d0000000h + DQ 3fc9525a90000000h + DQ 3fc9bb3620000000h + DQ 3fca23bc10000000h + DQ 3fca8becf0000000h + DQ 3fcaf3c940000000h + DQ 3fcb5b5190000000h + DQ 3fcbc28670000000h + DQ 3fcc296850000000h + DQ 3fcc8ff7c0000000h + DQ 3fccf63540000000h + DQ 3fcd5c2160000000h + DQ 3fcdc1bca0000000h + DQ 3fce270760000000h + DQ 3fce8c0250000000h + DQ 3fcef0adc0000000h + DQ 3fcf550a50000000h + DQ 3fcfb91860000000h + DQ 3fd00e6c40000000h + DQ 3fd0402590000000h + DQ 3fd071b850000000h + DQ 3fd0a324e0000000h + DQ 3fd0d46b50000000h + DQ 3fd1058bf0000000h + DQ 3fd1368700000000h + DQ 3fd1675ca0000000h + DQ 3fd1980d20000000h + DQ 3fd1c898c0000000h + DQ 3fd1f8ff90000000h + DQ 3fd22941f0000000h + DQ 3fd2596010000000h + DQ 3fd2895a10000000h + DQ 3fd2b93030000000h + DQ 3fd2e8e2b0000000h + DQ 3fd31871c0000000h + DQ 3fd347dd90000000h + DQ 3fd3772660000000h + DQ 3fd3a64c50000000h + DQ 3fd3d54fa0000000h + DQ 3fd4043080000000h + DQ 3fd432ef20000000h + DQ 3fd4618bc0000000h + DQ 3fd4900680000000h + DQ 3fd4be5f90000000h + DQ 3fd4ec9730000000h + DQ 3fd51aad80000000h + DQ 3fd548a2c0000000h + DQ 3fd5767710000000h + DQ 3fd5a42ab0000000h + DQ 3fd5d1bdb0000000h + DQ 3fd5ff3070000000h + DQ 3fd62c82f0000000h + DQ 3fd659b570000000h + DQ 3fd686c810000000h + DQ 3fd6b3bb20000000h + DQ 3fd6e08ea0000000h + DQ 3fd70d42e0000000h + DQ 3fd739d7f0000000h + DQ 3fd7664e10000000h + DQ 3fd792a550000000h + DQ 3fd7bede00000000h + DQ 3fd7eaf830000000h + DQ 3fd816f410000000h + DQ 3fd842d1d0000000h + DQ 3fd86e9190000000h + DQ 3fd89a3380000000h + DQ 3fd8c5b7c0000000h + DQ 3fd8f11e80000000h + DQ 3fd91c67e0000000h + DQ 3fd9479410000000h + DQ 3fd972a340000000h + DQ 3fd99d9580000000h + DQ 3fd9c86b00000000h + DQ 3fd9f323e0000000h + DQ 3fda1dc060000000h + DQ 3fda484090000000h + DQ 3fda72a490000000h + DQ 3fda9cec90000000h + DQ 3fdac718c0000000h + DQ 3fdaf12930000000h + DQ 3fdb1b1e00000000h + DQ 3fdb44f770000000h + DQ 3fdb6eb590000000h + DQ 3fdb985890000000h + DQ 3fdbc1e080000000h + DQ 3fdbeb4d90000000h + DQ 3fdc149ff0000000h + DQ 3fdc3dd7a0000000h + DQ 3fdc66f4e0000000h + DQ 3fdc8ff7c0000000h + DQ 3fdcb8e070000000h + DQ 3fdce1af00000000h + DQ 3fdd0a63a0000000h + DQ 
3fdd32fe70000000h + DQ 3fdd5b7f90000000h + DQ 3fdd83e720000000h + DQ 3fddac3530000000h + DQ 3fddd46a00000000h + DQ 3fddfc8590000000h + DQ 3fde248810000000h + DQ 3fde4c71a0000000h + DQ 3fde744260000000h + DQ 3fde9bfa60000000h + DQ 3fdec399d0000000h + DQ 3fdeeb20c0000000h + DQ 3fdf128f50000000h + DQ 3fdf39e5b0000000h + DQ 3fdf6123f0000000h + DQ 3fdf884a30000000h + DQ 3fdfaf5880000000h + DQ 3fdfd64f20000000h + DQ 3fdffd2e00000000h + DQ 3fe011fab0000000h + DQ 3fe02552a0000000h + DQ 3fe0389ee0000000h + DQ 3fe04bdf90000000h + DQ 3fe05f14b0000000h + DQ 3fe0723e50000000h + DQ 3fe0855c80000000h + DQ 3fe0986f40000000h + DQ 3fe0ab76b0000000h + DQ 3fe0be72e0000000h + DQ 3fe0d163c0000000h + DQ 3fe0e44980000000h + DQ 3fe0f72410000000h + DQ 3fe109f390000000h + DQ 3fe11cb810000000h + DQ 3fe12f7190000000h + DQ 3fe1422020000000h + DQ 3fe154c3d0000000h + DQ 3fe1675ca0000000h + DQ 3fe179eab0000000h + DQ 3fe18c6e00000000h + DQ 3fe19ee6b0000000h + DQ 3fe1b154b0000000h + DQ 3fe1c3b810000000h + DQ 3fe1d610f0000000h + DQ 3fe1e85f50000000h + DQ 3fe1faa340000000h + DQ 3fe20cdcd0000000h + DQ 3fe21f0bf0000000h + DQ 3fe23130d0000000h + DQ 3fe2434b60000000h + DQ 3fe2555bc0000000h + DQ 3fe2676200000000h + DQ 3fe2795e10000000h + DQ 3fe28b5000000000h + DQ 3fe29d37f0000000h + DQ 3fe2af15f0000000h + DQ 3fe2c0e9e0000000h + DQ 3fe2d2b400000000h + DQ 3fe2e47430000000h + DQ 3fe2f62a90000000h + DQ 3fe307d730000000h + DQ 3fe3197a00000000h + DQ 3fe32b1330000000h + DQ 3fe33ca2b0000000h + DQ 3fe34e2890000000h + DQ 3fe35fa4e0000000h + DQ 3fe37117b0000000h + DQ 3fe38280f0000000h + DQ 3fe393e0d0000000h + DQ 3fe3a53730000000h + DQ 3fe3b68440000000h + DQ 3fe3c7c7f0000000h + DQ 3fe3d90260000000h + DQ 3fe3ea3390000000h + DQ 3fe3fb5b80000000h + DQ 3fe40c7a40000000h + DQ 3fe41d8fe0000000h + DQ 3fe42e9c60000000h + DQ 3fe43f9fe0000000h + DQ 3fe4509a50000000h + DQ 3fe4618bc0000000h + DQ 3fe4727430000000h + DQ 3fe48353d0000000h + DQ 3fe4942a80000000h + DQ 3fe4a4f850000000h + DQ 3fe4b5bd60000000h + DQ 3fe4c679a0000000h + DQ 3fe4d72d30000000h + DQ 3fe4e7d810000000h + DQ 3fe4f87a30000000h + DQ 3fe50913c0000000h + DQ 3fe519a4c0000000h + DQ 3fe52a2d20000000h + DQ 3fe53aad00000000h + DQ 3fe54b2460000000h + DQ 3fe55b9350000000h + DQ 3fe56bf9d0000000h + DQ 3fe57c57f0000000h + DQ 3fe58cadb0000000h + DQ 3fe59cfb20000000h + DQ 3fe5ad4040000000h + DQ 3fe5bd7d30000000h + DQ 3fe5cdb1d0000000h + DQ 3fe5ddde50000000h + DQ 3fe5ee02a0000000h + DQ 3fe5fe1ed0000000h + DQ 3fe60e32f0000000h + DQ 3fe61e3ef0000000h + DQ 3fe62e42e0000000h + DQ 0000000000000000h + +ALIGN 16 +PUBLIC __log_256_tail +__log_256_tail DQ 0000000000000000h + DQ 3db5885e0250435ah + DQ 3de620cf11f86ed2h + DQ 3dff0214edba4a25h + DQ 3dbf807c79f3db4eh + DQ 3dea352ba779a52bh + DQ 3dff56c46aa49fd5h + DQ 3dfebe465fef5196h + DQ 3e0cf0660099f1f8h + DQ 3e1247b2ff85945dh + DQ 3e13fd7abf5202b6h + DQ 3e1f91c9a918d51eh + DQ 3e08cb73f118d3cah + DQ 3e1d91c7d6fad074h + DQ 3de1971bec28d14ch + DQ 3e15b616a423c78ah + DQ 3da162a6617cc971h + DQ 3e166391c4c06d29h + DQ 3e2d46f5c1d0c4b8h + DQ 3e2e14282df1f6d3h + DQ 3e186f47424a660dh + DQ 3e2d4c8de077753eh + DQ 3e2e0c307ed24f1ch + DQ 3e226ea18763bdd3h + DQ 3e25cad69737c933h + DQ 3e2af62599088901h + DQ 3e18c66c83d6b2d0h + DQ 3e1880ceb36fb30fh + DQ 3e2495aac6ca17a4h + DQ 3e2761db4210878ch + DQ 3e2eb78e862bac2fh + DQ 3e19b2cd75790dd9h + DQ 3e2c55e5cbd3d50fh + DQ 3db162a6617cc971h + DQ 3dfdbeabaaa2e519h + DQ 3e1652cb7150c647h + DQ 3e39a11cb2cd2ee2h + DQ 3e219d0ab1a28813h + DQ 3e24bd9e80a41811h + DQ 3e3214b596faa3dfh + DQ 3e303fea46980bb8h + DQ 3e31c8ffa5fd28c7h + DQ 
3dce8f743bcd96c5h + DQ 3dfd98c5395315c6h + DQ 3e3996fa3ccfa7b2h + DQ 3e1cd2af2ad13037h + DQ 3e1d0da1bd17200eh + DQ 3e3330410ba68b75h + DQ 3df4f27a790e7c41h + DQ 3e13956a86f6ff1bh + DQ 3e2c6748723551d9h + DQ 3e2500de9326cdfch + DQ 3e1086c848df1b59h + DQ 3e04357ead6836ffh + DQ 3e24832442408024h + DQ 3e3d10da8154b13dh + DQ 3e39e8ad68ec8260h + DQ 3e3cfbf706abaf18h + DQ 3e3fc56ac6326e23h + DQ 3e39105e3185cf21h + DQ 3e3d017fe5b19cc0h + DQ 3e3d1f6b48dd13feh + DQ 3e20b63358a7e73ah + DQ 3e263063028c211ch + DQ 3e2e6a6886b09760h + DQ 3e3c138bb891cd03h + DQ 3e369f7722b7221ah + DQ 3df57d8fac1a628ch + DQ 3e3c55e5cbd3d50fh + DQ 3e1552d2ff48fe2eh + DQ 3e37b8b26ca431bch + DQ 3e292decdc1c5f6dh + DQ 3e3abc7c551aaa8ch + DQ 3e36b540731a354bh + DQ 3e32d341036b89efh + DQ 3e4f9ab21a3a2e0fh + DQ 3e239c871afb9fbdh + DQ 3e3e6add2c81f640h + DQ 3e435c95aa313f41h + DQ 3e249d4582f6cc53h + DQ 3e47574c1c07398fh + DQ 3e4ba846dece9e8dh + DQ 3e16999fafbc68e7h + DQ 3e4c9145e51b0103h + DQ 3e479ef2cb44850ah + DQ 3e0beec73de11275h + DQ 3e2ef4351af5a498h + DQ 3e45713a493b4a50h + DQ 3e45c23a61385992h + DQ 3e42a88309f57299h + DQ 3e4530faa9ac8aceh + DQ 3e25fec2d792a758h + DQ 3e35a517a71cbcd7h + DQ 3e3707dc3e1cd9a3h + DQ 3e3a1a9f8ef43049h + DQ 3e4409d0276b3674h + DQ 3e20e2f613e85bd9h + DQ 3df0027433001e5fh + DQ 3e35dde2836d3265h + DQ 3e2300134d7aaf04h + DQ 3e3cb7e0b42724f5h + DQ 3e2d6e93167e6308h + DQ 3e3d1569b1526adbh + DQ 3e0e99fc338a1a41h + DQ 3e4eb01394a11b1ch + DQ 3e04f27a790e7c41h + DQ 3e25ce3ca97b7af9h + DQ 3e281f0f940ed857h + DQ 3e4d36295d88857ch + DQ 3e21aca1ec4af526h + DQ 3e445743c7182726h + DQ 3e23c491aead337eh + DQ 3e3aef401a738931h + DQ 3e21cede76092a29h + DQ 3e4fba8f44f82bb4h + DQ 3e446f5f7f3c3e1ah + DQ 3e47055f86c9674bh + DQ 3e4b41a92b6b6e1ah + DQ 3e443d162e927628h + DQ 3e4466174013f9b1h + DQ 3e3b05096ad69c62h + DQ 3e40b169150faa58h + DQ 3e3cd98b1df85da7h + DQ 3e468b507b0f8fa8h + DQ 3e48422df57499bah + DQ 3e11351586970274h + DQ 3e117e08acba92eeh + DQ 3e26e04314dd0229h + DQ 3e497f3097e56d1ah + DQ 3e3356e655901286h + DQ 3e0cb761457f94d6h + DQ 3e39af67a85a9dach + DQ 3e453410931a909fh + DQ 3e22c587206058f5h + DQ 3e223bc358899c22h + DQ 3e4d7bf8b6d223cbh + DQ 3e47991ec5197ddbh + DQ 3e4a79e6bb3a9219h + DQ 3e3a4c43ed663ec5h + DQ 3e461b5a1484f438h + DQ 3e4b4e36f7ef0c3ah + DQ 3e115f026acd0d1bh + DQ 3e3f36b535cecf05h + DQ 3e2ffb7fbf3eb5c6h + DQ 3e3e6a6886b09760h + DQ 3e3135eb27f5bbc3h + DQ 3e470be7d6f6fa57h + DQ 3e4ce43cc84ab338h + DQ 3e4c01d7aac3bd91h + DQ 3e45c58d07961060h + DQ 3e3628bcf941456eh + DQ 3e4c58b2a8461cd2h + DQ 3e33071282fb989ah + DQ 3e420dab6a80f09ch + DQ 3e44f8d84c397b1eh + DQ 3e40d0ee08599e48h + DQ 3e1d68787e37da36h + DQ 3e366187d591bafch + DQ 3e22346600bae772h + DQ 3e390377d0d61b8eh + DQ 3e4f5e0dd966b907h + DQ 3e49023cb79a00e2h + DQ 3e44e05158c28ad8h + DQ 3e3bfa7b08b18ae4h + DQ 3e4ef1e63db35f67h + DQ 3e0ec2ae39493d4fh + DQ 3e40afe930ab2fa0h + DQ 3e225ff8a1810dd4h + DQ 3e469743fb1a71a5h + DQ 3e5f9cc676785571h + DQ 3e5b524da4cbf982h + DQ 3e5a4c8b381535b8h + DQ 3e5839be809caf2ch + DQ 3e50968a1cb82c13h + DQ 3e5eae6a41723fb5h + DQ 3e5d9c29a380a4dbh + DQ 3e4094aa0ada625eh + DQ 3e5973ad6fc108cah + DQ 3e4747322fdbab97h + DQ 3e593692fa9d4221h + DQ 3e5c5a992dfbc7d9h + DQ 3e4e1f33e102387ah + DQ 3e464fbef14c048ch + DQ 3e4490f513ca5e3bh + DQ 3e37a6af4d4c799dh + DQ 3e57574c1c07398fh + DQ 3e57b133417f8c1ch + DQ 3e5feb9e0c176514h + DQ 3e419f25bb3172f7h + DQ 3e45f68a7bbfb852h + DQ 3e5ee278497929f1h + DQ 3e5ccee006109d58h + DQ 3e5ce081a07bd8b3h + DQ 3e570e12981817b8h + DQ 3e292ab6d93503d0h + DQ 3e58cb7dd7c3b61eh + DQ 
3e4efafd0a0b78dah + DQ 3e5e907267c4288eh + DQ 3e5d31ef96780875h + DQ 3e23430dfcd2ad50h + DQ 3e344d88d75bc1f9h + DQ 3e5bec0f055e04fch + DQ 3e5d85611590b9adh + DQ 3df320568e583229h + DQ 3e5a891d1772f538h + DQ 3e22edc9dabba74dh + DQ 3e4b9009a1015086h + DQ 3e52a12a8c5b1a19h + DQ 3e3a7885f0fdac85h + DQ 3e5f4ffcd43ac691h + DQ 3e52243ae2640aadh + DQ 3e546513299035d3h + DQ 3e5b39c3a62dd725h + DQ 3e5ba6dd40049f51h + DQ 3e451d1ed7177409h + DQ 3e5cb0f2fd7f5216h + DQ 3e3ab150cd4e2213h + DQ 3e5cfd7bf3193844h + DQ 3e53fff8455f1dbdh + DQ 3e5fee640b905fc9h + DQ 3e54e2adf548084ch + DQ 3e3b597adc1ecdd2h + DQ 3e4345bd096d3a75h + DQ 3e5101b9d2453c8bh + DQ 3e508ce55cc8c979h + DQ 3e5bbf017e595f71h + DQ 3e37ce733bd393dch + DQ 3e233bb0a503f8a1h + DQ 3e30e2f613e85bd9h + DQ 3e5e67555a635b3ch + DQ 3e2ea88df73d5e8bh + DQ 3e3d17e03bda18a8h + DQ 3e5b607d76044f7eh + DQ 3e52adc4e71bc2fch + DQ 3e5f99dc7362d1d9h + DQ 3e5473fa008e6a6ah + DQ 3e2b75bb09cb0985h + DQ 3e5ea04dd10b9abah + DQ 3e5802d0d6979674h + DQ 3e174688ccd99094h + DQ 3e496f16abb9df22h + DQ 3e46e66df2aa374fh + DQ 3e4e66525ea4550ah + DQ 3e42d02f34f20cbdh + DQ 3e46cfce65047188h + DQ 3e39b78c842d58b8h + DQ 3e4735e624c24bc9h + DQ 3e47eba1f7dd1adfh + DQ 3e586b3e59f65355h + DQ 3e1ce38e637f1b4dh + DQ 3e58d82ec919edc7h + DQ 3e4c52648ddcfa37h + DQ 3e52482ceae1ac12h + DQ 3e55a312311aba4fh + DQ 3e411e236329f225h + DQ 3e5b48c8cd2f246ch + DQ 3e6efa39ef35793ch + DQ 0000000000000000h + +END diff --git a/sdk/lib/crt/math/libm_sse2/log_F_inv_dword_table.asm b/sdk/lib/crt/math/libm_sse2/log_F_inv_dword_table.asm new file mode 100644 index 00000000000..44101f94955 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log_F_inv_dword_table.asm @@ -0,0 +1,164 @@ +;; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
+; +;; Defines __log_F_inv_dword +;; Used in log10f and logf +;; + +.const + +ALIGN 16 +PUBLIC __log_F_inv_dword +__log_F_inv_dword DD 40000000h + DD 3ffe03f8h + DD 3ffc0fc1h + DD 3ffa232dh + DD 3ff83e10h + DD 3ff6603eh + DD 3ff4898dh + DD 3ff2b9d6h + DD 3ff0f0f1h + DD 3fef2eb7h + DD 3fed7304h + DD 3febbdb3h + DD 3fea0ea1h + DD 3fe865ach + DD 3fe6c2b4h + DD 3fe52598h + DD 3fe38e39h + DD 3fe1fc78h + DD 3fe07038h + DD 3fdee95ch + DD 3fdd67c9h + DD 3fdbeb62h + DD 3fda740eh + DD 3fd901b2h + DD 3fd79436h + DD 3fd62b81h + DD 3fd4c77bh + DD 3fd3680dh + DD 3fd20d21h + DD 3fd0b6a0h + DD 3fcf6475h + DD 3fce168ah + DD 3fcccccdh + DD 3fcb8728h + DD 3fca4588h + DD 3fc907dah + DD 3fc7ce0ch + DD 3fc6980ch + DD 3fc565c8h + DD 3fc43730h + DD 3fc30c31h + DD 3fc1e4bch + DD 3fc0c0c1h + DD 3fbfa030h + DD 3fbe82fah + DD 3fbd6910h + DD 3fbc5264h + DD 3fbb3ee7h + DD 3fba2e8ch + DD 3fb92144h + DD 3fb81703h + DD 3fb70fbbh + DD 3fb60b61h + DD 3fb509e7h + DD 3fb40b41h + DD 3fb30f63h + DD 3fb21643h + DD 3fb11fd4h + DD 3fb02c0bh + DD 3faf3adeh + DD 3fae4c41h + DD 3fad602bh + DD 3fac7692h + DD 3fab8f6ah + DD 3faaaaabh + DD 3fa9c84ah + DD 3fa8e83fh + DD 3fa80a81h + DD 3fa72f05h + DD 3fa655c4h + DD 3fa57eb5h + DD 3fa4a9cfh + DD 3fa3d70ah + DD 3fa3065eh + DD 3fa237c3h + DD 3fa16b31h + DD 3fa0a0a1h + DD 3f9fd80ah + DD 3f9f1166h + DD 3f9e4cadh + DD 3f9d89d9h + DD 3f9cc8e1h + DD 3f9c09c1h + DD 3f9b4c70h + DD 3f9a90e8h + DD 3f99d723h + DD 3f991f1ah + DD 3f9868c8h + DD 3f97b426h + DD 3f97012eh + DD 3f964fdah + DD 3f95a025h + DD 3f94f209h + DD 3f944581h + DD 3f939a86h + DD 3f92f114h + DD 3f924925h + DD 3f91a2b4h + DD 3f90fdbch + DD 3f905a38h + DD 3f8fb824h + DD 3f8f177ah + DD 3f8e7835h + DD 3f8dda52h + DD 3f8d3dcbh + DD 3f8ca29ch + DD 3f8c08c1h + DD 3f8b7034h + DD 3f8ad8f3h + DD 3f8a42f8h + DD 3f89ae41h + DD 3f891ac7h + DD 3f888889h + DD 3f87f781h + DD 3f8767abh + DD 3f86d905h + DD 3f864b8ah + DD 3f85bf37h + DD 3f853408h + DD 3f84a9fah + DD 3f842108h + DD 3f839930h + DD 3f83126fh + DD 3f828cc0h + DD 3f820821h + DD 3f81848eh + DD 3f810204h + DD 3f808081h + DD 3f800000h + +END diff --git a/sdk/lib/crt/math/libm_sse2/log_F_inv_qword_table.asm b/sdk/lib/crt/math/libm_sse2/log_F_inv_qword_table.asm new file mode 100644 index 00000000000..5c65eeed4ba --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log_F_inv_qword_table.asm @@ -0,0 +1,294 @@ +;; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
+; +;; Defines __log_F_inv_qword +;; Used in log10 and log +;; + +.const + +ALIGN 16 +PUBLIC __log_F_inv_qword +__log_F_inv_qword DQ 4000000000000000h + DQ 3fffe01fe01fe020h + DQ 3fffc07f01fc07f0h + DQ 3fffa11caa01fa12h + DQ 3fff81f81f81f820h + DQ 3fff6310aca0dbb5h + DQ 3fff44659e4a4271h + DQ 3fff25f644230ab5h + DQ 3fff07c1f07c1f08h + DQ 3ffee9c7f8458e02h + DQ 3ffecc07b301ecc0h + DQ 3ffeae807aba01ebh + DQ 3ffe9131abf0b767h + DQ 3ffe741aa59750e4h + DQ 3ffe573ac901e574h + DQ 3ffe3a9179dc1a73h + DQ 3ffe1e1e1e1e1e1eh + DQ 3ffe01e01e01e01eh + DQ 3ffde5d6e3f8868ah + DQ 3ffdca01dca01dcah + DQ 3ffdae6076b981dbh + DQ 3ffd92f2231e7f8ah + DQ 3ffd77b654b82c34h + DQ 3ffd5cac807572b2h + DQ 3ffd41d41d41d41dh + DQ 3ffd272ca3fc5b1ah + DQ 3ffd0cb58f6ec074h + DQ 3ffcf26e5c44bfc6h + DQ 3ffcd85689039b0bh + DQ 3ffcbe6d9601cbe7h + DQ 3ffca4b3055ee191h + DQ 3ffc8b265afb8a42h + DQ 3ffc71c71c71c71ch + DQ 3ffc5894d10d4986h + DQ 3ffc3f8f01c3f8f0h + DQ 3ffc26b5392ea01ch + DQ 3ffc0e070381c0e0h + DQ 3ffbf583ee868d8bh + DQ 3ffbdd2b899406f7h + DQ 3ffbc4fd65883e7bh + DQ 3ffbacf914c1bad0h + DQ 3ffb951e2b18ff23h + DQ 3ffb7d6c3dda338bh + DQ 3ffb65e2e3beee05h + DQ 3ffb4e81b4e81b4fh + DQ 3ffb37484ad806ceh + DQ 3ffb2036406c80d9h + DQ 3ffb094b31d922a4h + DQ 3ffaf286bca1af28h + DQ 3ffadbe87f94905eh + DQ 3ffac5701ac5701bh + DQ 3ffaaf1d2f87ebfdh + DQ 3ffa98ef606a63beh + DQ 3ffa82e65130e159h + DQ 3ffa6d01a6d01a6dh + DQ 3ffa574107688a4ah + DQ 3ffa41a41a41a41ah + DQ 3ffa2c2a87c51ca0h + DQ 3ffa16d3f97a4b02h + DQ 3ffa01a01a01a01ah + DQ 3ff9ec8e951033d9h + DQ 3ff9d79f176b682dh + DQ 3ff9c2d14ee4a102h + DQ 3ff9ae24ea5510dah + DQ 3ff999999999999ah + DQ 3ff9852f0d8ec0ffh + DQ 3ff970e4f80cb872h + DQ 3ff95cbb0be377aeh + DQ 3ff948b0fcd6e9e0h + DQ 3ff934c67f9b2ce6h + DQ 3ff920fb49d0e229h + DQ 3ff90d4f120190d5h + DQ 3ff8f9c18f9c18fah + DQ 3ff8e6527af1373fh + DQ 3ff8d3018d3018d3h + DQ 3ff8bfce8062ff3ah + DQ 3ff8acb90f6bf3aah + DQ 3ff899c0f601899ch + DQ 3ff886e5f0abb04ah + DQ 3ff87427bcc092b9h + DQ 3ff8618618618618h + DQ 3ff84f00c2780614h + DQ 3ff83c977ab2beddh + DQ 3ff82a4a0182a4a0h + DQ 3ff8181818181818h + DQ 3ff8060180601806h + DQ 3ff7f405fd017f40h + DQ 3ff7e225515a4f1dh + DQ 3ff7d05f417d05f4h + DQ 3ff7beb3922e017ch + DQ 3ff7ad2208e0ecc3h + DQ 3ff79baa6bb6398bh + DQ 3ff78a4c8178a4c8h + DQ 3ff77908119ac60dh + DQ 3ff767dce434a9b1h + DQ 3ff756cac201756dh + DQ 3ff745d1745d1746h + DQ 3ff734f0c541fe8dh + DQ 3ff724287f46debch + DQ 3ff713786d9c7c09h + DQ 3ff702e05c0b8170h + DQ 3ff6f26016f26017h + DQ 3ff6e1f76b4337c7h + DQ 3ff6d1a62681c861h + DQ 3ff6c16c16c16c17h + DQ 3ff6b1490aa31a3dh + DQ 3ff6a13cd1537290h + DQ 3ff691473a88d0c0h + DQ 3ff6816816816817h + DQ 3ff6719f3601671ah + DQ 3ff661ec6a5122f9h + DQ 3ff6524f853b4aa3h + DQ 3ff642c8590b2164h + DQ 3ff63356b88ac0deh + DQ 3ff623fa77016240h + DQ 3ff614b36831ae94h + DQ 3ff6058160581606h + DQ 3ff5f66434292dfch + DQ 3ff5e75bb8d015e7h + DQ 3ff5d867c3ece2a5h + DQ 3ff5c9882b931057h + DQ 3ff5babcc647fa91h + DQ 3ff5ac056b015ac0h + DQ 3ff59d61f123ccaah + DQ 3ff58ed2308158edh + DQ 3ff5805601580560h + DQ 3ff571ed3c506b3ah + DQ 3ff56397ba7c52e2h + DQ 3ff5555555555555h + DQ 3ff54725e6bb82feh + DQ 3ff5390948f40febh + DQ 3ff52aff56a8054bh + DQ 3ff51d07eae2f815h + DQ 3ff50f22e111c4c5h + DQ 3ff5015015015015h + DQ 3ff4f38f62dd4c9bh + DQ 3ff4e5e0a72f0539h + DQ 3ff4d843bedc2c4ch + DQ 3ff4cab88725af6eh + DQ 3ff4bd3edda68fe1h + DQ 3ff4afd6a052bf5bh + DQ 3ff4a27fad76014ah + DQ 3ff49539e3b2d067h + DQ 3ff4880522014880h + DQ 3ff47ae147ae147bh + DQ 3ff46dce34596066h + DQ 3ff460cbc7f5cf9ah + DQ 3ff453d9e2c776cah + DQ 3ff446f86562d9fbh 
+ DQ 3ff43a2730abee4dh + DQ 3ff42d6625d51f87h + DQ 3ff420b5265e5951h + DQ 3ff4141414141414h + DQ 3ff40782d10e6566h + DQ 3ff3fb013fb013fbh + DQ 3ff3ee8f42a5af07h + DQ 3ff3e22cbce4a902h + DQ 3ff3d5d991aa75c6h + DQ 3ff3c995a47babe7h + DQ 3ff3bd60d9232955h + DQ 3ff3b13b13b13b14h + DQ 3ff3a524387ac822h + DQ 3ff3991c2c187f63h + DQ 3ff38d22d366088eh + DQ 3ff3813813813814h + DQ 3ff3755bd1c945eeh + DQ 3ff3698df3de0748h + DQ 3ff35dce5f9f2af8h + DQ 3ff3521cfb2b78c1h + DQ 3ff34679ace01346h + DQ 3ff33ae45b57bcb2h + DQ 3ff32f5ced6a1dfah + DQ 3ff323e34a2b10bfh + DQ 3ff3187758e9ebb6h + DQ 3ff30d190130d190h + DQ 3ff301c82ac40260h + DQ 3ff2f684bda12f68h + DQ 3ff2eb4ea1fed14bh + DQ 3ff2e025c04b8097h + DQ 3ff2d50a012d50a0h + DQ 3ff2c9fb4d812ca0h + DQ 3ff2bef98e5a3711h + DQ 3ff2b404ad012b40h + DQ 3ff2a91c92f3c105h + DQ 3ff29e4129e4129eh + DQ 3ff293725bb804a5h + DQ 3ff288b01288b013h + DQ 3ff27dfa38a1ce4dh + DQ 3ff27350b8812735h + DQ 3ff268b37cd60127h + DQ 3ff25e22708092f1h + DQ 3ff2539d7e9177b2h + DQ 3ff2492492492492h + DQ 3ff23eb79717605bh + DQ 3ff23456789abcdfh + DQ 3ff22a0122a0122ah + DQ 3ff21fb78121fb78h + DQ 3ff21579804855e6h + DQ 3ff20b470c67c0d9h + DQ 3ff2012012012012h + DQ 3ff1f7047dc11f70h + DQ 3ff1ecf43c7fb84ch + DQ 3ff1e2ef3b3fb874h + DQ 3ff1d8f5672e4abdh + DQ 3ff1cf06ada2811dh + DQ 3ff1c522fc1ce059h + DQ 3ff1bb4a4046ed29h + DQ 3ff1b17c67f2bae3h + DQ 3ff1a7b9611a7b96h + DQ 3ff19e0119e0119eh + DQ 3ff19453808ca29ch + DQ 3ff18ab083902bdbh + DQ 3ff1811811811812h + DQ 3ff1778a191bd684h + DQ 3ff16e0689427379h + DQ 3ff1648d50fc3201h + DQ 3ff15b1e5f75270dh + DQ 3ff151b9a3fdd5c9h + DQ 3ff1485f0e0acd3bh + DQ 3ff13f0e8d344724h + DQ 3ff135c81135c811h + DQ 3ff12c8b89edc0ach + DQ 3ff12358e75d3033h + DQ 3ff11a3019a74826h + DQ 3ff1111111111111h + DQ 3ff107fbbe011080h + DQ 3ff0fef010fef011h + DQ 3ff0f5edfab325a2h + DQ 3ff0ecf56be69c90h + DQ 3ff0e40655826011h + DQ 3ff0db20a88f4696h + DQ 3ff0d24456359e3ah + DQ 3ff0c9714fbcda3bh + DQ 3ff0c0a7868b4171h + DQ 3ff0b7e6ec259dc8h + DQ 3ff0af2f722eecb5h + DQ 3ff0a6810a6810a7h + DQ 3ff09ddba6af8360h + DQ 3ff0953f39010954h + DQ 3ff08cabb37565e2h + DQ 3ff0842108421084h + DQ 3ff07b9f29b8eae2h + DQ 3ff073260a47f7c6h + DQ 3ff06ab59c7912fbh + DQ 3ff0624dd2f1a9fch + DQ 3ff059eea0727586h + DQ 3ff05197f7d73404h + DQ 3ff04949cc1664c5h + DQ 3ff0410410410410h + DQ 3ff038c6b78247fch + DQ 3ff03091b51f5e1ah + DQ 3ff02864fc7729e9h + DQ 3ff0204081020408h + DQ 3ff0182436517a37h + DQ 3ff0101010101010h + DQ 3ff0080402010080h + DQ 3ff0000000000000h + DQ 0000000000000000h + + +END diff --git a/sdk/lib/crt/math/libm_sse2/log_special.c b/sdk/lib/crt/math/libm_sse2/log_special.c new file mode 100644 index 00000000000..957ba5e2499 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/log_special.c @@ -0,0 +1,133 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "libm_new.h" + +// y = log10f(x) +// y = log10(x) +// y = logf(x) +// y = log(x) + +// these codes and the ones in the related .asm files have to match +#define LOG_X_ZERO 1 +#define LOG_X_NEG 2 +#define LOG_X_NAN 3 + +static float _logf_special_common(float x, float y, U32 code, unsigned int op, char *name) +{ + switch(code) + { + case LOG_X_ZERO: + { + UT64 ym; ym.u64 = 0; ym.f32[0] = y; + _handle_errorf(name, op, ym.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1); + } + break; + + case LOG_X_NEG: + { + UT64 ym; ym.u64 = 0; ym.f32[0] = y; + _handle_errorf(name, op, ym.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1); + } + break; + + case LOG_X_NAN: + { + unsigned int is_snan; + UT32 xm; UT64 ym; + xm.f32 = x; + is_snan = (((xm.u32 & QNAN_MASK_32) == QNAN_SET_32) ? 0 : 1); + ym.u64 = 0; ym.f32[0] = y; + + if(is_snan) + { + _handle_errorf(name, op, ym.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1); + } + else + { + _handle_errorf(name, op, ym.u64, _DOMAIN, 0, EDOM, x, 0.0, 1); + } + } + break; + } + + return y; +} + +float _logf_special(float x, float y, U32 code) +{ + return _logf_special_common(x, y, code, _FpCodeLog, "logf"); +} + +float _log10f_special(float x, float y, U32 code) +{ + return _logf_special_common(x, y, code, _FpCodeLog10, "log10f"); +} + +static double _log_special_common(double x, double y, U32 code, unsigned int op, char *name) +{ + switch(code) + { + case LOG_X_ZERO: + { + UT64 ym; ym.f64 = y; + _handle_error(name, op, ym.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1); + } + break; + + case LOG_X_NEG: + { + UT64 ym; ym.f64 = y; + _handle_error(name, op, ym.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1); + } + break; + + case LOG_X_NAN: + { + UT64 ym; ym.f64 = y; + _handle_error(name, op, ym.u64, _DOMAIN, 0, EDOM, x, 0.0, 1); + } + break; + } + + return y; +} + +double _log_special(double x, double y, U32 code) +{ + return _log_special_common(x, y, code, _FpCodeLog, "log"); +} + +double _log10_special(double x, double y, U32 code) +{ + return _log_special_common(x, y, code, _FpCodeLog10, "log10"); +} diff --git a/sdk/lib/crt/math/libm_sse2/logb.c b/sdk/lib/crt/math/libm_sse2/logb.c new file mode 100644 index 00000000000..f8680e43d08 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/logb.c @@ -0,0 +1,84 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_INFINITY_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_INFINITY_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + +double _logb(double x) +{ + + unsigned long ux; + long u; + GET_BITS_DP64(x, ux); + u = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + if ((ux & ~SIGNBIT_DP64) == 0) + /* x is +/-zero. Return -infinity with div-by-zero flag. */ + return _handle_error("_logb", OP_LOGB, NINFBITPATT_DP64, _SING, + AMD_F_DIVBYZERO, ERANGE, x, 0.0, 1); + else if (EMIN_DP64 <= u && u <= EMAX_DP64) + /* x is a normal number */ + return (double)u; + else if (u > EMAX_DP64) + { + /* x is infinity or NaN */ + if ((ux & MANTBITS_DP64) == 0) + /* x is +/-infinity. For VC++, return infinity of same sign. */ + return x; + else + /* x is NaN, result is NaN */ + return _handle_error("_logb", OP_LOGB, ux|0x0008000000000000, _DOMAIN, + 0, EDOM, x, 0.0, 1); + } + else + { + /* x is denormalized. */ +#ifdef FOLLOW_IEEE754_LOGB + /* Return the value of the minimum exponent to ensure that + the relationship between logb and scalb, defined in + IEEE 754, holds. */ + return EMIN_DP64; +#else + /* Follow the rule set by IEEE 854 for logb */ + ux &= MANTBITS_DP64; + u = EMIN_DP64; + while (ux < IMPBIT_DP64) + { + ux <<= 1; + u--; + } + return (double)u; +#endif + } + +} diff --git a/sdk/lib/crt/math/libm_sse2/logbf.c b/sdk/lib/crt/math/libm_sse2/logbf.c new file mode 100644 index 00000000000..006b384eb10 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/logbf.c @@ -0,0 +1,82 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_INFINITYF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_INFINITYF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +float _logbf(float x) +{ + unsigned int ux; + int u; + GET_BITS_SP32(x, ux); + u = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + if ((ux & ~SIGNBIT_SP32) == 0) + /* x is +/-zero. Return -infinity with div-by-zero flag. */ + return _handle_errorf("_logbf", OP_LOGB, NINFBITPATT_SP32, _SING, + AMD_F_DIVBYZERO, ERANGE, x, 0.0F, 1); + else if (EMIN_SP32 <= u && u <= EMAX_SP32) + /* x is a normal number */ + return (float)u; + else if (u > EMAX_SP32) + { + /* x is infinity or NaN */ + if ((ux & MANTBITS_SP32) == 0) + /* x is +/-infinity. For VC++, return infinity of same sign. */ + return x; + else + /* x is NaN, result is NaN */ + return _handle_errorf("_logbf", OP_LOGB, ux|0x00400000, _DOMAIN, + 0, EDOM, x, 0.0F, 1); + } + else + { + /* x is denormalized. */ +#ifdef FOLLOW_IEEE754_LOGB + /* Return the value of the minimum exponent to ensure that + the relationship between logb and scalb, defined in + IEEE 754, holds. */ + return EMIN_SP32; +#else + /* Follow the rule set by IEEE 854 for logb */ + ux &= MANTBITS_SP32; + u = EMIN_SP32; + while (ux < IMPBIT_SP32) + { + ux <<= 1; + u--; + } + return (float)u; +#endif + } +} diff --git a/sdk/lib/crt/math/libm_sse2/logf.asm b/sdk/lib/crt/math/libm_sse2/logf.asm new file mode 100644 index 00000000000..5c426d5ca44 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/logf.asm @@ -0,0 +1,451 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; +; logf.asm +; +; An implementation of the logf libm function. 
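+;
+; Note (editorial): the body of this file contains two code paths, a baseline
+; SSE2 implementation and an FMA3 variant, selected at run time through the
+; __use_fma3_lib flag tested at the top of the function.
+;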
+;
+; Prototype:
+;
+;     float logf(float x);
+;
+
+;
+; Algorithm:
+;     Similar to the one presented in log.asm
+;
+.const
+
+
+ALIGN 16
+
+L_real_one          DQ 0000000003f800000h   ; 1.0
+                    DQ 0000000000000000h
+L_real_two          DQ 00000000040000000h   ; 2.0
+                    DQ 00000000000000000h
+L_real_ninf         DQ 000000000ff800000h   ; -inf
+                    DQ 0000000000000000h
+L_real_inf          DQ 0000000007f800000h   ; +inf
+                    DQ 0000000000000000h
+L_real_nan          DQ 0000000007fc00000h   ; NaN
+                    DQ 0000000000000000h
+L_real_neg_qnan     DQ 000000000ffc00000h
+                    DQ 0000000000000000h
+L_real_notsign      DQ 0000000007ffFFFFFh   ; ^sign bit
+                    DQ 0000000000000000h
+L_real_mant         DQ 0007FFFFF007FFFFFh   ; mantissa bits
+                    DQ 0007FFFFF007FFFFFh
+L_mask_127          DQ 00000007f0000007fh   ;
+                    DQ 00000007f0000007fh
+L_mask_253          DQ 000000000000000fdh
+                    DQ 00000000000000000h
+L_mask_mant_all7    DQ 00000000007f0000h
+                    DQ 00000000007f0000h
+L_mask_mant8        DQ 0000000000008000h
+                    DQ 0000000000000000h
+L_real_ca1          DQ 0000000003DAAAAABh   ; 8.33333333333317923934e-02
+                    DQ 00000000000000000h
+L_real_ca2          DQ 0000000003C4CCCCDh   ; 1.25000000037717509602e-02
+                    DQ 00000000000000000h
+L_real_log2_lead    DQ 03F3170003F317000h   ; 0.693115234375
+                    DQ 00000000000000000h
+L_real_log2_tail    DQ 0000000003805FDF4h   ; 0.000031946183
+                    DQ 00000000000000000h
+L_real_half         DQ 0000000003f000000h   ; 1/2
+                    DQ 00000000000000000h
+L_real_1_over_3     DQ 0000000003eaaaaabh
+                    DQ 00000000000000000h
+
+L_real_1_over_2     DD 03f000000h
+L_real_neg127       DD 0c2fe0000h
+L_real_qnanbit      DD 000400000h   ; quiet nan bit
+L_real_threshold    DD 03d800000h
+
+; these codes and the ones in the corresponding .c file have to match
+L_flag_x_zero       DD 00000001
+L_flag_x_neg        DD 00000002
+L_flag_x_nan        DD 00000003
+
+EXTRN __log_128_lead:DWORD
+EXTRN __log_128_tail:DWORD
+EXTRN __log_F_inv_dword:DWORD
+EXTRN __use_fma3_lib:DWORD
+
+fname         TEXTEQU <logf>
+fname_special TEXTEQU <_logf_special>
+
+; define local variable storage offsets
+
+dummy_space   EQU 020h
+stack_size    EQU 038h
+
+include fm.inc
+
+; external function
+EXTERN fname_special:PROC
+
+.code
+
+PUBLIC fname
+fname PROC FRAME
+    StackAllocate stack_size
+    .ENDPROLOG
+    cmp     DWORD PTR __use_fma3_lib, 0
+    jne     Llogf_fma3
+
+    ; Some of the placement of instructions below will be odd.
+    ; We are attempting to have no more than one branch per 32-byte block.
+Llogf_sse2:
+    ; Zero the high bits of rax because it will be used as an index later.
+    xor     rax, rax
+    movdqa  xmm3, xmm0
+    movaps  xmm4, xmm0
+
+    ; This computation of the exponent of x will produce nonsense if x <= 0.,
+    ; but those cases are eliminated below, so it does no harm.
+    psrld   xmm3, 23                        ; xmm3 <-- biased exp if x > 0.
+
+    ; Is x Inf or NaN?
+    movd    eax, xmm0                       ; eax <-- x
+    mov     ecx, eax
+    btr     ecx, 31                         ; ecx <-- |x|
+    cmp     ecx, DWORD PTR L_real_inf
+    jae     Llogf_sse2_x_is_inf_or_nan
+
+    ; Finish computing exponent.
+    psubd   xmm3, XMMWORD PTR L_mask_127    ; xmm3 <-- xexp (unbiased)
+    movdqa  xmm2, xmm0
+    cvtdq2ps xmm5, xmm3                     ; (float)xexp, unless x <= 0.
+
+    ; Is x negative or zero?
+    xorps   xmm1, xmm1
+    comiss  xmm0, xmm1
+    jbe     Llogf_sse2_x_is_zero_or_neg
+
+    pand    xmm2, XMMWORD PTR L_real_mant   ; xmm2 <-- x mantissa for later
+    subss   xmm4, DWORD PTR L_real_one      ; xmm4 <-- x - 1. for later
+
+    comiss  xmm5, DWORD PTR L_real_neg127   ; x!=0, xexp==0 ==> subnormal
+    je      Llogf_sse2_subnormal_adjust
+
+Llogf_sse2_continue_common:
+    ; At this point we need |x| (possibly adjusted) in eax
+    ; and m = xexp (possibly adjusted) in xmm5
+    ; We also need the value of x - 1. computed above.
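+
+    ; For orientation, the table-driven path that follows corresponds roughly
+    ; to the C-style sketch below (an editorial illustration; the variable
+    ; names are descriptive and do not appear in this file):
+    ;
+    ;     j    = top 7 bits of the mantissa, rounded up via the 8th bit;
+    ;     F    = 0.5 + j/256.0;                  /* F and Y both lie in [0.5,1) */
+    ;     Y    = 0.5 * (1.0 + mantissa(x)/2^23);
+    ;     r    = (F - Y) * __log_F_inv_dword[j]; /* r ~= (F - Y)/F              */
+    ;     poly = r + r*r*(0.5 + r/3.0);          /* ~ -log(1 - r)               */
+    ;     logf(x) ~= m*log(2) + log(2*F) - poly; /* log(2*F) and log(2) are     */
+    ;                                            /* kept as lead/tail pairs     */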
+ + ; compute the index into the log tables + mov r9d, eax + and eax, DWORD PTR L_mask_mant_all7 ; eax <-- 7 bits of x mantissa + and r9d, DWORD PTR L_mask_mant8 ; r9d <-- 8th bit + shl r9d, 1 + add eax, r9d ; use 8th bit to round up + movd xmm1, eax + + ; Is x near 1.0 ? + ; Note that if x is subnormal it is perforce not near one. + andps xmm4, XMMWORD PTR L_real_notsign ; xmm4 <-- |x-1| + comiss xmm4, DWORD PTR L_real_threshold ; is |x-1| < 1/16? + jb Llogf_sse2_near_one ; if so, handle elsewhere + + ; F, Y + ; F is a number in [.5,1) scaled from the rounded mantissa bits computed + ; above by oring in the exponent of .5. + ; Y is all of the mantissa bits of X scaled to [.5,1.) similarly + shr eax, 16 ; shift eax to use as index + por xmm2, XMMWORD PTR L_real_half ; xmm2 <-- Y + por xmm1, XMMWORD PTR L_real_half ; xmm2 <-- F + lea r9, QWORD PTR __log_F_inv_dword + + + ; f = F - Y, r = f * inv + subss xmm1, xmm2 ; xmm1 <-- f + mulss xmm1, DWORD PTR [r9+rax*4] ; xmm1 <-- r = f*inv (tabled) + + movaps xmm2, xmm1 + movaps xmm0, xmm1 + + ; poly + mulss xmm2, DWORD PTR L_real_1_over_3 ; xmm2 <-- r/3 + mulss xmm0, xmm1 ; xmm0 <-- r^2 + addss xmm2, DWORD PTR L_real_1_over_2 + movaps xmm3, XMMWORD PTR L_real_log2_tail + + lea r9, QWORD PTR __log_128_tail + lea r10, QWORD PTR __log_128_lead + + mulss xmm2, xmm0 ; xmm2 <-- r^2 * (r/3 + 1/2) + mulss xmm3, xmm5 ; xmm3 <-- (m=xexp)*log2_tail + addss xmm1, xmm2 ; xmm1 <-- poly + + ; m*log(2) + log(G) - poly, where G is just 2*F + ; log(G) is precomputed to extra precision. + ; small pieces and large pieces are separated until the final add, + ; to preserve accuracy + movaps xmm0, XMMWORD PTR L_real_log2_lead + subss xmm3, xmm1 ; xmm3 <-- m*log2_tail - poly + mulss xmm0, xmm5 ; xmm0 <-- m*log1_lead + addss xmm3, DWORD PTR [r9+rax*4] ; xmm3 += log(G) tail + addss xmm0, DWORD PTR [r10+rax*4] ; xmm0 += log(G) lead + + addss xmm0, xmm3 ; xmm0 <-- m*log(2)+log(G)-poly + + StackDeallocate stack_size + ret + +ALIGN 16 +Llogf_sse2_near_one: + ; Computation of the log for x near one requires special techniques. + movaps xmm2, DWORD PTR L_real_two + subss xmm0, DWORD PTR L_real_one ; xmm0 <-- r = x - 1.0 + addss xmm2, xmm0 + movaps xmm1, xmm0 + divss xmm1, xmm2 ; xmm1 <-- u = r/(2.0+r) + movaps xmm4, xmm0 + mulss xmm4, xmm1 ; xmm4 <-- correction = r*u + addss xmm1, xmm1 ; xmm1 <-- u = 2.*u + movaps xmm2, xmm1 + mulss xmm2, xmm2 ; xmm2 <-- u^2 + + ; r2 = (u^3 * (ca_1 + u^2 * ca_2) - correction) + movaps xmm3, xmm1 + mulss xmm3, xmm2 ; xmm3 <-- u^3 + mulss xmm2, DWORD PTR L_real_ca2 ; xmm2 <-- ca2*u^2 + addss xmm2, DWORD PTR L_real_ca1 ; xmm2 <-- ca2*u^2 + ca1 + mulss xmm2, xmm3 ; xmm2 <-- u^3*(ca1+u^2*ca2) + subss xmm2, xmm4 ; xmm2 <-- r2 + + ; return r + r2 + addss xmm0, xmm2 + StackDeallocate stack_size + ret + +ALIGN 16 +Llogf_sse2_subnormal_adjust: + ; This code adjusts eax and xmm5. + ; It must preserve xmm4. + por xmm2, XMMWORD PTR L_real_one + subss xmm2, DWORD PTR L_real_one + movdqa xmm5, xmm2 + pand xmm2, XMMWORD PTR L_real_mant + movd eax, xmm2 + psrld xmm5, 23 + psubd xmm5, XMMWORD PTR L_mask_253 + cvtdq2ps xmm5, xmm5 + jmp Llogf_sse2_continue_common + +; Until we get to the FMA3 code, the rest of this is special case handling. 
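+; Each handler below places a default result in xmm1 and one of the L_flag_*
+; codes in r8d, then calls fname_special (_logf_special, defined in
+; log_special.c earlier in this patch), which applies the matching
+; _handle_errorf treatment (exception flags and errno) and returns the result.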
+Llogf_sse2_x_is_zero_or_neg: + jne Llogf_sse2_x_is_neg + + movaps xmm1, XMMWORD PTR L_real_ninf + mov r8d, DWORD PTR L_flag_x_zero + call fname_special + jmp Llogf_sse2_finish + +Llogf_sse2_x_is_neg: + + movaps xmm1, XMMWORD PTR L_real_neg_qnan + mov r8d, DWORD PTR L_flag_x_neg + call fname_special + jmp Llogf_sse2_finish + +Llogf_sse2_x_is_inf_or_nan: + + cmp eax, DWORD PTR L_real_inf + je Llogf_sse2_finish + + cmp eax, DWORD PTR L_real_ninf + je Llogf_sse2_x_is_neg + + or eax, DWORD PTR L_real_qnanbit + movd xmm1, eax + mov r8d, DWORD PTR L_flag_x_nan + call fname_special + jmp Llogf_sse2_finish + +Llogf_sse2_finish: + StackDeallocate stack_size + ret + +ALIGN 16 +Llogf_fma3: + ; compute exponent part + vmovaps xmm4,XMMWORD PTR L_real_inf ; preload for inf/nan test + xor rax,rax + vpsrld xmm3,xmm0,23 ; xmm3 <-- (ux>>23) + vmovd eax,xmm0 ;eax = x + vpsubd xmm3,xmm3,DWORD PTR L_mask_127 ; xmm3 <-- (ux>>23) - 127 + vcvtdq2ps xmm5,xmm3 ; xmm5 <-- float((ux>>23)-127) = xexp + + ; NaN or inf + vpand xmm1,xmm0,xmm4 ; xmm1 <-- (ux & 07f800000h) + vcomiss xmm1,xmm4 + je Llogf_fma3_x_is_inf_or_nan + + ; check for negative numbers or zero + vpxor xmm1,xmm1,xmm1 + vcomiss xmm0,xmm1 + jbe Llogf_fma3_x_is_zero_or_neg + + vpand xmm2,xmm0,DWORD PTR L_real_mant ; xmm2 <-- ux & 0007FFFFFh + vsubss xmm4,xmm0,DWORD PTR L_real_one ; xmm4 <-- x - 1.0 + + vcomiss xmm5,DWORD PTR L_real_neg127 + je Llogf_fma3_subnormal_adjust + +Llogf_fma3_continue_common: + + ; compute the index into the log tables + vpand xmm1,xmm0,DWORD PTR L_mask_mant_all7 ; xmm1 = ux & 0007f0000h + vpand xmm3,xmm0,DWORD PTR L_mask_mant8 ; xmm3 = ux & 000008000h + vpslld xmm3,xmm3,1 ; xmm3 = (ux & 000008000h) << 1 + vpaddd xmm1,xmm3,xmm1 + ; eax = (ux & 0007f0000h) + ((ux & 000008000h) << 1) + ; eax <-- x/127., rounded to nearest + vmovd eax,xmm1 + + ; near one codepath + vandps xmm4,xmm4,DWORD PTR L_real_notsign ; xmm4 <-- fabs (x - 1.0) + vcomiss xmm4,DWORD PTR L_real_threshold + jb Llogf_fma3_near_one + + ; F,Y + shr eax,16 + vpor xmm2,xmm2,DWORD PTR L_real_half ; xmm2 <-- Y + vpor xmm1,xmm1,DWORD PTR L_real_half ; xmm1 <-- F + lea r9,QWORD PTR __log_F_inv_dword + + ; f = F - Y + vsubss xmm1,xmm1,xmm2 ; f = F - Y + ; r = f * log_F_inv_dword[index] + vmulss xmm1,xmm1,DWORD PTR [r9 + rax * 4] + + ; poly + vmovaps xmm2,XMMWORD PTR L_real_1_over_3 + vfmadd213ss xmm2,xmm1,DWORD PTR L_real_1_over_2 ; 1/3*r + 1/2 + vmulss xmm0,xmm1,xmm1 ; r*r + vmovaps xmm3,DWORD PTR L_real_log2_tail; + + lea r9,DWORD PTR __log_128_tail + lea r10,DWORD PTR __log_128_lead + + vfmadd231ss xmm1,xmm2,xmm0 ; poly = r + 1/2*r*r + 1/3*r*r*r + vfmsub213ss xmm3,xmm5,xmm1 ; (xexp * log2_tail) - poly + + ; m*log(2) + log(G) - poly + vmovaps xmm0,DWORD PTR L_real_log2_lead + vfmadd213ss xmm0,xmm5,[r10 + rax * 4] + ; z2 = (xexp * log2_tail) - poly + log_128_tail[index] + vaddss xmm3,xmm3,DWORD PTR [r9 + rax * 4] + vaddss xmm0,xmm0,xmm3 ; return z1 + z2 + + StackDeallocate stack_size + ret + +ALIGN 16 +Llogf_fma3_near_one: + ; r = x - 1.0; + vmovaps xmm2,DWORD PTR L_real_two + vsubss xmm0,xmm0,DWORD PTR L_real_one ; xmm0 = r = = x - 1.0 + + ; u = r / (2.0 + r) + vaddss xmm2,xmm2,xmm0 ; (r+2.0) + vdivss xmm1,xmm0,xmm2 ; u = r / (2.0 + r) + + ; correction = r * u + vmulss xmm4,xmm0,xmm1 ; correction = u*r + + ; u = u + u; + vaddss xmm1,xmm1,xmm1 ; u = u+u + vmulss xmm2,xmm1,xmm1 ; v = u^2 + + ; r2 = (u * v * (ca_1 + v * ca_2) - correction) + vmulss xmm3,xmm1,xmm2 ; u^3 + vmovaps xmm5,DWORD PTR L_real_ca2 + vfmadd213ss xmm2,xmm5,DWORD PTR L_real_ca1 + vfmsub213ss xmm2,xmm3,xmm4 
; r2 = (ca1 + ca2 * v) * u^3 - correction + + ; r + r2 + vaddss xmm0,xmm0,xmm2 + StackDeallocate stack_size + ret + + +ALIGN 16 +Llogf_fma3_subnormal_adjust: + vmovaps xmm3,DWORD PTR L_real_one + vpor xmm2,xmm2,xmm3 ; xmm2 = temp = ((ux &0007FFFFFh) | 03f800000h) + vsubss xmm2,xmm2,xmm3 ; xmm2 = temp -1.0 + vpsrld xmm5,xmm2,23 ; xmm5 = (utemp >> 23) + vpand xmm2,xmm2,DWORD PTR L_real_mant ; xmm2 = (utemp & 0007FFFFFh) + vmovaps xmm0,xmm2 + vpsubd xmm5,xmm5,DWORD PTR L_mask_253 ; xmm5 = (utemp >> 23) - 253 + vcvtdq2ps xmm5,xmm5 ; xmm5 = (float) ((utemp >> 23) - 253) + jmp Llogf_fma3_continue_common + +Llogf_fma3_x_is_zero_or_neg: + jne Llogf_fma3_x_is_neg + + vmovaps xmm1,DWORD PTR L_real_ninf + mov r8d,DWORD PTR L_flag_x_zero + call fname_special + + StackDeallocate stack_size + ret + + +Llogf_fma3_x_is_neg: + + vmovaps xmm1,DWORD PTR L_real_neg_qnan + mov r8d,DWORD PTR L_flag_x_neg + call fname_special + + StackDeallocate stack_size + ret + +Llogf_fma3_x_is_inf_or_nan: + + cmp eax,DWORD PTR L_real_inf + je Llogf_fma3_finish + + cmp eax,DWORD PTR L_real_ninf + je Llogf_fma3_x_is_neg + + or eax,DWORD PTR L_real_qnanbit + vmovd xmm1,eax + mov r8d,DWORD PTR L_flag_x_nan + call fname_special + + StackDeallocate stack_size + ret + +Llogf_fma3_finish: + + StackDeallocate stack_size + ret + + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/modf.c b/sdk/lib/crt/math/libm_sse2/modf.c new file mode 100644 index 00000000000..0b0900cf66e --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/modf.c @@ -0,0 +1,76 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +double modf(double x, double *iptr) +{ + /* modf splits the argument x into integer and fraction parts, + each with the same sign as x. */ + + + long xexp; + unsigned long ux, ax, mask; + + GET_BITS_DP64(x, ux); + ax = ux & (~SIGNBIT_DP64); + + if (ax >= 0x4340000000000000) + { + /* abs(x) is either NaN, infinity, or >= 2^53 */ + if (ax > 0x7ff0000000000000) + { + /* x is NaN */ + *iptr = x; + return x + x; /* Raise invalid if it is a signalling NaN */ + } + else + { + /* x is infinity or large. Return zero with the sign of x */ + *iptr = x; + PUT_BITS_DP64(ux & SIGNBIT_DP64, x); + return x; + } + } + else if (ax < 0x3ff0000000000000) + { + /* abs(x) < 1.0. 
Set iptr to zero with the sign of x + and return x. */ + PUT_BITS_DP64(ux & SIGNBIT_DP64, *iptr); + return x; + } + else + { + xexp = ((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; + /* Mask out the bits of x that we don't want */ + mask = 1; + mask = (mask << (EXPSHIFTBITS_DP64 - xexp)) - 1; + PUT_BITS_DP64(ux & ~mask, *iptr); + return x - *iptr; + } + +} diff --git a/sdk/lib/crt/math/libm_sse2/modff.c b/sdk/lib/crt/math/libm_sse2/modff.c new file mode 100644 index 00000000000..023689e2e01 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/modff.c @@ -0,0 +1,70 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +float modff(float x, float *iptr) +{ + /* modff splits the argument x into integer and fraction parts, + each with the same sign as x. */ + + unsigned int ux, mask; + int xexp; + + GET_BITS_SP32(x, ux); + xexp = ((ux & (~SIGNBIT_SP32)) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + + if (xexp < 0) + { + /* abs(x) < 1.0. Set iptr to zero with the sign of x + and return x. */ + PUT_BITS_SP32(ux & SIGNBIT_SP32, *iptr); + return x; + } + else if (xexp < EXPSHIFTBITS_SP32) + { + /* x lies between 1.0 and 2**(24) */ + /* Mask out the bits of x that we don't want */ + mask = (1 << (EXPSHIFTBITS_SP32 - xexp)) - 1; + PUT_BITS_SP32(ux & ~mask, *iptr); + return x - *iptr; + } + else if ((ux & (~SIGNBIT_SP32)) > 0x7f800000) + { + /* x is NaN */ + *iptr = x; + return x + x; /* Raise invalid if it is a signalling NaN */ + } + else + { + /* x is infinity or large. Set iptr to x and return zero + with the sign of x. */ + *iptr = x; + PUT_BITS_SP32(ux & SIGNBIT_SP32, x); + return x; + } +} diff --git a/sdk/lib/crt/math/libm_sse2/pow.asm b/sdk/lib/crt/math/libm_sse2/pow.asm new file mode 100644 index 00000000000..65038c705b0 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/pow.asm @@ -0,0 +1,2411 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
+; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; pow.asm +; +; An implementation of the pow libm function. +; +; Prototype: +; +; double pow(double x, double y); +; + +; +; Algorithm: +; x^y = e^(y*ln(x)) +; +; Look in exp, log for the respective algorithms +; + +.const + +ALIGN 16 + +; these codes and the ones in the corresponding .c file have to match +__flag_x_one_y_snan DD 00000001 +__flag_x_zero_z_inf DD 00000002 +__flag_x_nan DD 00000003 +__flag_y_nan DD 00000004 +__flag_x_nan_y_nan DD 00000005 +__flag_x_neg_y_notint DD 00000006 +__flag_z_zero DD 00000007 +__flag_z_denormal DD 00000008 +__flag_z_inf DD 00000009 + +ALIGN 16 + +__ay_max_bound DQ 43e0000000000000h +__ay_min_bound DQ 3c00000000000000h +__sign_mask DQ 8000000000000000h +__sign_and_exp_mask DQ 0fff0000000000000h +__exp_mask DQ 7ff0000000000000h +__neg_inf DQ 0fff0000000000000h +__pos_inf DQ 7ff0000000000000h +__pos_one DQ 3ff0000000000000h +__pos_zero DQ 0000000000000000h +__exp_mant_mask DQ 7fffffffffffffffh +__mant_mask DQ 000fffffffffffffh +__ind_pattern DQ 0fff8000000000000h + + +__neg_qnan DQ 0fff8000000000000h +__qnan DQ 7ff8000000000000h +__qnan_set DQ 0008000000000000h + +__neg_one DQ 0bff0000000000000h +__neg_zero DQ 8000000000000000h + +__exp_shift DQ 0000000000000034h ; 52 +__exp_bias DQ 00000000000003ffh ; 1023 +__exp_bias_m1 DQ 00000000000003feh ; 1022 + +__yexp_53 DQ 0000000000000035h ; 53 +__mant_full DQ 000fffffffffffffh +__1_before_mant DQ 0010000000000000h + +__mask_mant_all8 DQ 000ff00000000000h +__mask_mant9 DQ 0000080000000000h + + + +ALIGN 16 +__real_fffffffff8000000 DQ 0fffffffff8000000h + DQ 0fffffffff8000000h + +__mask_8000000000000000 DQ 8000000000000000h + DQ 8000000000000000h + +__real_4090040000000000 DQ 4090040000000000h + DQ 4090040000000000h + +__real_C090C80000000000 DQ 0C090C80000000000h + DQ 0C090C80000000000h + +;--------------------- +; log data +;--------------------- + +ALIGN 16 + +__real_ninf DQ 0fff0000000000000h ; -inf + DQ 0000000000000000h +__real_inf DQ 7ff0000000000000h ; +inf + DQ 0000000000000000h +__real_nan DQ 7ff8000000000000h ; NaN + DQ 0000000000000000h +__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits + DQ 0000000000000000h +__mask_1023 DQ 00000000000003ffh + DQ 0000000000000000h +__mask_001 DQ 0000000000000001h + DQ 0000000000000000h + +__real_log2_lead DQ 3fe62e42e0000000h ; log2_lead 6.93147122859954833984e-01 + DQ 0000000000000000h +__real_log2_tail DQ 3e6efa39ef35793ch ; log2_tail 5.76999904754328540596e-08 + DQ 0000000000000000h + +__real_two DQ 
4000000000000000h ; 2 + DQ 0000000000000000h + +__real_one DQ 3ff0000000000000h ; 1 + DQ 0000000000000000h + +__real_half DQ 3fe0000000000000h ; 1/2 + DQ 0000000000000000h + +__mask_100 DQ 0000000000000100h + DQ 0000000000000000h + +__real_1_over_2 DQ 3fe0000000000000h + DQ 0000000000000000h +__real_1_over_3 DQ 3fd5555555555555h + DQ 0000000000000000h +__real_1_over_4 DQ 3fd0000000000000h + DQ 0000000000000000h +__real_1_over_5 DQ 3fc999999999999ah + DQ 0000000000000000h +__real_1_over_6 DQ 3fc5555555555555h + DQ 0000000000000000h +__real_1_over_7 DQ 3fc2492492492494h + DQ 0000000000000000h + +__mask_1023_f DQ 0c08ff80000000000h + DQ 0000000000000000h + +__mask_2045 DQ 00000000000007fdh + DQ 0000000000000000h + +__real_threshold DQ 3fc0000000000000h ; 0.125 + DQ 3fc0000000000000h + +__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit + DQ 0000000000000000h + + +EXTRN __log_256_lead:QWORD +EXTRN __log_256_tail:QWORD +EXTRN __use_fma3_lib:DWORD + +; This table differs from the tables in log_256_lead_tail_table.asm: +; the heads have fewer significant bits (hence the tails also differ). +ALIGN 16 +__log_F_inv_head DQ 4000000000000000h + DQ 3fffe00000000000h + DQ 3fffc00000000000h + DQ 3fffa00000000000h + DQ 3fff800000000000h + DQ 3fff600000000000h + DQ 3fff400000000000h + DQ 3fff200000000000h + DQ 3fff000000000000h + DQ 3ffee00000000000h + DQ 3ffec00000000000h + DQ 3ffea00000000000h + DQ 3ffe900000000000h + DQ 3ffe700000000000h + DQ 3ffe500000000000h + DQ 3ffe300000000000h + DQ 3ffe100000000000h + DQ 3ffe000000000000h + DQ 3ffde00000000000h + DQ 3ffdc00000000000h + DQ 3ffda00000000000h + DQ 3ffd900000000000h + DQ 3ffd700000000000h + DQ 3ffd500000000000h + DQ 3ffd400000000000h + DQ 3ffd200000000000h + DQ 3ffd000000000000h + DQ 3ffcf00000000000h + DQ 3ffcd00000000000h + DQ 3ffcb00000000000h + DQ 3ffca00000000000h + DQ 3ffc800000000000h + DQ 3ffc700000000000h + DQ 3ffc500000000000h + DQ 3ffc300000000000h + DQ 3ffc200000000000h + DQ 3ffc000000000000h + DQ 3ffbf00000000000h + DQ 3ffbd00000000000h + DQ 3ffbc00000000000h + DQ 3ffba00000000000h + DQ 3ffb900000000000h + DQ 3ffb700000000000h + DQ 3ffb600000000000h + DQ 3ffb400000000000h + DQ 3ffb300000000000h + DQ 3ffb200000000000h + DQ 3ffb000000000000h + DQ 3ffaf00000000000h + DQ 3ffad00000000000h + DQ 3ffac00000000000h + DQ 3ffaa00000000000h + DQ 3ffa900000000000h + DQ 3ffa800000000000h + DQ 3ffa600000000000h + DQ 3ffa500000000000h + DQ 3ffa400000000000h + DQ 3ffa200000000000h + DQ 3ffa100000000000h + DQ 3ffa000000000000h + DQ 3ff9e00000000000h + DQ 3ff9d00000000000h + DQ 3ff9c00000000000h + DQ 3ff9a00000000000h + DQ 3ff9900000000000h + DQ 3ff9800000000000h + DQ 3ff9700000000000h + DQ 3ff9500000000000h + DQ 3ff9400000000000h + DQ 3ff9300000000000h + DQ 3ff9200000000000h + DQ 3ff9000000000000h + DQ 3ff8f00000000000h + DQ 3ff8e00000000000h + DQ 3ff8d00000000000h + DQ 3ff8b00000000000h + DQ 3ff8a00000000000h + DQ 3ff8900000000000h + DQ 3ff8800000000000h + DQ 3ff8700000000000h + DQ 3ff8600000000000h + DQ 3ff8400000000000h + DQ 3ff8300000000000h + DQ 3ff8200000000000h + DQ 3ff8100000000000h + DQ 3ff8000000000000h + DQ 3ff7f00000000000h + DQ 3ff7e00000000000h + DQ 3ff7d00000000000h + DQ 3ff7b00000000000h + DQ 3ff7a00000000000h + DQ 3ff7900000000000h + DQ 3ff7800000000000h + DQ 3ff7700000000000h + DQ 3ff7600000000000h + DQ 3ff7500000000000h + DQ 3ff7400000000000h + DQ 3ff7300000000000h + DQ 3ff7200000000000h + DQ 3ff7100000000000h + DQ 3ff7000000000000h + DQ 3ff6f00000000000h + DQ 3ff6e00000000000h + DQ 3ff6d00000000000h + DQ 3ff6c00000000000h + DQ 
3ff6b00000000000h + DQ 3ff6a00000000000h + DQ 3ff6900000000000h + DQ 3ff6800000000000h + DQ 3ff6700000000000h + DQ 3ff6600000000000h + DQ 3ff6500000000000h + DQ 3ff6400000000000h + DQ 3ff6300000000000h + DQ 3ff6200000000000h + DQ 3ff6100000000000h + DQ 3ff6000000000000h + DQ 3ff5f00000000000h + DQ 3ff5e00000000000h + DQ 3ff5d00000000000h + DQ 3ff5c00000000000h + DQ 3ff5b00000000000h + DQ 3ff5a00000000000h + DQ 3ff5900000000000h + DQ 3ff5800000000000h + DQ 3ff5800000000000h + DQ 3ff5700000000000h + DQ 3ff5600000000000h + DQ 3ff5500000000000h + DQ 3ff5400000000000h + DQ 3ff5300000000000h + DQ 3ff5200000000000h + DQ 3ff5100000000000h + DQ 3ff5000000000000h + DQ 3ff5000000000000h + DQ 3ff4f00000000000h + DQ 3ff4e00000000000h + DQ 3ff4d00000000000h + DQ 3ff4c00000000000h + DQ 3ff4b00000000000h + DQ 3ff4a00000000000h + DQ 3ff4a00000000000h + DQ 3ff4900000000000h + DQ 3ff4800000000000h + DQ 3ff4700000000000h + DQ 3ff4600000000000h + DQ 3ff4600000000000h + DQ 3ff4500000000000h + DQ 3ff4400000000000h + DQ 3ff4300000000000h + DQ 3ff4200000000000h + DQ 3ff4200000000000h + DQ 3ff4100000000000h + DQ 3ff4000000000000h + DQ 3ff3f00000000000h + DQ 3ff3e00000000000h + DQ 3ff3e00000000000h + DQ 3ff3d00000000000h + DQ 3ff3c00000000000h + DQ 3ff3b00000000000h + DQ 3ff3b00000000000h + DQ 3ff3a00000000000h + DQ 3ff3900000000000h + DQ 3ff3800000000000h + DQ 3ff3800000000000h + DQ 3ff3700000000000h + DQ 3ff3600000000000h + DQ 3ff3500000000000h + DQ 3ff3500000000000h + DQ 3ff3400000000000h + DQ 3ff3300000000000h + DQ 3ff3200000000000h + DQ 3ff3200000000000h + DQ 3ff3100000000000h + DQ 3ff3000000000000h + DQ 3ff3000000000000h + DQ 3ff2f00000000000h + DQ 3ff2e00000000000h + DQ 3ff2e00000000000h + DQ 3ff2d00000000000h + DQ 3ff2c00000000000h + DQ 3ff2b00000000000h + DQ 3ff2b00000000000h + DQ 3ff2a00000000000h + DQ 3ff2900000000000h + DQ 3ff2900000000000h + DQ 3ff2800000000000h + DQ 3ff2700000000000h + DQ 3ff2700000000000h + DQ 3ff2600000000000h + DQ 3ff2500000000000h + DQ 3ff2500000000000h + DQ 3ff2400000000000h + DQ 3ff2300000000000h + DQ 3ff2300000000000h + DQ 3ff2200000000000h + DQ 3ff2100000000000h + DQ 3ff2100000000000h + DQ 3ff2000000000000h + DQ 3ff2000000000000h + DQ 3ff1f00000000000h + DQ 3ff1e00000000000h + DQ 3ff1e00000000000h + DQ 3ff1d00000000000h + DQ 3ff1c00000000000h + DQ 3ff1c00000000000h + DQ 3ff1b00000000000h + DQ 3ff1b00000000000h + DQ 3ff1a00000000000h + DQ 3ff1900000000000h + DQ 3ff1900000000000h + DQ 3ff1800000000000h + DQ 3ff1800000000000h + DQ 3ff1700000000000h + DQ 3ff1600000000000h + DQ 3ff1600000000000h + DQ 3ff1500000000000h + DQ 3ff1500000000000h + DQ 3ff1400000000000h + DQ 3ff1300000000000h + DQ 3ff1300000000000h + DQ 3ff1200000000000h + DQ 3ff1200000000000h + DQ 3ff1100000000000h + DQ 3ff1100000000000h + DQ 3ff1000000000000h + DQ 3ff0f00000000000h + DQ 3ff0f00000000000h + DQ 3ff0e00000000000h + DQ 3ff0e00000000000h + DQ 3ff0d00000000000h + DQ 3ff0d00000000000h + DQ 3ff0c00000000000h + DQ 3ff0c00000000000h + DQ 3ff0b00000000000h + DQ 3ff0a00000000000h + DQ 3ff0a00000000000h + DQ 3ff0900000000000h + DQ 3ff0900000000000h + DQ 3ff0800000000000h + DQ 3ff0800000000000h + DQ 3ff0700000000000h + DQ 3ff0700000000000h + DQ 3ff0600000000000h + DQ 3ff0600000000000h + DQ 3ff0500000000000h + DQ 3ff0500000000000h + DQ 3ff0400000000000h + DQ 3ff0400000000000h + DQ 3ff0300000000000h + DQ 3ff0300000000000h + DQ 3ff0200000000000h + DQ 3ff0200000000000h + DQ 3ff0100000000000h + DQ 3ff0100000000000h + DQ 3ff0000000000000h + DQ 3ff0000000000000h + +ALIGN 16 +__log_F_inv_tail DQ 0000000000000000h + DQ 
3effe01fe01fe020h + DQ 3f1fc07f01fc07f0h + DQ 3f31caa01fa11caah + DQ 3f3f81f81f81f820h + DQ 3f48856506ddaba6h + DQ 3f5196792909c560h + DQ 3f57d9108c2ad433h + DQ 3f5f07c1f07c1f08h + DQ 3f638ff08b1c03ddh + DQ 3f680f6603d980f6h + DQ 3f6d00f57403d5d0h + DQ 3f331abf0b7672a0h + DQ 3f506a965d43919bh + DQ 3f5ceb240795ceb2h + DQ 3f6522f3b834e67fh + DQ 3f6c3c3c3c3c3c3ch + DQ 3f3e01e01e01e01eh + DQ 3f575b8fe21a291ch + DQ 3f6403b9403b9404h + DQ 3f6cc0ed7303b5cch + DQ 3f479118f3fc4da2h + DQ 3f5ed952e0b0ce46h + DQ 3f695900eae56404h + DQ 3f3d41d41d41d41dh + DQ 3f5cb28ff16c69aeh + DQ 3f696b1edd80e866h + DQ 3f4372e225fe30d9h + DQ 3f60ad12073615a2h + DQ 3f6cdb2c0397cdb3h + DQ 3f52cc157b864407h + DQ 3f664cb5f7148404h + DQ 3f3c71c71c71c71ch + DQ 3f6129a21a930b84h + DQ 3f6f1e0387f1e038h + DQ 3f5ad4e4ba80709bh + DQ 3f6c0e070381c0e0h + DQ 3f560fba1a362bb0h + DQ 3f6a5713280dee96h + DQ 3f53f59620f9ece9h + DQ 3f69f22983759f23h + DQ 3f5478ac63fc8d5ch + DQ 3f6ad87bb4671656h + DQ 3f578b8efbb8148ch + DQ 3f6d0369d0369d03h + DQ 3f5d212b601b3748h + DQ 3f0b2036406c80d9h + DQ 3f629663b24547d1h + DQ 3f4435e50d79435eh + DQ 3f67d0ff2920bc03h + DQ 3f55c06b15c06b16h + DQ 3f6e3a5f0fd7f954h + DQ 3f61dec0d4c77b03h + DQ 3f473289870ac52eh + DQ 3f6a034da034da03h + DQ 3f5d041da2292856h + DQ 3f3a41a41a41a41ah + DQ 3f68550f8a39409dh + DQ 3f5b4fe5e92c0686h + DQ 3f3a01a01a01a01ah + DQ 3f691d2a2067b23ah + DQ 3f5e7c5dada0b4e5h + DQ 3f468a7725080ce1h + DQ 3f6c49d4aa21b490h + DQ 3f63333333333333h + DQ 3f54bc363b03fccfh + DQ 3f2c9f01970e4f81h + DQ 3f697617c6ef5b25h + DQ 3f6161f9add3c0cah + DQ 3f5319fe6cb39806h + DQ 3f2f693a1c451ab3h + DQ 3f6a9e240321a9e2h + DQ 3f63831f3831f383h + DQ 3f5949ebc4dcfc1ch + DQ 3f480c6980c6980ch + DQ 3f6f9d00c5fe7403h + DQ 3f69721ed7e75347h + DQ 3f6381ec0313381fh + DQ 3f5b97c2aec12653h + DQ 3f509ef3024ae3bah + DQ 3f38618618618618h + DQ 3f6e0184f00c2780h + DQ 3f692ef5657dba52h + DQ 3f64940305494030h + DQ 3f60303030303030h + DQ 3f58060180601806h + DQ 3f5017f405fd017fh + DQ 3f412a8ad278e8ddh + DQ 3f17d05f417d05f4h + DQ 3f6d67245c02f7d6h + DQ 3f6a4411c1d986a9h + DQ 3f6754d76c7316dfh + DQ 3f649902f149902fh + DQ 3f621023358c1a68h + DQ 3f5f7390d2a6c406h + DQ 3f5b2b0805d5b2b1h + DQ 3f5745d1745d1746h + DQ 3f53c31507fa32c4h + DQ 3f50a1fd1b7af017h + DQ 3f4bc36ce3e0453ah + DQ 3f4702e05c0b8170h + DQ 3f4300b79300b793h + DQ 3f3f76b4337c6cb1h + DQ 3f3a62681c860fb0h + DQ 3f36c16c16c16c17h + DQ 3f3490aa31a3cfc7h + DQ 3f33cd153729043eh + DQ 3f3473a88d0bfd2eh + DQ 3f36816816816817h + DQ 3f39f36016719f36h + DQ 3f3ec6a5122f9016h + DQ 3f427c29da5519cfh + DQ 3f4642c8590b2164h + DQ 3f4ab5c45606f00bh + DQ 3f4fd3b80b11fd3ch + DQ 3f52cda0c6ba4eaah + DQ 3f56058160581606h + DQ 3f5990d0a4b7ef87h + DQ 3f5d6ee340579d6fh + DQ 3f60cf87d9c54a69h + DQ 3f6310572620ae4ch + DQ 3f65798c8ff522a2h + DQ 3f680ad602b580adh + DQ 3f6ac3e24799546fh + DQ 3f6da46102b1da46h + DQ 3f15805601580560h + DQ 3f3ed3c506b39a23h + DQ 3f4cbdd3e2970f60h + DQ 3f55555555555555h + DQ 3f5c979aee0bf805h + DQ 3f621291e81fd58eh + DQ 3f65fead500a9580h + DQ 3f6a0fd5c5f02a3ah + DQ 3f6e45c223898adch + DQ 3f35015015015015h + DQ 3f4c7b16ea64d422h + DQ 3f57829cbc14e5e1h + DQ 3f60877db8589720h + DQ 3f65710e4b5edceah + DQ 3f6a7dbb4d1fc1c8h + DQ 3f6fad40a57eb503h + DQ 3f43fd6bb00a5140h + DQ 3f54e78ecb419ba9h + DQ 3f600a44029100a4h + DQ 3f65c28f5c28f5c3h + DQ 3f6b9c68b2c0cc4ah + DQ 3f2978feb9f34381h + DQ 3f4ecf163bb6500ah + DQ 3f5be1958b67ebb9h + DQ 3f644e6157dc9a3bh + DQ 3f6acc4baa3f0ddfh + DQ 3f26a4cbcb2a247bh + DQ 3f50505050505050h + DQ 3f5e0b4439959819h + DQ 3f66027f6027f602h + DQ 
3f6d1e854b5e0db4h + DQ 3f4165e7254813e2h + DQ 3f576646a9d716efh + DQ 3f632b48f757ce88h + DQ 3f6ac1b24652a906h + DQ 3f33b13b13b13b14h + DQ 3f5490e1eb208984h + DQ 3f62385830fec66eh + DQ 3f6a45a6cc111b7eh + DQ 3f33813813813814h + DQ 3f556f472517b708h + DQ 3f631be7bc0e8f2ah + DQ 3f6b9cbf3e55f044h + DQ 3f40e7d95bc609a9h + DQ 3f59e6b3804d19e7h + DQ 3f65c8b6af7963c2h + DQ 3f6eb9dad43bf402h + DQ 3f4f1a515885fb37h + DQ 3f60eeb1d3d76c02h + DQ 3f6a320261a32026h + DQ 3f3c82ac40260390h + DQ 3f5a12f684bda12fh + DQ 3f669d43fda2962ch + DQ 3f02e025c04b8097h + DQ 3f542804b542804bh + DQ 3f63f69b02593f6ah + DQ 3f6df31cb46e21fah + DQ 3f5012b404ad012bh + DQ 3f623925e7820a7fh + DQ 3f6c8253c8253c82h + DQ 3f4b92ddc02526e5h + DQ 3f61602511602511h + DQ 3f6bf471439c9adfh + DQ 3f4a85c40939a85ch + DQ 3f6166f9ac024d16h + DQ 3f6c44e10125e227h + DQ 3f4cebf48bbd90e5h + DQ 3f62492492492492h + DQ 3f6d6f2e2ec0b673h + DQ 3f5159e26af37c05h + DQ 3f64024540245402h + DQ 3f6f6f0243f6f024h + DQ 3f55e60121579805h + DQ 3f668e18cf81b10fh + DQ 3f32012012012012h + DQ 3f5c11f7047dc11fh + DQ 3f69e878ff70985eh + DQ 3f4779d9fdc3a219h + DQ 3f61eace5c957907h + DQ 3f6e0d5b450239e1h + DQ 3f548bf073816367h + DQ 3f6694808dda5202h + DQ 3f37c67f2bae2b21h + DQ 3f5ee58469ee5847h + DQ 3f6c0233c0233c02h + DQ 3f514e02328a7012h + DQ 3f6561072057b573h + DQ 3f31811811811812h + DQ 3f5e28646f5a1060h + DQ 3f6c0d1284e6f1d7h + DQ 3f523543f0c80459h + DQ 3f663cbeea4e1a09h + DQ 3f3b9a3fdd5c8cb8h + DQ 3f60be1c159a76d2h + DQ 3f6e1d1a688e4838h + DQ 3f572044d72044d7h + DQ 3f691713db81577bh + DQ 3f4ac73ae9819b50h + DQ 3f6460334e904cf6h + DQ 3f31111111111111h + DQ 3f5feef80441fef0h + DQ 3f6de021fde021feh + DQ 3f57b7eacc9686a0h + DQ 3f69ead7cd391fbch + DQ 3f50195609804390h + DQ 3f6641511e8d2b32h + DQ 3f4222b1acf1ce96h + DQ 3f62e29f79b47582h + DQ 3f24f0d1682e11cdh + DQ 3f5f9bb096771e4dh + DQ 3f6e5ee45dd96ae2h + DQ 3f5a0429a0429a04h + DQ 3f6bb74d5f06c021h + DQ 3f54fce404254fceh + DQ 3f695766eacbc402h + DQ 3f50842108421084h + DQ 3f673e5371d5c338h + DQ 3f4930523fbe3368h + DQ 3f656b38f225f6c4h + DQ 3f426e978d4fdf3bh + DQ 3f63dd40e4eb0cc6h + DQ 3f397f7d73404146h + DQ 3f6293982cc98af1h + DQ 3f30410410410410h + DQ 3f618d6f048ff7e4h + DQ 3f2236a3ebc349deh + DQ 3f60c9f8ee53d18ch + DQ 3f10204081020408h + DQ 3f60486ca2f46ea6h + DQ 3ef0101010101010h + DQ 3f60080402010080h + DQ 0000000000000000h + +;--------------------- +; exp data +;--------------------- + +ALIGN 16 + +__denormal_threshold DD 0fffffc02h ; -1022 + DD 0 + DQ 0 + +__enable_almost_inf DQ 7fe0000000000000h + DQ 0 + +__real_zero DQ 0000000000000000h + DQ 0 + +__real_smallest_denormal DQ 0000000000000001h + DQ 0 +__denormal_tiny_threshold DQ 0c0874046dfefd9d0h + DQ 0 + +__real_p65536 DQ 40f0000000000000h ; 65536 + DQ 0 +__real_m68800 DQ 0c0f0cc0000000000h ; -68800 + DQ 0 +__real_64_by_log2 DQ 40571547652b82feh ; 64/ln(2) + DQ 0 +__real_log2_by_64_head DQ 3f862e42f0000000h ; log2_by_64_head + DQ 0 +__real_log2_by_64_tail DQ 0bdfdf473de6af278h ; -log2_by_64_tail + DQ 0 +__real_1_by_720 DQ 3f56c16c16c16c17h ; 1/720 + DQ 0 +__real_1_by_120 DQ 3f81111111111111h ; 1/120 + DQ 0 +__real_1_by_24 DQ 3fa5555555555555h ; 1/24 + DQ 0 +__real_1_by_6 DQ 3fc5555555555555h ; 1/6 + DQ 0 +__real_1_by_2 DQ 3fe0000000000000h ; 1/2 + DQ 0 + + +EXTRN __two_to_jby64_head_table:QWORD +EXTRN __two_to_jby64_tail_table:QWORD +EXTRN __use_fma3_lib:DWORD + +fname TEXTEQU +fname_special TEXTEQU <_pow_special> + +; define local variable storage offsets + +save_x EQU 10h +save_y EQU 20h +p_temp_exp EQU 30h +negate_result EQU 40h +save_ax EQU 50h +y_head 
EQU 60h +p_temp_log EQU 70h +save_xmm6 EQU 080h +save_xmm7 EQU 090h +dummy_space EQU 0a0h + +stack_size EQU 0c8h + +include fm.inc + +; external function +EXTERN fname_special:PROC + +.code +ALIGN 16 +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + SaveXmm xmm6, save_xmm6 + SaveXmm xmm7, save_xmm7 + .ENDPROLOG + cmp DWORD PTR __use_fma3_lib, 0 + jne Lpow_fma3 + +ALIGN 16 +Lpow_sse2: + movsd QWORD PTR [save_x+rsp], xmm0 + movsd QWORD PTR [save_y+rsp], xmm1 + + mov rdx, QWORD PTR [save_x+rsp] + mov r8, QWORD PTR [save_y+rsp] + + mov r10, QWORD PTR __exp_mant_mask + and r10, r8 + jz Lpow_sse2_y_is_zero + + cmp r8, QWORD PTR __pos_one + je Lpow_sse2_y_is_one + + mov r9, QWORD PTR __sign_mask + and r9, rdx + mov rax, QWORD PTR __pos_zero + mov QWORD PTR [negate_result+rsp], rax + cmp r9, QWORD PTR __sign_mask + je Lpow_sse2_x_is_neg + + cmp rdx, QWORD PTR __pos_one + je Lpow_sse2_x_is_pos_one + + cmp rdx, QWORD PTR __pos_zero + je Lpow_sse2_x_is_zero + + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_sse2_x_is_inf_or_nan + + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __ay_max_bound + jg Lpow_sse2_ay_is_very_large + + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __ay_min_bound + jl Lpow_sse2_ay_is_very_small + + ; ----------------------------- + ; compute log(x) here + ; ----------------------------- +Lpow_sse2_log_x: + + ; compute exponent part + xor r8, r8 + movdqa xmm3, xmm0 + psrlq xmm3, 52 + movd r8, xmm0 + psubq xmm3, XMMWORD PTR __mask_1023 + movdqa xmm2, xmm0 + cvtdq2pd xmm6, xmm3 ; xexp + pand xmm2, XMMWORD PTR __real_mant + + comisd xmm6, QWORD PTR __mask_1023_f + je Lpow_sse2_denormal_adjust + +Lpow_sse2_continue_common: + + ; compute index into the log tables + movsd xmm7, xmm0 + mov r9, r8 + and r8, QWORD PTR __mask_mant_all8 + and r9, QWORD PTR __mask_mant9 + subsd xmm7, __real_one + shl r9, 1 + add r8, r9 + mov QWORD PTR [p_temp_log+rsp], r8 + andpd xmm7, __real_notsign + + ; F, Y, switch to near-one codepath + movsd xmm1, QWORD PTR [p_temp_log+rsp] + shr r8, 44 + por xmm2, XMMWORD PTR __real_half + por xmm1, XMMWORD PTR __real_half + lea r9, QWORD PTR __log_F_inv_head + lea rdx, QWORD PTR __log_F_inv_tail + comisd xmm7, __real_threshold + jb Lpow_sse2_near_one + + ; f = F - Y, r = f * inv + subsd xmm1, xmm2 + movsd xmm4, xmm1 + mulsd xmm1, QWORD PTR [r9+r8*8] + movsd xmm5, xmm1 + mulsd xmm4, QWORD PTR [rdx+r8*8] + movsd xmm7, xmm4 + addsd xmm1, xmm4 + + movsd xmm2, xmm1 + movsd xmm0, xmm1 + lea r9, __log_256_lead + + ; poly + movsd xmm3, QWORD PTR __real_1_over_6 + movsd xmm1, QWORD PTR __real_1_over_3 + mulsd xmm3, xmm2 + mulsd xmm1, xmm2 + mulsd xmm0, xmm2 + subsd xmm5, xmm2 + movsd xmm4, xmm0 + addsd xmm3, QWORD PTR __real_1_over_5 + addsd xmm1, QWORD PTR __real_1_over_2 + mulsd xmm4, xmm0 + mulsd xmm3, xmm2 + mulsd xmm1, xmm0 + addsd xmm3, QWORD PTR __real_1_over_4 + addsd xmm7, xmm5 + mulsd xmm3, xmm4 + addsd xmm1, xmm3 + addsd xmm1, xmm7 + + movsd xmm5, QWORD PTR __real_log2_tail + lea rdx, __log_256_tail + mulsd xmm5, xmm6 + movsd xmm0, QWORD PTR [r9+r8*8] + subsd xmm5, xmm1 + + movsd xmm3, QWORD PTR [rdx+r8*8] + addsd xmm3, xmm5 + movsd xmm1, xmm3 + subsd xmm3, xmm2 + + movsd xmm7, QWORD PTR __real_log2_lead + mulsd xmm7, xmm6 + addsd xmm0, xmm7 + + ; result of ln(x) is computed from head and tail parts, resH and resT + ; res = ln(x) = resH + resT + ; resH and resT are in full precision + + ; resT is computed from head and tail parts, resT_h and resT_t + ; resT = resT_h + resT_t + + 
; now + ; xmm3 - resT + ; xmm0 - resH + ; xmm1 - (resT_t) + ; xmm2 - (-resT_h) + +Lpow_sse2_log_x_continue: + + movsd xmm7, xmm0 + addsd xmm0, xmm3 + movsd xmm5, xmm0 + andpd xmm0, XMMWORD PTR __real_fffffffff8000000 + + ; xmm0 - H + ; xmm7 - resH + ; xmm5 - res + + mov rax, QWORD PTR [save_y+rsp] + and rax, QWORD PTR __real_fffffffff8000000 + + addsd xmm2, xmm3 + subsd xmm7, xmm5 + subsd xmm1, xmm2 + addsd xmm7, xmm3 + subsd xmm5, xmm0 + + mov QWORD PTR [y_head+rsp], rax + movsd xmm4, QWORD PTR [save_y+rsp] + + addsd xmm7, xmm1 + addsd xmm7, xmm5 + + ; res = H + T + ; H has leading 26 bits of precision + ; T has full precision + + ; xmm0 - H + ; xmm7 - T + + movsd xmm2, QWORD PTR [y_head+rsp] + subsd xmm4, xmm2 + + ; y is split into head and tail + ; for y * ln(x) computation + + ; xmm4 - Yt + ; xmm2 - Yh + ; xmm0 - H + ; xmm7 - T + + movsd xmm3, xmm4 + movsd xmm5, xmm7 + movsd xmm6, xmm0 + mulsd xmm3, xmm7 ; YtRt + mulsd xmm4, xmm0 ; YtRh + mulsd xmm5, xmm2 ; YhRt + mulsd xmm6, xmm2 ; YhRh + + movsd xmm1, xmm6 + addsd xmm3, xmm4 + addsd xmm3, xmm5 + + addsd xmm1, xmm3 + movsd xmm0, xmm1 + + subsd xmm6, xmm1 + addsd xmm6, xmm3 + + ; y * ln(x) = v + vt + ; v and vt are in full precision + + ; xmm0 - v + ; xmm6 - vt + + ; ----------------------------- + ; compute exp( y * ln(x) ) here + ; ----------------------------- + + ; v * (64/ln(2)) + movsd xmm7, QWORD PTR __real_64_by_log2 + movsd QWORD PTR [p_temp_exp+rsp], xmm0 + mulsd xmm7, xmm0 + mov rdx, QWORD PTR [p_temp_exp+rsp] + + ; v < 1024*ln(2), ( v * (64/ln(2)) ) < 64*1024 + ; v >= -1075*ln(2), ( v * (64/ln(2)) ) >= 64*(-1075) + comisd xmm7, QWORD PTR __real_p65536 + ja Lpow_sse2_process_result_inf + + comisd xmm7, QWORD PTR __real_m68800 + jb Lpow_sse2_process_result_zero + + ; n = int( v * (64/ln(2)) ) + cvtpd2dq xmm4, xmm7 + lea r10, __two_to_jby64_head_table + lea r11, __two_to_jby64_tail_table + cvtdq2pd xmm1, xmm4 + + ; r1 = x - n * ln(2)/64 head + movsd xmm2, QWORD PTR __real_log2_by_64_head + mulsd xmm2, xmm1 + movd ecx, xmm4 + mov rax, 3fh + and eax, ecx + subsd xmm0, xmm2 + + ; r2 = - n * ln(2)/64 tail + mulsd xmm1, QWORD PTR __real_log2_by_64_tail + movsd xmm2, xmm0 + + ; m = (n - j) / 64 + sub ecx, eax + sar ecx, 6 + + ; r1+r2 + addsd xmm2, xmm1 + addsd xmm2, xmm6 ; add vt here + movsd xmm1, xmm2 + + ; q + movsd xmm0, QWORD PTR __real_1_by_2 + movsd xmm3, QWORD PTR __real_1_by_24 + movsd xmm4, QWORD PTR __real_1_by_720 + mulsd xmm1, xmm2 + mulsd xmm0, xmm2 + mulsd xmm3, xmm2 + mulsd xmm4, xmm2 + + movsd xmm5, xmm1 + mulsd xmm1, xmm2 + addsd xmm0, QWORD PTR __real_one + addsd xmm3, QWORD PTR __real_1_by_6 + mulsd xmm5, xmm1 + addsd xmm4, QWORD PTR __real_1_by_120 + mulsd xmm0, xmm2 + mulsd xmm3, xmm1 + + mulsd xmm4, xmm5 + + ; deal with denormal results + xor r9d, r9d + + addsd xmm3, xmm4 + addsd xmm0, xmm3 + + cmp ecx, DWORD PTR __denormal_threshold + cmovle r9d, ecx + add rcx, 1023 + shl rcx, 52 + + ; f1, f2 + movsd xmm5, QWORD PTR [r11+rax*8] + movsd xmm1, QWORD PTR [r10+rax*8] + mulsd xmm5, xmm0 + mulsd xmm1, xmm0 + + + ; (f1+f2)*(1+q) + addsd xmm5, QWORD PTR [r11+rax*8] + addsd xmm1, xmm5 + addsd xmm1, QWORD PTR [r10+rax*8] + movsd xmm0, xmm1 + + cmp rcx, QWORD PTR __real_inf + je Lpow_sse2_process_almost_inf + + mov QWORD PTR [p_temp_exp+rsp], rcx + test r9d, r9d + jnz Lpow_sse2_process_denormal + mulsd xmm0, QWORD PTR [p_temp_exp+rsp] + orpd xmm0, XMMWORD PTR [negate_result+rsp] + +Lpow_sse2_final_check: + RestoreXmm xmm7, save_xmm7 + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 
+Lpow_sse2_process_almost_inf: + comisd xmm0, QWORD PTR __real_one + jae Lpow_sse2_process_result_inf + + orpd xmm0, XMMWORD PTR __enable_almost_inf + orpd xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_process_denormal: + mov ecx, r9d + xor r11d, r11d + comisd xmm0, QWORD PTR __real_one + cmovae r11d, ecx + cmp r11d, DWORD PTR __denormal_threshold + jne Lpow_sse2_process_true_denormal + + mulsd xmm0, QWORD PTR [p_temp_exp+rsp] + orpd xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_process_true_denormal: + xor r8, r8 + mov r9, 1 + cmp rdx, QWORD PTR __denormal_tiny_threshold + jg Lpow_sse2_process_denormal_tiny + add ecx, 1074 + cmovs rcx, r8 + shl r9, cl + mov rcx, r9 + + mov QWORD PTR [p_temp_exp+rsp], rcx + mulsd xmm0, QWORD PTR [p_temp_exp+rsp] + orpd xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_sse2_z_denormal + +ALIGN 16 +Lpow_sse2_process_denormal_tiny: + movsd xmm0, QWORD PTR __real_smallest_denormal + orpd xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_sse2_z_denormal + +ALIGN 16 +Lpow_sse2_process_result_zero: + mov r11, QWORD PTR __real_zero + or r11, QWORD PTR [negate_result+rsp] + jmp Lpow_sse2_z_is_zero_or_inf + +ALIGN 16 +Lpow_sse2_process_result_inf: + mov r11, QWORD PTR __real_inf + or r11, QWORD PTR [negate_result+rsp] + jmp Lpow_sse2_z_is_zero_or_inf + +ALIGN 16 +Lpow_sse2_denormal_adjust: + por xmm2, XMMWORD PTR __real_one + subsd xmm2, QWORD PTR __real_one + movsd xmm5, xmm2 + pand xmm2, XMMWORD PTR __real_mant + movd r8, xmm2 + psrlq xmm5, 52 + psubd xmm5, XMMWORD PTR __mask_2045 + cvtdq2pd xmm6, xmm5 + jmp Lpow_sse2_continue_common + +ALIGN 16 +Lpow_sse2_x_is_neg: + + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __ay_max_bound + jg Lpow_sse2_ay_is_very_large + + ; determine if y is an integer + mov r10, QWORD PTR __exp_mant_mask + and r10, r8 + mov r11, r10 + mov rcx, QWORD PTR __exp_shift + shr r10, cl + sub r10, QWORD PTR __exp_bias + js Lpow_sse2_x_is_neg_y_is_not_int + + mov rax, QWORD PTR __exp_mant_mask + and rax, rdx + mov QWORD PTR [save_ax+rsp], rax + + mov rcx, r10 + cmp r10, QWORD PTR __yexp_53 + jg Lpow_sse2_continue_after_y_int_check + + mov r9, QWORD PTR __mant_full + shr r9, cl + and r9, r11 + jnz Lpow_sse2_x_is_neg_y_is_not_int + + mov r9, QWORD PTR __1_before_mant + shr r9, cl + and r9, r11 + jz Lpow_sse2_continue_after_y_int_check + + mov rax, QWORD PTR __sign_mask + mov QWORD PTR [negate_result+rsp], rax + +Lpow_sse2_continue_after_y_int_check: + + cmp rdx, QWORD PTR __neg_zero + je Lpow_sse2_x_is_zero + + cmp rdx, QWORD PTR __neg_one + je Lpow_sse2_x_is_neg_one + + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_sse2_x_is_inf_or_nan + + movsd xmm0, QWORD PTR [save_ax+rsp] + jmp Lpow_sse2_log_x + + +ALIGN 16 +Lpow_sse2_near_one: + + ; f = F - Y, r = f * inv + movsd xmm0, xmm1 + subsd xmm1, xmm2 + movsd xmm4, xmm1 + + movsd xmm3, QWORD PTR [r9+r8*8] + addsd xmm3, QWORD PTR [rdx+r8*8] + mulsd xmm4, xmm3 + andpd xmm4, QWORD PTR __real_fffffffff8000000 + movsd xmm5, xmm4 ; r1 + mulsd xmm4, xmm0 + subsd xmm1, xmm4 + mulsd xmm1, xmm3 + movsd xmm7, xmm1 ; r2 + addsd xmm1, xmm5 + + movsd xmm2, xmm1 + movsd xmm0, xmm1 + + lea r9, __log_256_lead + + ; poly + movsd xmm3, QWORD PTR __real_1_over_7 + movsd xmm1, QWORD PTR __real_1_over_4 + mulsd xmm3, xmm2 + mulsd xmm1, xmm2 + mulsd xmm0, xmm2 + movsd xmm4, xmm0 + addsd xmm3, QWORD PTR __real_1_over_6 + addsd xmm1, QWORD PTR __real_1_over_3 + mulsd xmm4, xmm0 + mulsd 
xmm3, xmm2 + mulsd xmm1, xmm2 + addsd xmm3, QWORD PTR __real_1_over_5 + mulsd xmm3, xmm2 + mulsd xmm1, xmm0 + mulsd xmm3, xmm4 + + movsd xmm2, xmm5 + movsd xmm0, xmm7 + mulsd xmm0, xmm0 + mulsd xmm0, QWORD PTR __real_1_over_2 + mulsd xmm5, xmm7 + addsd xmm5, xmm0 + addsd xmm5, xmm7 + + movsd xmm0, xmm2 + movsd xmm7, xmm2 + mulsd xmm0, xmm0 + mulsd xmm0, QWORD PTR __real_1_over_2 + movsd xmm4, xmm0 + addsd xmm2, xmm0 ; r1 + r1^2/2 + subsd xmm7, xmm2 + addsd xmm7, xmm4 + + addsd xmm3, xmm7 + movsd xmm4, QWORD PTR __real_log2_tail + addsd xmm1, xmm3 + mulsd xmm4, xmm6 + lea rdx, __log_256_tail + addsd xmm1, xmm5 + addsd xmm4, QWORD PTR [rdx+r8*8] + subsd xmm4, xmm1 + + movsd xmm3, xmm4 + movsd xmm1, xmm4 + subsd xmm3, xmm2 + + movsd xmm0, QWORD PTR [r9+r8*8] + movsd xmm7, QWORD PTR __real_log2_lead + mulsd xmm7, xmm6 + addsd xmm0, xmm7 + + jmp Lpow_sse2_log_x_continue + + +ALIGN 16 +Lpow_sse2_x_is_pos_one: + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_y_is_zero: + movsd xmm0, QWORD PTR __real_one + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_y_is_one: + xor rax, rax + mov r11, rdx + mov r9, QWORD PTR __exp_mask + ;or r11, QWORD PTR __qnan_set + and r9, rdx + cmp r9, QWORD PTR __exp_mask + cmove rax, rdx + mov r9, QWORD PTR __mant_mask + and r9, rax + jnz Lpow_sse2_x_is_nan + + movd xmm0, rdx + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_x_is_neg_one: + mov rdx, QWORD PTR __pos_one + or rdx, QWORD PTR [negate_result+rsp] + xor rax, rax + mov r11, r8 + mov r10, QWORD PTR __exp_mask + ;or r11, QWORD PTR __qnan_set + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + jnz Lpow_sse2_y_is_nan + + movd xmm0, rdx + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_x_is_neg_y_is_not_int: + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_sse2_x_is_inf_or_nan + + cmp rdx, QWORD PTR __neg_zero + je Lpow_sse2_x_is_zero + + movsd xmm0, QWORD PTR [save_x+rsp] + movsd xmm1, QWORD PTR [save_y+rsp] + movsd xmm2, QWORD PTR __neg_qnan + mov r9d, DWORD PTR __flag_x_neg_y_notint + + call fname_special + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_ay_is_very_large: + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_sse2_x_is_inf_or_nan + + mov r9, QWORD PTR __exp_mant_mask + and r9, rdx + jz Lpow_sse2_x_is_zero + + cmp rdx, QWORD PTR __neg_one + je Lpow_sse2_x_is_neg_one + + mov r9, rdx + and r9, QWORD PTR __exp_mant_mask + cmp r9, QWORD PTR __pos_one + jl Lpow_sse2_ax_lt1_y_is_large_or_inf_or_nan + + jmp Lpow_sse2_ax_gt1_y_is_large_or_inf_or_nan + +ALIGN 16 +Lpow_sse2_x_is_zero: + mov r10, QWORD PTR __exp_mask + xor rax, rax + and r10, r8 + cmp r10, QWORD PTR __exp_mask + je Lpow_sse2_x_is_zero_y_is_inf_or_nan + + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovnz rax, QWORD PTR __pos_inf + jnz Lpow_sse2_x_is_zero_z_is_inf + + movd xmm0, rax + orpd xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_x_is_zero_z_is_inf: + + movsd xmm0, QWORD PTR [save_x+rsp] + movsd xmm1, QWORD PTR [save_y+rsp] + movd xmm2, rax + orpd xmm2, XMMWORD PTR [negate_result+rsp] + mov r9d, DWORD PTR __flag_x_zero_z_inf + + call fname_special + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_x_is_zero_y_is_inf_or_nan: + mov r11, r8 + cmp r8, QWORD PTR __neg_inf + cmove rax, QWORD PTR __pos_inf + je Lpow_sse2_x_is_zero_z_is_inf + + ;or r11, QWORD PTR __qnan_set + mov r10, QWORD PTR __mant_mask + and r10, r8 + jnz Lpow_sse2_y_is_nan + + movd xmm0, rax + 
jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_x_is_inf_or_nan: + xor r11, r11 + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovz r11, QWORD PTR __pos_inf + mov rax, rdx + mov r9, QWORD PTR __mant_mask + ;or rax, QWORD PTR __qnan_set + and r9, rdx + cmovnz r11, rax + jnz Lpow_sse2_x_is_nan + + xor rax, rax + mov r9, r8 + mov r10, QWORD PTR __exp_mask + ;or r9, QWORD PTR __qnan_set + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + cmovnz r11, r9 + jnz Lpow_sse2_y_is_nan + + movd xmm0, r11 + orpd xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_ay_is_very_small: + movsd xmm0, QWORD PTR __pos_one + addsd xmm0, xmm1 + jmp Lpow_sse2_final_check + + +ALIGN 16 +Lpow_sse2_ax_lt1_y_is_large_or_inf_or_nan: + xor r11, r11 + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovnz r11, QWORD PTR __pos_inf + jmp Lpow_sse2_adjust_for_nan + +ALIGN 16 +Lpow_sse2_ax_gt1_y_is_large_or_inf_or_nan: + xor r11, r11 + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovz r11, QWORD PTR __pos_inf + +ALIGN 16 +Lpow_sse2_adjust_for_nan: + + xor rax, rax + mov r9, r8 + mov r10, QWORD PTR __exp_mask + ;or r9, QWORD PTR __qnan_set + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + cmovnz r11, r9 + jnz Lpow_sse2_y_is_nan + + test rax, rax + jnz Lpow_sse2_y_is_inf + +ALIGN 16 +Lpow_sse2_z_is_zero_or_inf: + + mov r9d, DWORD PTR __flag_z_zero + test r11, QWORD PTR __exp_mant_mask + cmovnz r9d, DWORD PTR __flag_z_inf + + movsd xmm0, QWORD PTR [save_x+rsp] + movsd xmm1, QWORD PTR [save_y+rsp] + movd xmm2, r11 + + call fname_special + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_y_is_inf: + + movd xmm0, r11 + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_x_is_nan: + + xor rax, rax + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + jnz Lpow_sse2_x_is_nan_y_is_nan + + movsd xmm0, QWORD PTR [save_x+rsp] + movsd xmm1, QWORD PTR [save_y+rsp] + movd xmm2, r11 + mov r9d, DWORD PTR __flag_x_nan + + call fname_special + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_y_is_nan: + + movsd xmm0, QWORD PTR [save_x+rsp] + movsd xmm1, QWORD PTR [save_y+rsp] + movd xmm2, r11 + mov r9d, DWORD PTR __flag_y_nan + + call fname_special + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_x_is_nan_y_is_nan: + + mov r9, r8 + + cmp r11, QWORD PTR __ind_pattern + cmove r11, r9 + je Lpow_sse2_continue_xy_nan + + cmp r9, QWORD PTR __ind_pattern + cmove r9, r11 + + mov r10, r9 + and r10, QWORD PTR __sign_mask + cmovnz r9, r11 + + mov r10, r11 + and r10, QWORD PTR __sign_mask + cmovnz r11, r9 + +Lpow_sse2_continue_xy_nan: + ;or r11, QWORD PTR __qnan_set + movsd xmm0, QWORD PTR [save_x+rsp] + movsd xmm1, QWORD PTR [save_y+rsp] + movd xmm2, r11 + mov r9d, DWORD PTR __flag_x_nan_y_nan + + call fname_special + jmp Lpow_sse2_final_check + +ALIGN 16 +Lpow_sse2_z_denormal: + + movsd xmm2, xmm0 + movsd xmm0, QWORD PTR [save_x+rsp] + movsd xmm1, QWORD PTR [save_y+rsp] + mov r9d, DWORD PTR __flag_z_denormal + + call fname_special + jmp Lpow_sse2_final_check + +Lpow_fma3: + vmovsd QWORD PTR [save_x+rsp], xmm0 + vmovsd QWORD PTR [save_y+rsp], xmm1 + + mov rdx, QWORD PTR [save_x+rsp] + mov r8, QWORD PTR [save_y+rsp] + + mov r10, QWORD PTR __exp_mant_mask + and r10, r8 + jz Lpow_fma3_y_is_zero + + cmp r8, QWORD PTR __pos_one + je Lpow_fma3_y_is_one + + mov r9, QWORD PTR __sign_mask + and r9, rdx + cmp r9, 
QWORD PTR __sign_mask + mov rax, QWORD PTR __pos_zero + mov QWORD PTR [negate_result+rsp], rax + je Lpow_fma3_x_is_neg + + cmp rdx, QWORD PTR __pos_one + je Lpow_fma3_x_is_pos_one + + cmp rdx, QWORD PTR __pos_zero + je Lpow_fma3_x_is_zero + + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_fma3_x_is_inf_or_nan + + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __ay_max_bound + jg Lpow_fma3_ay_is_very_large + + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __ay_min_bound + jl Lpow_fma3_ay_is_very_small + + ; ----------------------------- + ; compute log(x) here + ; ----------------------------- +Lpow_fma3_log_x: + + ; compute exponent part + vpsrlq xmm3, xmm0, 52 + vmovq r8, xmm0 + vpsubq xmm3, xmm3, XMMWORD PTR __mask_1023 + vcvtdq2pd xmm6, xmm3 ; xexp + vpand xmm2, xmm0, XMMWORD PTR __real_mant + + vcomisd xmm6, QWORD PTR __mask_1023_f + je Lpow_fma3_denormal_adjust + +Lpow_fma3_continue_common: + + ; compute index into the log tables + mov r9, r8 + and r8, QWORD PTR __mask_mant_all8 + and r9, QWORD PTR __mask_mant9 + vsubsd xmm7, xmm0, __real_one + shl r9, 1 + add r8, r9 + vmovq xmm1, r8 + vandpd xmm7, xmm7, __real_notsign + + ; F, Y, switch to near-one codepath + shr r8, 44 + vpor xmm2, xmm2, XMMWORD PTR __real_half + vpor xmm1, xmm1, XMMWORD PTR __real_half + vcomisd xmm7, __real_threshold + lea r9, QWORD PTR __log_F_inv_head + lea rdx, QWORD PTR __log_F_inv_tail + jb Lpow_fma3_near_one + + ; f = F - Y, r = f * inv + vsubsd xmm4, xmm1, xmm2 ; xmm4 <-- f = F - Y + vmulsd xmm1, xmm4, QWORD PTR [r9+r8*8] ; xmm1 <-- rhead = f*inv_head + vmovapd xmm5, xmm1 ; xmm5 <-- copy of rhead + vmulsd xmm4, xmm4, QWORD PTR [rdx+r8*8] ; xmm4 <-- rtail = f*inv_tail + vmovapd xmm7, xmm4 ; xmm7 <-- copy of rtail + vaddsd xmm1, xmm1, xmm4 ; xmm1 <-- r = rhead + rtail + + vmovapd xmm2, xmm1 ; xmm2 <-- copy of r + vmovapd xmm0, xmm1 ; xmm1 <-- copy of r + lea r9, __log_256_lead + + ; poly +; movsd xmm3, QWORD PTR __real_1_over_6 +; movsd xmm1, QWORD PTR __real_1_over_3 +; mulsd xmm3, xmm2 ; r*1/6 +; mulsd xmm1, xmm2 ; r*1/3 +; mulsd xmm0, xmm2 ; r^2 +; subsd xmm5, xmm2 ; xmm5 <-- rhead - r +; movsd xmm4, xmm0 ; xmm4 <-- copy of r^2 +; addsd xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r*1/6 + 1/5 +; addsd xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r*1/3 + 1/2 +; mulsd xmm4, xmm0 ; xmm4 <-- r^4 +; mulsd xmm3, xmm2 ; xmm3 <-- (r*1/6 + 1/5)*r +; mulsd xmm1, xmm0 ; xmm1 <-- (r*1/3 + 1/2)*r^2 +; addsd xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r*1/6+1/5)*r + 1/4 +; addsd xmm7, xmm5 ; xmm7 <-- rtail + (rhead - r) +; mulsd xmm3, xmm4 ; xmm3 <-- (r*1/6 + 1/5)*r^5 + r^4*1/4 +; addsd xmm1, xmm3 ; xmm1 <-- poly down to r^2 +; addsd xmm1, xmm7 ; xmm1 <-- poly + correction + + + vsubsd xmm3, xmm5, xmm2 + vmovsd xmm1, QWORD PTR __real_1_over_6 + vmulsd xmm0,xmm0,xmm0 + vaddsd xmm3, xmm3, xmm7 + vfmadd213sd xmm1, xmm2, QWORD PTR __real_1_over_5 + vfmadd213sd xmm1, xmm2, QWORD PTR __real_1_over_4 + vfmadd213sd xmm1, xmm2, QWORD PTR __real_1_over_3 + vfmadd213sd xmm1, xmm2, QWORD PTR __real_1_over_2 + vfmadd213sd xmm1, xmm0, xmm3 + + vmovsd xmm5, QWORD PTR __real_log2_tail + lea rdx, __log_256_tail + vfmsub213sd xmm5, xmm6, xmm1 + vmovsd xmm0, QWORD PTR [r9+r8*8] + + vaddsd xmm3, xmm5, QWORD PTR [rdx+r8*8] + vmovapd xmm1, xmm3 + vsubsd xmm3, xmm3, xmm2 + + vfmadd231sd xmm0, xmm6, QWORD PTR __real_log2_lead + + ; result of ln(x) is computed from head and tail parts, resH and resT + ; res = ln(x) = resH + resT + ; resH and resT are in full precision + + ; 
resT is computed from head and tail parts, resT_h and resT_t + ; resT = resT_h + resT_t + + ; now + ; xmm3 - resT + ; xmm0 - resH + ; xmm1 - (resT_t) + ; xmm2 - (-resT_h) + +Lpow_fma3_log_x_continue: + + vmovapd xmm7, xmm0 + vaddsd xmm0, xmm0, xmm3 + vmovapd xmm5, xmm0 + vandpd xmm0, xmm0, XMMWORD PTR __real_fffffffff8000000 + + ; xmm0 - H + ; xmm7 - resH + ; xmm5 - res + + mov rax, QWORD PTR [save_y+rsp] + and rax, QWORD PTR __real_fffffffff8000000 + + vaddsd xmm2, xmm2, xmm3 + vsubsd xmm7, xmm7, xmm5 + vsubsd xmm1, xmm1, xmm2 + vaddsd xmm7, xmm7, xmm3 + vsubsd xmm5, xmm5, xmm0 + + mov QWORD PTR [y_head+rsp], rax + vmovsd xmm4, QWORD PTR [save_y+rsp] + + vaddsd xmm7, xmm7, xmm1 + vaddsd xmm7, xmm7, xmm5 + + ; res = H + T + ; H has leading 26 bits of precision + ; T has full precision + + ; xmm0 - H + ; xmm7 - T + + vmovsd xmm2, QWORD PTR [y_head+rsp] + vsubsd xmm4, xmm4, xmm2 + + ; y is split into head and tail + ; for y * ln(x) computation + + ; xmm4 - Yt + ; xmm2 - Yh + ; xmm0 - H + ; xmm7 - T + + vmulsd xmm3, xmm4, xmm7 ; YtRt + vmulsd xmm4, xmm4, xmm0 ; YtRh + vmulsd xmm5, xmm7, xmm2 ; YhRt + vmulsd xmm6, xmm0, xmm2 ; YhRh + + vmovapd xmm1, xmm6 + vaddsd xmm3, xmm3, xmm4 + vaddsd xmm3, xmm3, xmm5 + + vaddsd xmm1, xmm1, xmm3 + vmovapd xmm0, xmm1 + + vsubsd xmm6, xmm6, xmm1 + vaddsd xmm6, xmm6, xmm3 + + ; y * ln(x) = v + vt + ; v and vt are in full precision + + ; xmm0 - v + ; xmm6 - vt + + ; ----------------------------- + ; compute exp( y * ln(x) ) here + ; ----------------------------- + + ; v * (64/ln(2)) + vmovsd QWORD PTR [p_temp_exp+rsp], xmm0 + vmulsd xmm7, xmm0, QWORD PTR __real_64_by_log2 + mov rdx, QWORD PTR [p_temp_exp+rsp] + + ; v < 1024*ln(2), ( v * (64/ln(2)) ) < 64*1024 + ; v >= -1075*ln(2), ( v * (64/ln(2)) ) >= 64*(-1075) + vcomisd xmm7, QWORD PTR __real_p65536 + ja Lpow_fma3_process_result_inf + + vcomisd xmm7, QWORD PTR __real_m68800 + jb Lpow_fma3_process_result_zero + + ; n = int( v * (64/ln(2)) ) + vcvtpd2dq xmm4, xmm7 + lea r10, __two_to_jby64_head_table + lea r11, __two_to_jby64_tail_table + vcvtdq2pd xmm1, xmm4 + + ; r1 = x - n * ln(2)/64 head + vfnmadd231sd xmm0, xmm1, QWORD PTR __real_log2_by_64_head + vmovd ecx, xmm4 + mov rax, 3fh + and eax, ecx + + ; r2 = - n * ln(2)/64 tail + vmulsd xmm1, xmm1, QWORD PTR __real_log2_by_64_tail + vmovapd xmm2, xmm0 + + ; m = (n - j) / 64 + sub ecx, eax + sar ecx, 6 + + ; r1+r2 + vaddsd xmm2, xmm2, xmm1 + vaddsd xmm2, xmm2, xmm6 ; add vt here + vmovapd xmm1, xmm2 + + ; q + vmovsd xmm0, QWORD PTR __real_1_by_720 + xor r9d, r9d + vfmadd213sd xmm0, xmm2, QWORD PTR __real_1_by_120 + cmp ecx, DWORD PTR __denormal_threshold + vfmadd213sd xmm0, xmm2, QWORD PTR __real_1_by_24 + cmovle r9d, ecx + vfmadd213sd xmm0, xmm2, QWORD PTR __real_1_by_6 + add rcx, 1023 + vfmadd213sd xmm0, xmm2, QWORD PTR __real_1_by_2 + shl rcx, 52 + vfmadd213sd xmm0, xmm2, QWORD PTR __real_one + vmulsd xmm0, xmm0, xmm2 ; xmm0 <-- q +; movsd xmm0, QWORD PTR __real_1_by_2 +; movsd xmm3, QWORD PTR __real_1_by_24 +; movsd xmm4, QWORD PTR __real_1_by_720 +; mulsd xmm1, xmm2 ; xmm1 <-- r^2 +; mulsd xmm0, xmm2 ; xmm0 <-- r/2 +; mulsd xmm3, xmm2 ; xmm3 <-- r/24 +; mulsd xmm4, xmm2 ; xmm4 <-- r/720 + +; movsd xmm5, xmm1 ; xmm5 <-- copy of r^2 +; mulsd xmm1, xmm2 ; xmm1 <-- r^3 +; addsd xmm0, QWORD PTR __real_one ; xmm0 <-- r/2 + 1 +; addsd xmm3, QWORD PTR __real_1_by_6 ; xmm3 <-- r/24 + 1/6 +; mulsd xmm5, xmm1 ; xmm5 <-- r^5 +; addsd xmm4, QWORD PTR __real_1_by_120 ; xmm4 <-- r/720 + 1/120 +; mulsd xmm0, xmm2 ; xmm0 <-- (r/2 + 1)*r +; mulsd xmm3, xmm1 ; xmm3 <-- 
(r/24 + 1/6)*r^3 + +; mulsd xmm4, xmm5 ; xmm4 <-- (r/720 + 1/120)*r^5 + +; ; deal with denormal results +; xor r9d, r9d +; cmp ecx, DWORD PTR __denormal_threshold + +; addsd xmm3, xmm4 ; xmm3 <-- (r/720 + 1/120)*r^5 + (r/24 + 1/6)*r^3 +; addsd xmm0, xmm3 ; xmm0 <-- poly + +; cmovle r9d, ecx +; add rcx, 1023 +; shl rcx, 52 + + ; f1, f2 + vmulsd xmm5, xmm0, QWORD PTR [r11+rax*8] + vmulsd xmm1, xmm0, QWORD PTR [r10+rax*8] + + cmp rcx, QWORD PTR __real_inf + + ; (f1+f2)*(1+q) + vaddsd xmm5, xmm5, QWORD PTR [r11+rax*8] + vaddsd xmm1, xmm1, xmm5 + vaddsd xmm1, xmm1, QWORD PTR [r10+rax*8] + vmovapd xmm0, xmm1 + + je Lpow_fma3_process_almost_inf + + test r9d, r9d + mov QWORD PTR [p_temp_exp+rsp], rcx + jnz Lpow_fma3_process_denormal + vmulsd xmm0, xmm0, QWORD PTR [p_temp_exp+rsp] + vorpd xmm0, xmm0, XMMWORD PTR [negate_result+rsp] + +Lpow_fma3_final_check: + AVXRestoreXmm xmm7, save_xmm7 + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Lpow_fma3_process_almost_inf: + vcomisd xmm0, QWORD PTR __real_one + jae Lpow_fma3_process_result_inf + + vorpd xmm0, xmm0, XMMWORD PTR __enable_almost_inf + vorpd xmm0, xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_process_denormal: + mov ecx, r9d + xor r11d, r11d + vcomisd xmm0, QWORD PTR __real_one + cmovae r11d, ecx + cmp r11d, DWORD PTR __denormal_threshold + jne Lpow_fma3_process_true_denormal + + vmulsd xmm0, xmm0, QWORD PTR [p_temp_exp+rsp] + vorpd xmm0, xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_process_true_denormal: + xor r8, r8 + cmp rdx, QWORD PTR __denormal_tiny_threshold + mov r9, 1 + jg Lpow_fma3_process_denormal_tiny + add ecx, 1074 + cmovs rcx, r8 + shl r9, cl + mov rcx, r9 + + mov QWORD PTR [p_temp_exp+rsp], rcx + vmulsd xmm0, xmm0, QWORD PTR [p_temp_exp+rsp] + vorpd xmm0, xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_fma3_z_denormal + +ALIGN 16 +Lpow_fma3_process_denormal_tiny: + vmovsd xmm0, QWORD PTR __real_smallest_denormal + vorpd xmm0, xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_fma3_z_denormal + +ALIGN 16 +Lpow_fma3_process_result_zero: + mov r11, QWORD PTR __real_zero + or r11, QWORD PTR [negate_result+rsp] + jmp Lpow_fma3_z_is_zero_or_inf + +ALIGN 16 +Lpow_fma3_process_result_inf: + mov r11, QWORD PTR __real_inf + or r11, QWORD PTR [negate_result+rsp] + jmp Lpow_fma3_z_is_zero_or_inf + +ALIGN 16 +Lpow_fma3_denormal_adjust: + vpor xmm2, xmm2, XMMWORD PTR __real_one + vsubsd xmm2, xmm2, QWORD PTR __real_one + vmovapd xmm5, xmm2 + vpand xmm2, xmm2, XMMWORD PTR __real_mant + vmovq r8, xmm2 + vpsrlq xmm5, xmm5, 52 + vpsubd xmm5, xmm5, XMMWORD PTR __mask_2045 + vcvtdq2pd xmm6, xmm5 + jmp Lpow_fma3_continue_common + +ALIGN 16 +Lpow_fma3_x_is_neg: + + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __ay_max_bound + jg Lpow_fma3_ay_is_very_large + + ; determine if y is an integer + mov r10, QWORD PTR __exp_mant_mask + and r10, r8 + mov r11, r10 + mov rcx, QWORD PTR __exp_shift + shr r10, cl + sub r10, QWORD PTR __exp_bias + js Lpow_fma3_x_is_neg_y_is_not_int + + mov rax, QWORD PTR __exp_mant_mask + and rax, rdx + mov QWORD PTR [save_ax+rsp], rax + + cmp r10, QWORD PTR __yexp_53 + mov rcx, r10 + jg Lpow_fma3_continue_after_y_int_check + + mov r9, QWORD PTR __mant_full + shr r9, cl + and r9, r11 + jnz Lpow_fma3_x_is_neg_y_is_not_int + + mov r9, QWORD PTR __1_before_mant + shr r9, cl + and r9, r11 + jz Lpow_fma3_continue_after_y_int_check + + mov rax, QWORD PTR __sign_mask + mov QWORD PTR 
[negate_result+rsp], rax + +Lpow_fma3_continue_after_y_int_check: + + cmp rdx, QWORD PTR __neg_zero + je Lpow_fma3_x_is_zero + + cmp rdx, QWORD PTR __neg_one + je Lpow_fma3_x_is_neg_one + + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_fma3_x_is_inf_or_nan + + vmovsd xmm0, QWORD PTR [save_ax+rsp] + jmp Lpow_fma3_log_x + + +ALIGN 16 +Lpow_fma3_near_one: + + ; f = F - Y, r = f * inv + vmovapd xmm0, xmm1 + vsubsd xmm1, xmm1, xmm2 ; xmm1 <-- f + vmovapd xmm4, xmm1 ; xmm4 <-- copy of f + + vmovsd xmm3, QWORD PTR [r9+r8*8] + vaddsd xmm3, xmm3, QWORD PTR [rdx+r8*8] + vmulsd xmm4, xmm4, xmm3 ; xmm4 <-- r = f*inv + vandpd xmm4, xmm4, QWORD PTR __real_fffffffff8000000 ; r1 + vmovapd xmm5, xmm4 ; xmm5 <-- copy of r1 +; mulsd xmm4, xmm0 ; xmm4 <-- F*r1 +; subsd xmm1, xmm4 ; xmm1 <-- f - F*r1 + vfnmadd231sd xmm1, xmm4, xmm0 ; xmm1 <-- f - F*r1 + vmulsd xmm1, xmm1, xmm3 ; xmm1 <-- r2 = (f - F*r1)*inv + vmovapd xmm7, xmm1 ; xmm7 <-- copy of r2 + vaddsd xmm1, xmm1, xmm5 ; xmm1 <-- r = r1 + r2 + + vmovapd xmm2, xmm1 ; xmm2 <-- copy of r + vmovapd xmm0, xmm1 ; xmm0 <-- copy of r + + lea r9, __log_256_lead + + ; poly + ; NOTE: Given the complicated corrections here, + ; I'm afraid to mess with it too much - WAT + vmovsd xmm3, QWORD PTR __real_1_over_7 + vmovsd xmm1, QWORD PTR __real_1_over_4 + vmulsd xmm0, xmm0, xmm2 ; xmm0 <-- r^2 + vmovapd xmm4, xmm0 ; xmm4 <-- copy of r^2 + vfmadd213sd xmm3, xmm2, QWORD PTR __real_1_over_6 ; xmm3 <-- r/7 + 1/6 + vfmadd213sd xmm1, xmm2, QWORD PTR __real_1_over_3 ; xmm1 <-- r/4 + 1/3 + vmulsd xmm4, xmm4, xmm0 ; xmm4 <-- r^4 + vmulsd xmm1, xmm1, xmm2 ; xmm1 <-- (r/4 + 1/3)*r + vfmadd213sd xmm3, xmm2, QWORD PTR __real_1_over_5 ; xmm3 <-- ((r/7 + 1/6)*r) + 1/5 + vmulsd xmm3, xmm3, xmm2 ; xmm3 <-- (((r/7 + 1/6)*r) + 1/5)*r + vmulsd xmm1, xmm1, xmm0 ; xmm1 <-- ((r/4 + 1/3)*r)*r^2 + vmulsd xmm3, xmm3, xmm4 ; xmm3 <-- ((((r/7 + 1/6)*r) + 1/5)*r)*r^4 + + vmovapd xmm2, xmm5 ; xmm2 <-- copy of r1 + vmovapd xmm0, xmm7 ; xmm0 <-- copy of r2 + vmulsd xmm0, xmm0, xmm0 ; xmm0 <-- r2^2 + vmulsd xmm0, xmm0, QWORD PTR __real_1_over_2 ; xmm0 <-- r2^2/2 +; mulsd xmm5, xmm7 ; xmm5 <-- r1*r2 +; addsd xmm5, xmm0 ; xmm5 <-- r1*r2 + r2^2^2 + vfmadd213sd xmm5, xmm7, xmm0 ; xmm5 <-- r1*r2 + r2^2^2 + vaddsd xmm5, xmm5, xmm7 ; xmm5 <-- r1*r2 + r2^2/2 + r2 + + vmovapd xmm0, xmm2 ; xmm0 <-- copy of r1 + vmovapd xmm7, xmm2 ; xmm7 <-- copy of r1 + vmulsd xmm0, xmm0, xmm0 ; xmm0 <-- r1^2 + vmulsd xmm0, xmm0, QWORD PTR __real_1_over_2 ; xmm0 <-- r1^2/2 + vmovapd xmm4, xmm0 ; xmm4 <-- copy of r1^2/2 + vaddsd xmm2, xmm2, xmm0 ; xmm2 <-- r1 + r1^2/2 + vsubsd xmm7, xmm7, xmm2 ; xmm7 <-- r1 - (r1 + r1^2/2) + vaddsd xmm7, xmm7, xmm4 ; xmm7 <-- r1 - (r1 + r1^2/2) + r1^2/2 + ; xmm3 <-- ((((r/7 + 1/6)*r) + 1/5)*r)*r^4 + r1 - (r1 + r1^2/2) + r1^2/2 + vaddsd xmm3, xmm3, xmm7 + vmovsd xmm4, QWORD PTR __real_log2_tail + ; xmm1 <-- (((((r/7 + 1/6)*r) + 1/5)*r)*r^4) + + ; (r1 - (r1 + r1^2/2) + r1^2/2) + ((r/4 + 1/3)*r)*r^2) + vaddsd xmm1, xmm1, xmm3 + lea rdx, __log_256_tail + ; xmm1 <-- ((((((r/7 + 1/6)*r) + 1/5)*r)*r^4) + + ; (r1 - (r1 + r1^2/2) + r1^2/2) + ((r/4 + 1/3)*r)*r^2)) + ; +(r1*r2 + r2^2/2 + r2) + vaddsd xmm1, xmm1, xmm5 + ; xmm4 <-- vt * log2_tail + log256_tail + vfmadd213sd xmm4, xmm6, QWORD PTR [rdx+r8*8] + ; xmm4 <-- vt * log2_tail + log2_tail - corrected poly + vsubsd xmm4, xmm4, xmm1 + + vmovapd xmm1, xmm4 + vsubsd xmm3, xmm4, xmm2 ; xmm3 <-- xmm4 - more correction??? 
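+    ; as in the SSE2 near-one path, xmm3 now holds resT, with xmm1 (resT_t)
+    ; and xmm2 (-resT_h) as the correction terms; the head resH is formed in
+    ; xmm0 just below before jumping to Lpow_fma3_log_x_continue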
+ + vmovsd xmm0, QWORD PTR [r9+r8*8] ; xmm0 <-- log256_lead + ; xmm0 <-- log256_lead + vt*log2_lead + vfmadd231sd xmm0, xmm6, QWORD PTR __real_log2_lead + + ; at this point, xmm0, xmm1, xmm2, and xmm3 should matter + jmp Lpow_fma3_log_x_continue + + +ALIGN 16 +Lpow_fma3_x_is_pos_one: + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_y_is_zero: + vmovsd xmm0, QWORD PTR __real_one + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_y_is_one: + xor rax, rax + mov r11, rdx + mov r9, QWORD PTR __exp_mask + ;or r11, QWORD PTR __qnan_set + and r9, rdx + cmp r9, QWORD PTR __exp_mask + cmove rax, rdx + mov r9, QWORD PTR __mant_mask + and r9, rax + jnz Lpow_fma3_x_is_nan + + vmovq xmm0, rdx + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_neg_one: + mov rdx, QWORD PTR __pos_one + or rdx, QWORD PTR [negate_result+rsp] + xor rax, rax + mov r11, r8 + mov r10, QWORD PTR __exp_mask + ;or r11, QWORD PTR __qnan_set + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + jnz Lpow_fma3_y_is_nan + + vmovq xmm0, rdx + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_neg_y_is_not_int: + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_fma3_x_is_inf_or_nan + + cmp rdx, QWORD PTR __neg_zero + je Lpow_fma3_x_is_zero + + vmovsd xmm0, QWORD PTR [save_x+rsp] + vmovsd xmm1, QWORD PTR [save_y+rsp] + vmovsd xmm2, QWORD PTR __neg_qnan + mov r9d, DWORD PTR __flag_x_neg_y_notint + + call fname_special + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_ay_is_very_large: + mov r9, QWORD PTR __exp_mask + and r9, rdx + cmp r9, QWORD PTR __exp_mask + je Lpow_fma3_x_is_inf_or_nan + + mov r9, QWORD PTR __exp_mant_mask + and r9, rdx + jz Lpow_fma3_x_is_zero + + cmp rdx, QWORD PTR __neg_one + je Lpow_fma3_x_is_neg_one + + mov r9, rdx + and r9, QWORD PTR __exp_mant_mask + cmp r9, QWORD PTR __pos_one + jl Lpow_fma3_ax_lt1_y_is_large_or_inf_or_nan + + jmp Lpow_fma3_ax_gt1_y_is_large_or_inf_or_nan + +ALIGN 16 +Lpow_fma3_x_is_zero: + mov r10, QWORD PTR __exp_mask + xor rax, rax + and r10, r8 + cmp r10, QWORD PTR __exp_mask + je Lpow_fma3_x_is_zero_y_is_inf_or_nan + + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovnz rax, QWORD PTR __pos_inf + jnz Lpow_fma3_x_is_zero_z_is_inf + + vmovq xmm0, rax + vorpd xmm0, xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_zero_z_is_inf: + + vmovsd xmm0, QWORD PTR [save_x+rsp] + vmovsd xmm1, QWORD PTR [save_y+rsp] + vmovq xmm2, rax + vorpd xmm2, xmm2, XMMWORD PTR [negate_result+rsp] + mov r9d, DWORD PTR __flag_x_zero_z_inf + + call fname_special + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_zero_y_is_inf_or_nan: + mov r11, r8 + cmp r8, QWORD PTR __neg_inf +; The next two lines do not correspond to IEEE754-2008. +; +-0 ^ -Inf should be +Inf with no exception +; +-0 ^ +Inf should be +0 with no exception +; cmove rax, QWORD PTR __pos_inf +; je Lpow_fma3_x_is_zero_z_is_inf +; begin replacement + je Lpow_fma3_x_is_zero_y_is_neg_inf + cmp r8, QWORD PTR __neg_inf + je Lpow_fma3_x_is_zero_y_is_pos_inf +; end replacement + + ;or r11, QWORD PTR __qnan_set + mov r10, QWORD PTR __mant_mask + and r10, r8 + jnz Lpow_fma3_y_is_nan + + vmovq xmm0, rax + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_zero_y_is_neg_inf: + ; quietly return +Inf + vmovsd xmm0, __pos_inf + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_zero_y_is_pos_inf: + ; quietly return +0. 
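+    ; per IEEE 754-2008 (see the note above), (+/-0)^(+Inf) is exactly +0
+    ; and raises no exception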
+ vxorpd xmm0, xmm0, xmm0 + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_inf_or_nan: + xor r11, r11 + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovz r11, QWORD PTR __pos_inf + mov rax, rdx + mov r9, QWORD PTR __mant_mask + ;or rax, QWORD PTR __qnan_set + and r9, rdx + cmovnz r11, rax + jnz Lpow_fma3_x_is_nan + + xor rax, rax + mov r9, r8 + mov r10, QWORD PTR __exp_mask + ;or r9, QWORD PTR __qnan_set + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + cmovnz r11, r9 + jnz Lpow_fma3_y_is_nan + + vmovq xmm0, r11 + vorpd xmm0, xmm0, XMMWORD PTR [negate_result+rsp] + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_ay_is_very_small: + vaddsd xmm0, xmm1, QWORD PTR __pos_one + jmp Lpow_fma3_final_check + + +ALIGN 16 +Lpow_fma3_ax_lt1_y_is_large_or_inf_or_nan: + xor r11, r11 + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovnz r11, QWORD PTR __pos_inf + jmp Lpow_fma3_adjust_for_nan + +ALIGN 16 +Lpow_fma3_ax_gt1_y_is_large_or_inf_or_nan: + xor r11, r11 + mov r10, QWORD PTR __sign_mask + and r10, r8 + cmovz r11, QWORD PTR __pos_inf + +ALIGN 16 +Lpow_fma3_adjust_for_nan: + + xor rax, rax + mov r9, r8 + mov r10, QWORD PTR __exp_mask + ;or r9, QWORD PTR __qnan_set + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + cmovnz r11, r9 + jnz Lpow_fma3_y_is_nan + + test rax, rax + jnz Lpow_fma3_y_is_inf + +ALIGN 16 +Lpow_fma3_z_is_zero_or_inf: + + mov r9d, DWORD PTR __flag_z_zero + test r11, QWORD PTR __exp_mant_mask + cmovnz r9d, DWORD PTR __flag_z_inf + + vmovsd xmm0, QWORD PTR [save_x+rsp] + vmovsd xmm1, QWORD PTR [save_y+rsp] + vmovq xmm2, r11 + + call fname_special + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_y_is_inf: + + vmovq xmm0, r11 + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_nan: + + xor rax, rax + mov r10, QWORD PTR __exp_mask + and r10, r8 + cmp r10, QWORD PTR __exp_mask + cmove rax, r8 + mov r10, QWORD PTR __mant_mask + and r10, rax + jnz Lpow_fma3_x_is_nan_y_is_nan + + vmovsd xmm0, QWORD PTR [save_x+rsp] + vmovsd xmm1, QWORD PTR [save_y+rsp] + vmovq xmm2, r11 + mov r9d, DWORD PTR __flag_x_nan + + call fname_special + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_y_is_nan: + + vmovsd xmm0, QWORD PTR [save_x+rsp] + vmovsd xmm1, QWORD PTR [save_y+rsp] + vmovq xmm2, r11 + mov r9d, DWORD PTR __flag_y_nan + + call fname_special + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_x_is_nan_y_is_nan: + + mov r9, r8 + + cmp r11, QWORD PTR __ind_pattern + cmove r11, r9 + je Lpow_fma3_continue_xy_nan + + cmp r9, QWORD PTR __ind_pattern + cmove r9, r11 + + mov r10, r9 + and r10, QWORD PTR __sign_mask + cmovnz r9, r11 + + mov r10, r11 + and r10, QWORD PTR __sign_mask + cmovnz r11, r9 + +Lpow_fma3_continue_xy_nan: + ;or r11, QWORD PTR __qnan_set + vmovsd xmm0, QWORD PTR [save_x+rsp] + vmovsd xmm1, QWORD PTR [save_y+rsp] + vmovq xmm2, r11 + mov r9d, DWORD PTR __flag_x_nan_y_nan + + call fname_special + jmp Lpow_fma3_final_check + +ALIGN 16 +Lpow_fma3_z_denormal: + vmovapd xmm2, xmm0 + vmovsd xmm0, QWORD PTR [save_x+rsp] + vmovsd xmm1, QWORD PTR [save_y+rsp] + mov r9d, DWORD PTR __flag_z_denormal + + call fname_special + jmp Lpow_fma3_final_check + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/pow_special.c b/sdk/lib/crt/math/libm_sse2/pow_special.c new file mode 100644 index 00000000000..a2063df9c8d --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/pow_special.c @@ -0,0 +1,130 @@ + 
+/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "libm_new.h" + +// these codes and the ones in the related .asm files have to match +#define POW_X_ONE_Y_SNAN 1 +#define POW_X_ZERO_Z_INF 2 +#define POW_X_NAN 3 +#define POW_Y_NAN 4 +#define POW_X_NAN_Y_NAN 5 +#define POW_X_NEG_Y_NOTINT 6 +#define POW_Z_ZERO 7 +#define POW_Z_DENORMAL 8 +#define POW_Z_INF 9 + +float _powf_special(float x, float y, float z, U32 code) +{ + switch(code) + { + case POW_X_ONE_Y_SNAN: + { + UT64 zm; zm.u64 = 0; zm.f32[0] = z; + _handle_errorf("powf", _FpCodePow, zm.u64, 0, AMD_F_INVALID, 0, x, y, 2); + } + break; + + case POW_X_ZERO_Z_INF: + { + UT64 zm; zm.u64 = 0; zm.f32[0] = z; + _handle_errorf("powf", _FpCodePow, zm.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, y, 2); + } + break; + + case POW_X_NAN: + case POW_Y_NAN: + case POW_X_NAN_Y_NAN: + case POW_X_NEG_Y_NOTINT: + { + UT64 zm; zm.u64 = 0; zm.f32[0] = z; + _handle_errorf("powf", _FpCodePow, zm.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, y, 2); + } + break; + + case POW_Z_ZERO: + { + UT64 zm; zm.u64 = 0; zm.f32[0] = z; + _handle_errorf("powf", _FpCodePow, zm.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, y, 2); + } + break; + + case POW_Z_INF: + { + UT64 zm; zm.u64 = 0; zm.f32[0] = z; + _handle_errorf("powf", _FpCodePow, zm.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, y, 2); + } + break; + } + + return z; +} + +double _pow_special(double x, double y, double z, U32 code) +{ + switch(code) + { + case POW_X_ZERO_Z_INF: + { + UT64 zm; zm.f64 = z; + _handle_error("pow", _FpCodePow, zm.u64, _SING, AMD_F_DIVBYZERO, ERANGE, x, y, 2); + } + break; + + case POW_X_NAN: + case POW_Y_NAN: + case POW_X_NAN_Y_NAN: + case POW_X_NEG_Y_NOTINT: + { + UT64 zm; zm.f64 = z; + _handle_error("pow", _FpCodePow, zm.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, y, 2); + } + break; + + case POW_Z_ZERO: + case POW_Z_DENORMAL: + { + UT64 zm; zm.f64 = z; + _handle_error("pow", _FpCodePow, zm.u64, _UNDERFLOW, AMD_F_INEXACT|AMD_F_UNDERFLOW, ERANGE, x, y, 2); + } + break; + + case POW_Z_INF: + { + UT64 zm; zm.f64 = z; + _handle_error("pow", _FpCodePow, zm.u64, _OVERFLOW, AMD_F_INEXACT|AMD_F_OVERFLOW, ERANGE, x, y, 2); + } + break; + } + + return z; +} diff --git a/sdk/lib/crt/math/libm_sse2/remainder.c 
b/sdk/lib/crt/math/libm_sse2/remainder.c new file mode 100644 index 00000000000..67a4d0a24d4 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainder.c @@ -0,0 +1,319 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_NAN_WITH_FLAGS +#define USE_SCALEDOUBLE_3 +#define USE_GET_FPSW_INLINE +#define USE_SET_FPSW_INLINE +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_NAN_WITH_FLAGS +#undef USE_SCALEDOUBLE_3 +#undef USE_GET_FPSW_INLINE +#undef USE_SET_FPSW_INLINE +#undef USE_HANDLE_ERROR + +#if !defined(_CRTBLD_C9X) +#define _CRTBLD_C9X +#endif + +#include "libm_errno.h" + +/* Computes the exact product of x and y, the result being the + nearly doublelength number (z,zz) */ +static inline void dekker_mul12(double x, double y, + double *z, double *zz) +{ + double hx, tx, hy, ty; + /* Split x into hx (head) and tx (tail). Do the same for y. 
*/ + unsigned long u; + GET_BITS_DP64(x, u); + u &= 0xfffffffff8000000; + PUT_BITS_DP64(u, hx); + tx = x - hx; + GET_BITS_DP64(y, u); + u &= 0xfffffffff8000000; + PUT_BITS_DP64(u, hy); + ty = y - hy; + *z = x * y; + *zz = (((hx * hy - *z) + hx * ty) + tx * hy) + tx * ty; +} + +#pragma function(fmod) +#undef _FUNCNAME +#if defined(COMPILING_FMOD) +double fmod(double x, double y) +#define _FUNCNAME "fmod" +#define _OPERATION OP_FMOD +#else +double remainder(double x, double y) +#define _FUNCNAME "remainder" +#define _OPERATION OP_REM +#endif +{ + double dx, dy, scale, w, t, v, c, cc; + int i, ntimes, xexp, yexp; + unsigned long u, ux, uy, ax, ay, todd; + unsigned int sw; + + dx = x; + dy = y; + + + GET_BITS_DP64(dx, ux); + GET_BITS_DP64(dy, uy); + ax = ux & ~SIGNBIT_DP64; + ay = uy & ~SIGNBIT_DP64; + xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + + if (xexp < 1 || xexp > BIASEDEMAX_DP64 || + yexp < 1 || yexp > BIASEDEMAX_DP64) + { + /* x or y is zero, denormalized, NaN or infinity */ + if (xexp > BIASEDEMAX_DP64) + { + /* x is NaN or infinity */ + if (ux & MANTBITS_DP64) + { + /* x is NaN */ + return _handle_error(_FUNCNAME, _OPERATION, ux|0x0008000000000000, _DOMAIN, 0, + EDOM, x, y, 2); + } + else + { + /* x is infinity; result is NaN */ + return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); + } + } + else if (yexp > BIASEDEMAX_DP64) + { + /* y is NaN or infinity */ + if (uy & MANTBITS_DP64) + { + /* y is NaN */ + return _handle_error(_FUNCNAME, _OPERATION, uy|0x0008000000000000, _DOMAIN, 0, + EDOM, x, y, 2); + } + else + { +#ifdef _CRTBLD_C9X + /* C99 return for y = +-inf is x */ + return x; +#else + /* y is infinity; result is indefinite */ + return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); +#endif + } + } + else if (ax == 0x0000000000000000) + { + /* x is zero */ + if (ay == 0x0000000000000000) + { + /* y is zero */ + return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); + } + else + /* C99 return for x = 0 must preserve sign */ + return x; + } + else if (ay == 0x0000000000000000) + { + /* y is zero */ + return _handle_error(_FUNCNAME, _OPERATION, INDEFBITPATT_DP64, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); + } + + /* We've exhausted all other possibilities. One or both of x and + y must be denormalized */ + if (xexp < 1) + { + /* x is denormalized. Figure out its exponent. */ + u = ax; + while (u < IMPBIT_DP64) + { + xexp--; + u <<= 1; + } + } + if (yexp < 1) + { + /* y is denormalized. Figure out its exponent. */ + u = ay; + while (u < IMPBIT_DP64) + { + yexp--; + u <<= 1; + } + } + } + else if (ax == ay) + { + /* abs(x) == abs(y); return zero with the sign of x */ + PUT_BITS_DP64(ux & SIGNBIT_DP64, dx); + return dx; + } + + /* Set x = abs(x), y = abs(y) */ + PUT_BITS_DP64(ax, dx); + PUT_BITS_DP64(ay, dy); + + if (ax < ay) + { + /* abs(x) < abs(y) */ +#if !defined(COMPILING_FMOD) + if (dx > 0.5*dy) + dx -= dy; +#endif + return x < 0.0? -dx : dx; + } + + /* Save the current floating-point status word. We need + to do this because the remainder function is always + exact for finite arguments, but our algorithm causes + the inexact flag to be raised. We therefore need to + restore the entry status before exiting. */ + sw = get_fpsw_inline(); + + /* Set ntimes to the number of times we need to do a + partial remainder. 
If the exponent of x is an exact multiple + of 52 larger than the exponent of y, and the mantissa of x is + less than the mantissa of y, ntimes will be one too large + but it doesn't matter - it just means that we'll go round + the loop below one extra time. */ + if (xexp <= yexp) + ntimes = 0; + else + ntimes = (xexp - yexp) / 52; + + if (ntimes == 0) + { + w = dy; + scale = 1.0; + } + else + { + /* Set w = y * 2^(52*ntimes) */ + w = scaleDouble_3(dy, ntimes * 52); + + /* Set scale = 2^(-52) */ + PUT_BITS_DP64((unsigned long)(-52 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, + scale); + } + + + /* Each time round the loop we compute a partial remainder. + This is done by subtracting a large multiple of w + from x each time, where w is a scaled up version of y. + The subtraction must be performed exactly in quad + precision, though the result at each stage can + fit exactly in a double precision number. */ + for (i = 0; i < ntimes; i++) + { + /* t is the integer multiple of w that we will subtract. + We use a truncated value for t. + + N.B. w has been chosen so that the integer t will have + at most 52 significant bits. This is the amount by + which the exponent of the partial remainder dx gets reduced + every time around the loop. In theory we could use + 53 bits in t, but the quad precision multiplication + routine dekker_mul12 does not allow us to do that because + it loses the last (106th) bit of its quad precision result. */ + + /* Set dx = dx - w * t, where t is equal to trunc(dx/w). */ + t = (double)(long)(dx / w); + /* At this point, t may be one too large due to + rounding of dx/w */ + + /* Compute w * t in quad precision */ + dekker_mul12(w, t, &c, &cc); + + /* Subtract w * t from dx */ + v = dx - c; + dx = v + (((dx - v) - c) - cc); + + /* If t was one too large, dx will be negative. Add back + one w */ + /* It might be possible to speed up this loop by finding + a way to compute correctly truncated t directly from dx and w. + We would then avoid the need for this check on negative dx. */ + if (dx < 0.0) + dx += w; + + /* Scale w down by 2^(-52) for the next iteration */ + w *= scale; + } + + /* One more time */ + /* Variable todd says whether the integer t is odd or not */ + t = (double)(long)(dx / w); + todd = ((long)(dx / w)) & 1; + dekker_mul12(w, t, &c, &cc); + v = dx - c; + dx = v + (((dx - v) - c) - cc); + if (dx < 0.0) + { + todd = !todd; + dx += w; + } + + /* At this point, dx lies in the range [0,dy) */ +#if !defined(COMPILING_FMOD) + /* For the fmod function, we're done apart from setting + the correct sign. */ + /* For the remainder function, we need to adjust dx + so that it lies in the range (-y/2, y/2] by carefully + subtracting w (== dy == y) if necessary. The rigmarole + with todd is to get the correct sign of the result + when x/y lies exactly half way between two integers, + when we need to choose the even integer. */ + if (ay < 0x7fd0000000000000) + { + if (dx + dx > w || (todd && (dx + dx == w))) + dx -= w; + } + else if (dx > 0.5 * w || (todd && (dx == 0.5 * w))) + dx -= w; + +#endif + + /* **** N.B. for some reason this breaks the 32 bit version + of remainder when compiling with optimization. */ + /* Restore the entry status flags */ + set_fpsw_inline(sw); + + /* Set the result sign according to input argument x */ + return x < 0.0? 
-dx : dx; + +} diff --git a/sdk/lib/crt/math/libm_sse2/remainder_piby2.c b/sdk/lib/crt/math/libm_sse2/remainder_piby2.c new file mode 100644 index 00000000000..412bcb26df6 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainder_piby2.c @@ -0,0 +1,251 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + + +/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using + extra precision, and return the result in r, rr. + Return value "region" tells how many lots of pi/2 were subtracted + from x to put it in the range [-pi/4,pi/4], mod 4. 
*/ +void __remainder_piby2(double x, double *r, double *rr, int *region) +{ + /* This method simulates multi-precision floating-point + arithmetic and is accurate for all 1 <= x < infinity */ + static const double + piby2_lead = 1.57079632679489655800e+00, /* 0x3ff921fb54442d18 */ + piby2_part1 = 1.57079631090164184570e+00, /* 0x3ff921fb50000000 */ + piby2_part2 = 1.58932547122958567343e-08, /* 0x3e5110b460000000 */ + piby2_part3 = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */ + const int bitsper = 10; + unsigned long res[500]; + unsigned long ux, u, carry, mask, mant, highbitsrr; + int first, last, i, rexp, xexp, resexp, ltb, determ; + double xx, t; + static unsigned long pibits[] = + { + 0, 0, 0, 0, 0, 0, + 162, 998, 54, 915, 580, 84, 671, 777, 855, 839, + 851, 311, 448, 877, 553, 358, 316, 270, 260, 127, + 593, 398, 701, 942, 965, 390, 882, 283, 570, 265, + 221, 184, 6, 292, 750, 642, 465, 584, 463, 903, + 491, 114, 786, 617, 830, 930, 35, 381, 302, 749, + 72, 314, 412, 448, 619, 279, 894, 260, 921, 117, + 569, 525, 307, 637, 156, 529, 504, 751, 505, 160, + 945, 1022, 151, 1023, 480, 358, 15, 956, 753, 98, + 858, 41, 721, 987, 310, 507, 242, 498, 777, 733, + 244, 399, 870, 633, 510, 651, 373, 158, 940, 506, + 997, 965, 947, 833, 825, 990, 165, 164, 746, 431, + 949, 1004, 287, 565, 464, 533, 515, 193, 111, 798 + }; + + GET_BITS_DP64(x, ux); + + + xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); + ux = (ux & MANTBITS_DP64) | IMPBIT_DP64; + + /* Now ux is the mantissa bit pattern of x as a long integer */ + carry = 0; + mask = 1; + mask = (mask << bitsper) - 1; + + /* Set first and last to the positions of the first + and last chunks of 2/pi that we need */ + first = xexp / bitsper; + resexp = xexp - first * bitsper; + /* 180 is the theoretical maximum number of bits (actually + 175 for IEEE double precision) that we need to extract + from the middle of 2/pi to compute the reduced argument + accurately enough for our purposes */ + last = first + 180 / bitsper; + + /* Do a long multiplication of the bits of 2/pi by the + integer mantissa */ +#if 0 + for (i = last; i >= first; i--) + { + u = pibits[i] * ux + carry; + res[i - first] = u & mask; + carry = u >> bitsper; + } + res[last - first + 1] = 0; +#else + /* Unroll the loop. This is only correct because we know + that bitsper is fixed as 10. 
*/ + res[19] = 0; + u = pibits[last] * ux; + res[18] = u & mask; + carry = u >> bitsper; + u = pibits[last-1] * ux + carry; + res[17] = u & mask; + carry = u >> bitsper; + u = pibits[last-2] * ux + carry; + res[16] = u & mask; + carry = u >> bitsper; + u = pibits[last-3] * ux + carry; + res[15] = u & mask; + carry = u >> bitsper; + u = pibits[last-4] * ux + carry; + res[14] = u & mask; + carry = u >> bitsper; + u = pibits[last-5] * ux + carry; + res[13] = u & mask; + carry = u >> bitsper; + u = pibits[last-6] * ux + carry; + res[12] = u & mask; + carry = u >> bitsper; + u = pibits[last-7] * ux + carry; + res[11] = u & mask; + carry = u >> bitsper; + u = pibits[last-8] * ux + carry; + res[10] = u & mask; + carry = u >> bitsper; + u = pibits[last-9] * ux + carry; + res[9] = u & mask; + carry = u >> bitsper; + u = pibits[last-10] * ux + carry; + res[8] = u & mask; + carry = u >> bitsper; + u = pibits[last-11] * ux + carry; + res[7] = u & mask; + carry = u >> bitsper; + u = pibits[last-12] * ux + carry; + res[6] = u & mask; + carry = u >> bitsper; + u = pibits[last-13] * ux + carry; + res[5] = u & mask; + carry = u >> bitsper; + u = pibits[last-14] * ux + carry; + res[4] = u & mask; + carry = u >> bitsper; + u = pibits[last-15] * ux + carry; + res[3] = u & mask; + carry = u >> bitsper; + u = pibits[last-16] * ux + carry; + res[2] = u & mask; + carry = u >> bitsper; + u = pibits[last-17] * ux + carry; + res[1] = u & mask; + carry = u >> bitsper; + u = pibits[last-18] * ux + carry; + res[0] = u & mask; +#endif + + + /* Reconstruct the result */ + ltb = (int)((((res[0] << bitsper) | res[1]) + >> (bitsper - 1 - resexp)) & 7); + + /* determ says whether the fractional part is >= 0.5 */ + determ = ltb & 1; + + + i = 1; + if (determ) + { + /* The mantissa is >= 0.5. We want to subtract it + from 1.0 by negating all the bits */ + *region = ((ltb >> 1) + 1) & 3; + mant = 1; + mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1); + while (mant < 0x0020000000000000) + { + i++; + mant = (mant << bitsper) | (~(res[i]) & mask); + } + highbitsrr = ~(res[i + 1]) << (64 - bitsper); + } + else + { + *region = (ltb >> 1); + mant = 1; + mant = res[1] & ((mant << (bitsper - resexp)) - 1); + while (mant < 0x0020000000000000) + { + i++; + mant = (mant << bitsper) | res[i]; + } + highbitsrr = res[i + 1] << (64 - bitsper); + } + + rexp = 52 + resexp - i * bitsper; + + while (mant >= 0x0020000000000000) + { + rexp++; + highbitsrr = (highbitsrr >> 1) | ((mant & 1) << 63); + mant >>= 1; + } + + + /* Put the result exponent rexp onto the mantissa pattern */ + u = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64; + ux = (mant & MANTBITS_DP64) | u; + if (determ) + /* If we negated the mantissa we negate x too */ + ux |= SIGNBIT_DP64; + PUT_BITS_DP64(ux, x); + + /* Create the bit pattern for rr */ + highbitsrr >>= 12; /* Note this is shifted one place too far */ + u = ((unsigned long)rexp + EXPBIAS_DP64 - 53) << EXPSHIFTBITS_DP64; + PUT_BITS_DP64(u, t); + u |= highbitsrr; + PUT_BITS_DP64(u, xx); + + /* Subtract the implicit bit we accidentally added */ + xx -= t; + /* Set the correct sign, and double to account for the + "one place too far" shift */ + if (determ) + xx *= -2.0; + else + xx *= 2.0; + + + /* (x,xx) is an extra-precise version of the fractional part of + x * 2 / pi. Multiply (x,xx) by pi/2 in extra precision + to get the reduced argument (r,rr). 
*/ + { + double hx, tx, c, cc; + /* Split x into hx (head) and tx (tail) */ + GET_BITS_DP64(x, ux); + ux &= 0xfffffffff8000000; + PUT_BITS_DP64(ux, hx); + tx = x - hx; + + c = piby2_lead * x; + cc = ((((piby2_part1 * hx - c) + piby2_part1 * tx) + + piby2_part2 * hx) + piby2_part2 * tx) + + (piby2_lead * xx + piby2_part3 * x); + *r = c + cc; + *rr = (c - *r) + cc; + } + + return; +} diff --git a/sdk/lib/crt/math/libm_sse2/remainder_piby2_forAsm.asm b/sdk/lib/crt/math/libm_sse2/remainder_piby2_forAsm.asm new file mode 100644 index 00000000000..2bf8c0feea9 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainder_piby2_forAsm.asm @@ -0,0 +1,415 @@ +; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
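Before the asm reduction below, note how the C routines just shown (dekker_mul12 in remainder.c and the final pi/2 multiplication in __remainder_piby2) prepare exact products: each operand is split into a "head" with the low 27 mantissa bits cleared and a "tail" holding the rest, so that the product can be reassembled as a head-plus-tail (double-double) result. A minimal sketch of that split, with an illustrative helper name that is not part of the library:

    #include <stdint.h>
    #include <string.h>

    /* Split x into hi + lo, where hi has the low 27 mantissa bits cleared
       (mask 0xfffffffff8000000, as used in dekker_mul12 above) and
       lo = x - hi is computed exactly. */
    static void split_hi_lo(double x, double *hi, double *lo)
    {
        uint64_t u;
        memcpy(&u, &x, sizeof u);
        u &= 0xFFFFFFFFF8000000ULL;   /* keep sign, exponent, high mantissa bits */
        memcpy(hi, &u, sizeof *hi);
        *lo = x - *hi;                /* exact: hi agrees with x in its high bits */
    }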
+; +; An implementation of the remainder by pi/2 function +; This is a service routine for use by trig functions coded in asm +; +; On input, +; xmm0 = x; +; On ouput +; xmm0 = r +; xmm1 = rr +; xmm2 = region + +.const +ALIGN 16 +L__piby2_part3_piby2_lead DQ 03ff921fb54442d18h, 03c91a62633145c06h +L__piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h +L__piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h +;; constants for CW reduction +L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h +L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h +L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h +L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h +L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h +L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h +L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h +L_point_five DQ 03FE0000000000000h, 03FE0000000000000h +L_int_three DQ 00000000000000003h, 00000000000000003h +L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h +L_signbit DQ 08000000000000000h, 08000000000000000h +L_int_1 DQ 00000000000000001h, 00000000000000001h +L_int_15 DQ 0000000000000000Fh +L_int_48 DQ 00000000000000030h +L_3pio4 DQ 04002D97C7F3321D2h +L_5pio4 DQ 0400F6A7A2955385Eh +L_7pio4 DQ 04015FDBBE9BBA775h +L_9pio4 DQ 0401c463abeccb2bbh +ALIGN 16 +L__2_by_pi_bits DB 224, 241, 27, 193, 12, 88, 33, 116 + DB 53, 126, 196, 126, 237, 175, 169, 75 + DB 74, 41, 222, 231, 28, 244, 236, 197 + DB 151, 175, 31, 235, 158, 212, 181, 168 + DB 127, 121, 154, 253, 24, 61, 221, 38 + DB 44, 159, 60, 251, 217, 180, 125, 180 + DB 41, 104, 45, 70, 188, 188, 63, 96 + DB 22, 120, 255, 95, 226, 127, 236, 160 + DB 228, 247, 46, 126, 17, 114, 210, 231 + DB 76, 13, 230, 88, 71, 230, 4, 249 + DB 125, 209, 154, 192, 113, 166, 19, 18 + DB 237, 186, 212, 215, 8, 162, 251, 156 + DB 166, 196, 114, 172, 119, 248, 115, 72 + DB 70, 39, 168, 187, 36, 25, 128, 75 + DB 55, 9, 233, 184, 145, 220, 134, 21 + DB 239, 122, 175, 142, 69, 249, 7, 65 + DB 14, 241, 100, 86, 138, 109, 3, 119 + DB 211, 212, 71, 95, 157, 240, 167, 84 + DB 16, 57, 185, 13, 230, 139, 2, 0 + DB 0, 0, 0, 0, 0, 0 + + +; local storage offsets +region EQU 000h +stack_size EQU 018h +sstack_size EQU 000h ; no stack for fsname + +include fm.inc + +fname TEXTEQU <__remainder_piby2_forAsm> +fsname TEXTEQU <__remainder_piby2_cw_forAsm> + + +.code + +; xmm0l has |x| +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + + ; This function is not using rdx, r8, and r9 as pointers; + ; all returns are in registers + + ; get the unbiased exponent and the mantissa part of x + lea r9,L__2_by_pi_bits + + ;xexp = (x >> 52) - 1023 + movd r11,xmm0 + mov rcx,r11 + shr r11,52 + sub r11,1023 ; r11 <-- xexp = exponent of input x + + ;calculate the last byte from which to start multiplication + ;last = 134 - (xexp >> 3) + mov r10,r11 + shr r10,3 + sub r10,134 ; r10 <-- -last + neg r10 ; r10 <-- last + + ; load 64 bits of 2_by_pi + mov rax,[r9 + r10] + + ; mantissa of x = ((x << 12) >> 12) | implied bit + shl rcx,12 + shr rcx,12 ; rcx <-- mantissa part of input x + bts rcx,52 ; add the implied bit as well + + ; load next 128 bits of 2_by_pi + add r10,8 ;increment to next 8 bytes of 2_by_pi + movdqu xmm0,[r9 + r10] + + ; do three 64-bit multiplications with mant of x + mul rcx + mov r8,rax ; r8 <-- last 64 bits of mul = res1[2] + mov r10,rdx ; r10 <-- carry + movd rax,xmm0 + mul rcx + ; resexp = xexp & 7 + and r11,7 ; r11 <-- resexp = xexp & 7 = last 3 bits + psrldq xmm0,8 + add rax,r10 ; add the previous carry + adc rdx,0 + mov r9,rax ; 
r9 <-- next 64 bits of mul = res1[1] + mov r10,rdx ; r10 <-- carry + movd rax,xmm0 + mul rcx + add r10,rax ; r10 <-- most sig. 64 bits = res1[0] + ; find the region + ; last three bits ltb = most sig bits >> (54 - resexp)); + ; decimal point in last 18 bits ==> 8 lsb's in first 64 bits + ; and 8 msb's in next 64 bits + ; point_five = ltb & 01h; + ; region = ((ltb >> 1) + point_five) & 3; + mov rcx,54 + mov rax,r10 + sub rcx,r11 + xor rdx,rdx ; rdx <-- sign of x + shr rax,cl + jnc L__no_point_five + ; if there is carry then negate the result of multiplication + not r10 + not r9 + not r8 + mov rdx,08000000000000000h + +ALIGN 16 +L__no_point_five: + adc rax,0 + and rax,3 ; rax now has region + mov QWORD PTR [region+rsp],rax + + ; calculate the number of integer bits and zero them out + mov rcx,r11 + add rcx,10 ; rcx = no. of integer bits + shl r10,cl + shr r10,cl ; r10 contains only mant bits + sub rcx,64 ; form the exponent + mov r11,rcx + + ;find the highest set bit + bsr rcx,r10 + jnz L__form_mantissa + mov r10,r9 + mov r9,r8 + mov r8,0 + bsr rcx,r10 ; rcx = hsb + sub r11,64 + + +ALIGN 16 +L__form_mantissa: + add r11,rcx ; for exp of x + sub rcx,52 ; rcx = no. of bits to shift in r10 + cmp rcx,0 + jl L__hsb_below_52 + je L__form_numbers + ; hsb above 52 + mov r8,r10 ; previous contents of r8 not required + shr r10,cl ; r10 = mantissa of x with hsb at 52 + shr r9,cl ; make space for bits from r10 + sub rcx,64 + neg rcx + ; rcx <-- no of bits to shift r10 to move those bits to r9 + shl r8,cl + or r9,r8 ; r9 = mantissa bits of xx + jmp L__form_numbers + +ALIGN 16 +L__hsb_below_52: + neg rcx + mov rax,r9 + shl r10,cl + shl r9,cl + sub rcx,64 + neg rcx + shr rax,cl + or r10,rax + shr r8,cl + or r9,r8 + +ALIGN 16 +L__form_numbers: + add r11,1023 + btr r10,52 ; remove the implicit bit + mov rcx,r11 + or r10,rdx ; put the sign + shl rcx,52 + or r10,rcx ; r10 <-- x + + movd xmm0,r10 ; xmm0 <-- x + movdqa xmm1,xmm0 ; xmm1 <-- x + psrlq xmm1,27 + psllq xmm1,27 ; xmm1 <-- hx + movdqa xmm2,xmm0 ; xmm2 <-- x + subsd xmm2,xmm1 ; xmm2 <-- tx + movlhps xmm0,xmm0 ; xmm0 <-- x,x + movlhps xmm2,xmm1 ; xmm2 <-- hx,tx + + movdqa xmm1,XMMWORD PTR L__piby2_part3_piby2_lead + movdqa xmm3,XMMWORD PTR L__piby2_part1 + movdqa xmm4,XMMWORD PTR L__piby2_part2 + + ; form xx + xor rcx,rcx + bsr rcx,r9 + sub rcx,64 ; to shift the implicit bit as well + neg rcx + shl r9,cl + shr r9,12 + add rcx,52 + sub r11,rcx + shl r11,52 + or r9,rdx + or r9,r11 + movd xmm5,r9 ; xmm5 <-- xx + + mulpd xmm0,xmm1 ; xmm0 <-- piby2_part3 * x,piby2_lead * x = c + mulpd xmm5,xmm1 ; xmm5 <-- piby2_lead * xx + mulpd xmm3,xmm2 ; xmm3 <-- piby2_part1 * hx,piby2_part1 * tx + mulpd xmm4,xmm2 ; xmm4 <-- piby2_part2 * hx,piby2_part2 * tx + + ; cc = (piby2_part1 * hx - c) + (piby2_part1 * tx) + + ; (piby2_part2 * hx) + (piby2_part2 * tx) + + ; (piby2_lead * xx + piby2_part3 * x) + movhlps xmm1,xmm3 ; xmm1 = piby2_part1 * hx + movhlps xmm2,xmm4 ; xmm2 = piby2_part2 * hx + subsd xmm1,xmm0 ; xmm1 = (piby2_part1 * hx - c) + addsd xmm1,xmm3 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + movhlps xmm3,xmm0 ; xmm3 = piby2_part3 * x + addsd xmm1,xmm2 + ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + (piby2_part2 * hx) + addsd xmm3,xmm5 ; xmm3 = (piby2_lead * xx + piby2_part3 * x) + addsd xmm1,xmm4 + ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + + ; (piby2_part2 * hx) + (piby2_part2 * tx) + addsd xmm1,xmm3 ; xmm1 = cc + + ; xmm0 <-- c, xmm1 <-- cc + ; r = c + cc + ; rr = (c - r) + cc + + movdqa xmm2,xmm0 ; xmm2 <-- copy of c + addsd 
xmm0,xmm1 ; xmm0 <-- r = c + cc + subsd xmm2,xmm0 ; xmm2 <-- c - r + addsd xmm1,xmm2 ; xmm1 <-- rr = cc + (c - r) + mov rax, QWORD PTR[region+rsp] ; rax <-- region + + StackDeallocate stack_size + ret + +fname endp + +; NOTE: If this is not going to be used, should probably remove it. - WAT +ALIGN 16 +PUBLIC fsname +fsname PROC FRAME + StackAllocate sstack_size + .ENDPROLOG + +; xmm0l has |x| +; r9 also has |x| +; ASSUMPTION: if we call this function, |x| > pi/4 + + xor r8d,r8d + cmp r9, QWORD PTR L_5pio4 + ja Lax_gt_5pio4 + cmp r9, QWORD PTR L_3pio4 + seta r8b + inc r8d + jmp Lstage_npi2 +Lax_gt_5pio4: + cmp r9, QWORD PTR L_9pio4 + ja Lnpi2_full_computation + cmp r9, QWORD PTR L_7pio4 + seta r8b + add r8d,3 +Lstage_npi2: + movd xmm2, r8d + cvtdq2pd xmm4, xmm2 + jmp Lnpi2_known + +Lnpi2_full_computation: +; movapd xmm1, L_twobypi +; movapd xmm3, L_point_five + movapd xmm5,xmm0 +; mulsd xmm5,xmm1 +; addsd xmm5,xmm3 ; xmm5 <-- |x|*2/pi + .5 + mulsd xmm5, L_twobypi + addsd xmm5, L_point_five + + cvttpd2dq xmm5,xmm5 ; xmm5 < npi2 = int part + movapd xmm2,xmm5 + andpd xmm2,L_int_three + cvtdq2pd xmm4,xmm5 + +Lnpi2_known: + movapd xmm5,xmm4 + mulsd xmm5,QWORD PTR L_piby2_1 ; xmm5 <-- npi2*piby2_1 + xorpd xmm5,L_signbit ; xmm5 <-- -npi2*piby2_1 + addpd xmm5,xmm0 ; xmm5 <-- rhead = x - npi2*piby2_1 + movapd xmm3,xmm4 + mulsd xmm3,QWORD PTR L_piby2_1tail ; xmm3 <-- rtail = npi2*piby2_1tail + + ; If x is nearly a multiple of pi/2, rhead will be small compared to |x| + ; we check this by checking exponent difference. + + ; Note that both the unbiased exponents are positive, and that of rhead + ; must be <= that of |x| + movapd xmm1,xmm5 ; xmm1l <-- rhead + subpd xmm1,xmm3 ; xmm1l <-- r = rhead - rtail + andpd xmm1,L_inf_mask_64 + psubq xmm0,xmm1 ; xmm0 <-- |x| - r + psrlq xmm0,52 + comisd xmm0,L_int_15 + +; movd rax, xmm5 ; really a movq +; shr rax, 52 +; shr rdx, 52 ; get exponent of |x| (no and needed) +; sub rdx, rax +; cmp rdx, 15 + jbe Lcw_get_r_rr + + ; here expdiff > 15, so x is nearly a multiple of pi/2 and things are hard + ; we use another piece of pi/2 in the reduction + + movapd xmm1,xmm5 + movapd xmm3,xmm4 + mulsd xmm3,QWORD PTR L_piby2_2 ; xmm3 <--- rtail = npi2*piby2_2 + subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail + + ; now rtail = npi2*piby2_2tail - ((t-rhead) - rtail) + subsd xmm1,xmm5 + subsd xmm1,xmm3 + movapd xmm3,xmm4 + mulsd xmm3,QWORD PTR L_piby2_2tail + subsd xmm3,xmm1 ; xmm3 <-- rtail + + comisd xmm0,L_int_48 +; cmp rdx, 48 + jbe Lcw_get_r_rr + + ; here expdiff > 48, so x is REALLY close to a multiple of pi/2 + ; and we use yet another piece of pi/2 in the reduction + + movapd xmm0,xmm5 ; xmm0 <-- t = rhead + movapd xmm3,xmm4 + mulsd xmm3,QWORD PTR L_piby2_3 ; xmm3 <-- rtail = npi2 * piby2_3 + movapd xmm5,xmm0 + subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail + + ; now rtail = npi2 * piby2_3tail - ((t - rhead) - rtail) + movapd xmm1,xmm0 + subsd xmm1,xmm5 + subsd xmm1,xmm3 + movapd xmm3,xmm4 + mulsd xmm3,QWORD PTR L_piby2_3tail + subsd xmm3,xmm1 ; xmm3 <-- rtail + +Lcw_get_r_rr: + ; We have a satisfactory rhead in xmm5 and rtail in xmm3 + ; We now produce r in xmm0 and rr in xmm1, where the actual reduced argument + ; is the sum of r and rr, and rr is insignificant + ; with respect to r under addition (i.e., r + rr == r). 
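The comment above describes the standard two-term recovery of a rounding error, which the instructions immediately below implement: r = rhead - rtail, and rr = (rhead - r) - rtail captures what the first subtraction rounded away (valid because |rtail| is tiny compared with |rhead|). The same step in C, purely for reference:

    /* C equivalent of the Lcw_get_r_rr computation below: r is the rounded
       difference, rr the rounding error, so r + rr represents rhead - rtail
       exactly when |rtail| is much smaller than |rhead|. */
    static void reduce_to_r_rr(double rhead, double rtail, double *r, double *rr)
    {
        *r  = rhead - rtail;
        *rr = (rhead - *r) - rtail;
    }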
+ movapd xmm0,xmm5 ; xmm0 <-- rhead + subsd xmm0,xmm3 ; xmm0 <-- r = rhead - rtail + movapd xmm1,xmm5 ; xmm1 <-- rhead + subsd xmm1,xmm0 ; xmm1 <-- (rhead - r) + subsd xmm1,xmm3 ; xmm1 <-- rr = (rhead - r) - rtail + movd rax,xmm2 ; rax <-- region + StackDeallocate sstack_size + ret +fsname endp + +END diff --git a/sdk/lib/crt/math/libm_sse2/remainder_piby2_forFMA3.asm b/sdk/lib/crt/math/libm_sse2/remainder_piby2_forFMA3.asm new file mode 100644 index 00000000000..dc9b96cd3fb --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainder_piby2_forFMA3.asm @@ -0,0 +1,283 @@ +; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; An implementation of the remainder by pi/2 function using fma3 +; This is a service routine for use by trig functions coded in asm that use fma3 +; +; On input, +; xmm0 = x; +; On ouput +; xmm0 = r +; xmm1 = rr +; rax = region + +.const +ALIGN 16 +L_piby2_lead DQ 03ff921fb54442d18h, 03ff921fb54442d18h +L_fff800 DQ 0fffffffff8000000h, 0fffffffff8000000h +L_piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h +L_piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h +L_piby2_part3 DQ 03c91a62633145c06h, 03c91a62633145c06h +L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h +L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h +L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h +L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h +L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h +L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h +L_sign_mask DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh +L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h +L_point_five DQ 03FE0000000000000h, 03FE0000000000000h +L_int_three DQ 00000000000000003h, 00000000000000003h +L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h +L_signbit DQ 08000000000000000h, 08000000000000000h +;; constants for BDL reduction +L_r DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h ; 2/pi +L_xc1 DQ 03FF921FB54442D18H, 03FF921FB54442D18h ; pi/2 (L_piby2_lead) +L_xc2 DQ 03C91A62633145C00H, 03C91A62633145C00h ; pi/2 part 2 +L_xc3 DQ 0397B839A252049C0H, 0397B839A252049C0h ; pi/2 part 3 +; sigma is 3*2^(p-n-2) where n is 0 and p is 53. +L_sigma DQ 04338000000000000h, 04338000000000000h ; 6755399441055744. 
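The L_sigma constant above (6755399441055744 = 3*2^51) is what lets the Boldo-Daumas-Li routine later in this file round z = x*(2/pi) to the nearest integer entirely in floating point: adding sigma pushes the fraction bits of z out of the significand (rounding them away under round-to-nearest), and subtracting it back leaves an integer-valued double. A hedged C illustration, assuming the default round-to-nearest mode:

    /* Illustrative only: the add/subtract-sigma rounding trick used with
       L_sigma in the fbname routine below.  Requires round-to-nearest and
       |z| small enough that z + sigma stays within sigma's binade. */
    static double round_nearest_via_sigma(double z)
    {
        const double sigma = 6755399441055744.0;  /* 3 * 2^51, L_sigma above */
        double t = z + sigma;                     /* fraction bits rounded away */
        return t - sigma;                         /* integer-valued double */
    }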
+ +EXTRN __L_2_by_pi_bits:BYTE + +region EQU 020h +stack_size EQU 038h + +include fm.inc + +fname TEXTEQU <__remainder_piby2_fma3> +fbname TEXTEQU <__remainder_piby2_fma3_bdl> + +.code + +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + + ; This function is not using rdx, r8, and r9 as pointers; + ; all returns are in registers + + ; get the unbiased exponent and the mantissa part of x + lea r9,__L_2_by_pi_bits + + ; xexp = (x >> 52) - 1023 + vmovq r11,xmm0 + mov rcx,r11 + shr r11,52 + sub r11,1023 ; r11 <-- xexp = exponent of input x + + ; calculate the last byte from which to start multiplication + ; last = 134 - (xexp >> 3) + mov r10,r11 + shr r10,3 + sub r10,134 ; r10 <-- -last + neg r10 ; r10 <-- last + + ; load 64 bits of 2_by_pi + mov rax,[r9 + r10] + + ; mantissa of x = ((x << 12) >> 12) | implied bit + shl rcx,12 + shr rcx,12 ; rcx <-- mantissa part of input x + bts rcx,52 ; add the implied bit as well + + ; load next 128 bits of 2_by_pi + add r10,8 ; increment to next 8 bytes of 2_by_pi + vmovdqu xmm0,XMMWORD PTR[r9 + r10] + + ; do three 64-bit multiplications with mant of x + mul rcx + mov r8,rax ; r8 <-- last 64 bits of mul = res1[2] + mov r10,rdx ; r10 <-- carry + vmovq rax,xmm0 + mul rcx + ; resexp = xexp & 7 + and r11,7 ; r11 <-- resexp = last 3 bits of xexp + vpsrldq xmm0,xmm0,8 + add rax,r10 ; add the previous carry + adc rdx,0 + mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] + mov r10,rdx ; r10 <-- carry + vmovq rax,xmm0 + mul rcx + add r10,rax ; r10 <-- most sig. 64 bits = res1[0] + + ; find the region + ; last three bits ltb = most sig bits >> (54 - resexp)); + ; decimal point in last 18 bits ==> 8 lsb's in first 64 bits + ; and 8 msb's in next 64 bits + ; point_five = ltb & 01h; + ; region = ((ltb >> 1) + point_five) & 3; + mov rcx,54 + mov rax,r10 + sub rcx,r11 + xor rdx,rdx ; rdx <-- sign of x + shr rax,cl + jnc L__no_point_five + ; if there is carry then negate the result of multiplication + not r10 + not r9 + not r8 + mov rdx,08000000000000000h + +ALIGN 16 +L__no_point_five: + adc rax,0 + and rax,3 ; rax now has region + mov QWORD PTR [region+rsp], rax + + ; calculate the number of integer bits and zero them out + mov rcx,r11 + add rcx,10 ; rcx = no. of integer bits + shl r10,cl + shr r10,cl ; r10 contains only mant bits + sub rcx,64 ; form the exponent + mov r11,rcx + + ; find the highest set bit + bsr rcx,r10 + jnz L__form_mantissa + mov r10,r9 + mov r9,r8 + mov r8,0 + bsr rcx,r10 ; rcx = hsb + sub r11,64 + +ALIGN 16 +L__form_mantissa: + add r11,rcx ; for exp of x + sub rcx,52 ; rcx = no. 
of bits to shift in r10 + cmp rcx,0 + jl L__hsb_below_52 + je L__form_numbers + ; hsb above 52 + mov r8,r10 ; previous r8 not required + shr r10,cl ; r10 = mantissa of x with hsb at 52 + shr r9,cl ; make space for bits from r10 + sub rcx,64 + neg rcx + ; rcx <-- no of bits to shift r10 to move those bits to r9 + shl r8,cl + or r9,r8 ; r9 = mantissa bits of xx + jmp L__form_numbers + +ALIGN 16 +L__hsb_below_52: + ; rcx has shift count (< 0) + neg rcx + mov rax,r9 + shl r10,cl + shl r9,cl + sub rcx,64 + neg rcx + shr rax,cl + or r10,rax + shr r8,cl + or r9,r8 + +ALIGN 16 +; Here r11 has unbiased exponent +; r10 has mantissa, with implicit bit possibly set +; rdx has the sign bit +L__form_numbers: + add r11,1023 ; r11 <-- biased exponent + btr r10,52 ; remove the implicit bit + mov rcx,r11 ; rcx <-- copy of biased exponent + or r10,rdx ; put the sign + shl rcx,52 ; shift biased exponent into place + or r10,rcx ; r10 <-- x + vmovq xmm2,r10 ; xmm1l <-- x + + ; form xx +; xor rcx,rcx ; Why is this necessary??? + bsr rcx,r9 ; scan for high bit of xx mantissa + sub rcx,64 ; to shift the implied bit as well + neg rcx + shl r9,cl + shr r9,12 + add rcx,52 + sub r11,rcx + shl r11,52 + or r9,rdx + or r9,r11 + vmovq xmm1,r9 ; xmm1 <-- xx + vandpd xmm4,xmm2,L_fff800 ; xmm4 <-- hx + vsubsd xmm0,xmm2,xmm4 ; xmm0 <-- tx + vmulsd xmm5,xmm2,L_piby2_lead ; xmm5 <-- c + vmulsd xmm3,xmm4,L_piby2_part1 + vsubsd xmm3,xmm3,xmm5 + vfmadd231sd xmm3,xmm0,L_piby2_part1 + vfmadd231sd xmm3,xmm4,L_piby2_part2 + vfmadd231sd xmm3,xmm0,L_piby2_part2 + vmulsd xmm4,xmm1,L_piby2_lead + vfmadd231sd xmm4,xmm2,L_piby2_part3 + vaddsd xmm3,xmm3,xmm4 ; xmm3 <-- cc + vaddsd xmm0,xmm5,xmm3 ; xmm0 <--r + vsubsd xmm1,xmm5,xmm0 + vaddsd xmm1,xmm1,xmm3 ; xmm1 <-- rr + mov rax, QWORD PTR [region+rsp] + + StackDeallocate stack_size + ret +fname endp + +ALIGN 16 +PUBLIC fbname +fbname PROC FRAME + .ENDPROLOG + ; Boldo, Daumas, annd Li, "Formally Verified Argument + ; Reduction With a Fused Multiply-Add," + ; IEEE Trans. Comp., vol. 58, #8, Aug. 
2009 + ; coefficients are from table 1, mutatis mutandis + ; algorithm is their formula 3.1 (for getting z from sigma) and + ; algorithm 5.1 (and extended version) for actual reduction + vmovapd xmm1,xmm0 + vmovapd xmm4,L_xc2 ; xmm4 <-- xc2 + vmovapd xmm2,L_sigma + vfmadd132sd xmm1,xmm2,L_r ; z = arg*r + sigma + vsubsd xmm1,xmm1,xmm2 ; xmm1 <-- z -= sigma + vcvttpd2dq xmm5,xmm1 + vmovq rax, xmm5 + vmovapd xmm2,xmm1 + vfnmadd132sd xmm2,xmm0,L_xc1 ; xmm2 <-- u = arg - z*xc1 + vmulsd xmm3,xmm1,xmm4 ; xmm3 <-- p1 = z*xc2 + vmovapd xmm0,xmm1 ; xmm0 <-- copy of z + vfmsub213sd xmm0,xmm4,xmm3 ; xmm0 <-- p2 = z*xc2 - p1 + vsubsd xmm5,xmm2,xmm3 ; xmm5 <-- t1 = u - p1 + ; We really don't want to spill in this code, so we're commandeering xmm4 + vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- temp = u - t1 + vsubsd xmm4,xmm4,xmm3 ; xmm4 <-- t2 = temp - p1 + ; used to use xmm4 here for L_xc2 + vfnmadd231sd xmm2,xmm1,L_xc2 ; xmm2 <-- v1 = -xc2*z + u + vsubsd xmm5,xmm5,xmm2 ; xmm5 <-- v2 = t1 - v1 + vaddsd xmm5,xmm5,xmm4 ; xmm5 <-- v2 += t2 + vsubsd xmm5,xmm5,xmm0 ; xmm5 <-- v2 -= p2 + vmovapd xmm0,xmm2 ; xmm0 <-- arghead = v1 + vfnmadd132sd xmm1,xmm5,L_xc3 ; xmm1 <-- argtail = -xc3*z + v2 + and rax, 3 ; rax <-- region + ret +fbname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/remainder_piby2f.c b/sdk/lib/crt/math/libm_sse2/remainder_piby2f.c new file mode 100644 index 00000000000..71e97e8b1f5 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainder_piby2f.c @@ -0,0 +1,173 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + + +/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using + extra precision, and return the result in r. + Return value "region" tells how many lots of pi/2 were subtracted + from x to put it in the range [-pi/4,pi/4], mod 4. 
*/ +void __remainder_piby2f(unsigned long ux, double *r, int *region) +{ + + + /* This method simulates multi-precision floating-point + arithmetic and is accurate for all 1 <= x < infinity */ +#define bitsper 36 + unsigned long res[10]; + unsigned long u, carry, mask, mant, nextbits; + int first, last, i, rexp, xexp, resexp, ltb, determ, bc; + double dx; + static const double + piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ + static unsigned long pibits[] = + { + 0LL, + 5215LL, 13000023176LL, 11362338026LL, 67174558139LL, + 34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL, + 19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL + }; + + + xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); + ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29; + + + /* Now ux is the mantissa bit pattern of x as a long integer */ + mask = 1; + mask = (mask << bitsper) - 1; + + /* Set first and last to the positions of the first + and last chunks of 2/pi that we need */ + first = xexp / bitsper; + resexp = xexp - first * bitsper; + /* 120 is the theoretical maximum number of bits (actually + 115 for IEEE single precision) that we need to extract + from the middle of 2/pi to compute the reduced argument + accurately enough for our purposes */ + last = first + 120 / bitsper; + + + /* Do a long multiplication of the bits of 2/pi by the + integer mantissa */ +#if 0 + for (i = last; i >= first; i--) + { + u = pibits[i] * ux + carry; + res[i - first] = u & mask; + carry = u >> bitsper; + } + res[last - first + 1] = 0; +#else + /* Unroll the loop. This is only correct because we know + that bitsper is fixed as 36. */ + res[4] = 0; + u = pibits[last] * ux; + res[3] = u & mask; + carry = u >> bitsper; + u = pibits[last - 1] * ux + carry; + res[2] = u & mask; + carry = u >> bitsper; + u = pibits[last - 2] * ux + carry; + res[1] = u & mask; + carry = u >> bitsper; + u = pibits[first] * ux + carry; + res[0] = u & mask; +#endif + + + /* Reconstruct the result */ + ltb = (int)((((res[0] << bitsper) | res[1]) + >> (bitsper - 1 - resexp)) & 7); + + /* determ says whether the fractional part is >= 0.5 */ + determ = ltb & 1; + + i = 1; + if (determ) + { + /* The mantissa is >= 0.5. We want to subtract it + from 1.0 by negating all the bits */ + *region = ((ltb >> 1) + 1) & 3; + mant = 1; + mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1); + while (mant < 0x0000000000010000) + { + i++; + mant = (mant << bitsper) | (~(res[i]) & mask); + } + nextbits = (~(res[i+1]) & mask); + } + else + { + *region = (ltb >> 1); + mant = 1; + mant = res[1] & ((mant << (bitsper - resexp)) - 1); + while (mant < 0x0000000000010000) + { + i++; + mant = (mant << bitsper) | res[i]; + } + nextbits = res[i+1]; + } + + + /* Normalize the mantissa. The shift value 6 here, determined by + trial and error, seems to give optimal speed. */ + bc = 0; + while (mant < 0x0000400000000000) + { + bc += 6; + mant <<= 6; + } + while (mant < 0x0010000000000000) + { + bc++; + mant <<= 1; + } + mant |= nextbits >> (bitsper - bc); + + rexp = 52 + resexp - bc - i * bitsper; + + + /* Put the result exponent rexp onto the mantissa pattern */ + u = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64; + ux = (mant & MANTBITS_DP64) | u; + if (determ) + /* If we negated the mantissa we negate x too */ + ux |= SIGNBIT_DP64; + PUT_BITS_DP64(ux, dx); + + + /* x is a double precision version of the fractional part of + x * 2 / pi. Multiply x by pi/2 in double precision + to get the reduced argument r. 
*/ + *r = dx * piby2; + return; + +} diff --git a/sdk/lib/crt/math/libm_sse2/remainder_piby2f_forAsm.asm b/sdk/lib/crt/math/libm_sse2/remainder_piby2f_forAsm.asm new file mode 100644 index 00000000000..e40bccde8b9 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainder_piby2f_forAsm.asm @@ -0,0 +1,180 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; An implementation of the remainder by pi/2 function +; This is a service routine for use by trig functions coded in asm +; +; On input, +; xmm0 = x; Note that we assume x >= pi/4 +; On ouput +; xmm0 = r +; eax = region + +.const + +ALIGN 16 +L__piby2 DQ 03ff921fb54442d18h +EXTRN __L_2_by_pi_bits:BYTE + + +fname TEXTEQU <__remainder_piby2d2f_forAsm> + +stack_size EQU 000h +include fm.inc + +.code +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + + lea r9,__L_2_by_pi_bits + + ;get the unbiased exponent and the mantissa part of x + ;Since x >= pi/4, xexp = (x >> 52) - 1023 + movd r11,xmm0 + mov rcx,r11 + shr r11,52 + sub r11,1023 ; r11 <-- xexp = exponent of input x + + ;calculate the last byte from which to start multiplication + ;last = 134 - (xexp >> 3) + mov r10,r11 + shr r10,3 + sub r10,134 ;r10 = -last + neg r10 ;r10 = last + + ;load 64 bits of 2_by_pi + mov rax,[r9 + r10] + + ;mantissa of x = ((x << 12) >> 12) | implied bit + shl rcx,12 + shr rcx,12 ;rcx = mantissa part of input x + bts rcx,52 ;add the implied bit as well + + ;load next 128 bits of 2_by_pi + add r10,8 ;increment to next 8 bytes of 2_by_pi + movdqu xmm0,[r9 + r10] + + ;do three 64-bit multiplications with mant of x + mul rcx + mov r8,rax ;r8 = last 64 bits of mul = res1[2] + mov r10,rdx ;r10 <-- carry + movd rax,xmm0 + mul rcx + ;resexp = xexp & 7 + and r11,7 ;r11 = resexp = xexp & 7 = last 3 bits + psrldq xmm0,8 + add rax,r10 ; add the previous carry + adc rdx,0 + mov r9,rax ;r9 = next 64 bits of mul = res1[1] + mov r10,rdx ;r10 <-- carry + movd rax,xmm0 + mul rcx + add r10,rax ;r10 = most sig 64 bits = res1[0] + + ;find the region + ;last three bits ltb = most sig bits >> (54 - resexp)) + ; decimal point in last 18 bits ==> 8 lsb's in first 64 bits and + ; 8 msb's in next 64 bits + ;point_five = ltb & 01h; + ;region = ((ltb >> 1) + point_five) & 3; + mov rcx,54 + mov rax,r10 + sub rcx,r11 + xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi) + shr rax,cl + jnc L__no_point_five + ;;if there is carry.. 
then negate the result of multiplication + not r10 + not r9 + not r8 + mov rdx,08000000000000000h + +ALIGN 16 +L__no_point_five: + adc rax,0 + and rax,3 + ; Until / unless we find a better place to save it, we're putting + ; the region in xmm1. + movd xmm1, rax + + ;calculate the number of integer bits and zero them out + mov rcx,r11 + add rcx,10 ;rcx = no. of integer bits + shl r10,cl + shr r10,cl ;r10 contains only mant bits + sub rcx,64 ;form the exponent + mov r11,rcx + + ;find the highest set bit + bsr rcx,r10 + jnz L__form_mantissa + mov r10,r9 + mov r9,r8 + bsr rcx,r10 ;rcx = hsb + sub r11,64 + +ALIGN 16 +L__form_mantissa: + add r11,rcx ;for exp of x + sub rcx,52 ;rcx = no. of bits to shift in r10 + cmp rcx,0 + jl L__hsb_below_52 + je L__form_numbers + ;hsb above 52 + mov r8,r10 ;previous contents of r8 not required + shr r10,cl ;r10 = mantissa of x with hsb at 52 + jmp L__form_numbers + +ALIGN 16 +L__hsb_below_52: + neg rcx + mov rax,r9 + shl r10,cl + shl r9,cl + sub rcx,64 + neg rcx + shr rax,cl + or r10,rax + +ALIGN 16 +L__form_numbers: + add r11,1023 + btr r10,52 ;remove the implied bit + mov rcx,r11 + or r10,rdx ;put the sign + shl rcx,52 + or r10,rcx ;x is in r10 + movd xmm0,r10 ; xmm0 = x + movd rax, xmm1 ; rax <-- region + + ; At this point xmm0 has a double precision version of the fractional part + ; of x * 2/pi. To get the reduced argument r, we multiply that by pi/2. + mulsd xmm0,L__piby2 + StackDeallocate stack_size + ret + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/remainder_piby2f_forC.asm b/sdk/lib/crt/math/libm_sse2/remainder_piby2f_forC.asm new file mode 100644 index 00000000000..cac19822038 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainder_piby2f_forC.asm @@ -0,0 +1,341 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
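All of the reductions in these files recover the quadrant the same way: take the three bits of the product x*(2/pi) that straddle the binary point ("ltb"), treat the lowest of them as "fraction >= 0.5", and compute region = ((ltb >> 1) + point_five) & 3, exactly as the comments above spell out. In C:

    /* Quadrant extraction matching the "ltb" comments in the reductions
       above: if the fractional part of x*2/pi is >= 0.5, the quotient is
       rounded up, which advances the region (mod 4) by one. */
    static int region_from_ltb(unsigned ltb)
    {
        unsigned point_five = ltb & 1u;            /* fraction >= 0.5 ? */
        return (int)(((ltb >> 1) + point_five) & 3u);
    }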
+; +; An implementation of the remainder by pi/2 function +; This is a service routine for use by trig functions coded in C +; + +fname TEXTEQU <__remainder_piby2d2f_forC> + +save_rdi EQU 20h +save_rsi EQU 30h +stack_size EQU 088h +include fm.inc + +.code +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + SaveReg rdi,save_rdi + SaveReg rsi,save_rsi + .ENDPROLOG + + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + + ;get the unbiased exponent and the mantissa part of x + movd xmm0,rdi + lea r9,L__2_by_pi_bits + + ;xexp = (x >> 52) - 1023 + movd r11,xmm0 + mov rcx,r11 + shr r11,52 + sub r11,1023 ;r11 = xexp = exponent of input x + + ;calculate the last byte from which to start multiplication + ;last = 134 - (xexp >> 3) + mov r10,r11 + shr r10,3 + sub r10,134 ;r10 = -last + neg r10 ;r10 = last + + ;load 64 bits of 2_by_pi + mov rax,[r9 + r10] + mov rdi,rdx ; save address of region since mul modifies rdx + + ;mantissa of x = ((x << 12) >> 12) | implied bit + shl rcx,12 + shr rcx,12 ;rcx = mantissa part of input x + bts rcx,52 ;add the implied bit as well + + ;load next 128 bits of 2_by_pi + add r10,8 ;increment to next 8 bytes of 2_by_pi + movdqu xmm0,[r9 + r10] + + ;do three 64-bit multiplications with mant of x + mul rcx + mov r8,rax ;r8 = last 64 bits of multiplication = res1[2] + mov r10,rdx ;r10 = carry + movd rax,xmm0 + mul rcx + ;resexp = xexp & 7 + and r11,7 ;r11 = resexp = xexp & 7 = last 3 bits + psrldq xmm0,8 + add rax,r10 ; add the previous carry + adc rdx,0 + mov r9,rax ;r9 = next 64 bits of multiplication = res1[1] + mov r10,rdx ;r10 = carry + movd rax,xmm0 + mul rcx + add r10,rax ;r10 = most significant 64 bits = res1[0] + + ;find the region + ;last three bits ltb = most sig bits >> (54 - resexp)); decimal point in last 18 bits == 8 lsb's in first 64 bits and 8 msb's in next 64 bits + ;point_five = ltb & 01h; + ;region = ((ltb >> 1) + point_five) & 3; + mov rcx,54 + mov rax,r10 + sub rcx,r11 + xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi) + shr rax,cl + jnc L__no_point_five + ;;if there is carry.. then negate the result of multiplication + not r10 + not r9 + not r8 + mov rdx,08000000000000000h + +ALIGN 16 +L__no_point_five: + adc rax,0 + and rax,3 + mov DWORD PTR[rdi],eax ;store region to memory + + ;calculate the number of integer bits and zero them out + mov rcx,r11 + add rcx,10 ;rcx = no. of integer bits + shl r10,cl + shr r10,cl ;r10 contains only mant bits + sub rcx,64 ;form the exponent + mov r11,rcx + + ;find the highest set bit + bsr rcx,r10 + jnz L__form_mantissa + mov r10,r9 + mov r9,r8 + bsr rcx,r10 ;rcx = hsb + sub r11,64 + + +ALIGN 16 +L__form_mantissa: + add r11,rcx ;for exp of x + sub rcx,52 ;rcx = no. 
of bits to shift in r10 + cmp rcx,0 + jl L__hsb_below_52 + je L__form_numbers + ;hsb above 52 + mov r8,r10 ;previous contents of r8 not required + shr r10,cl ;r10 = mantissa of x with hsb at 52 + jmp L__form_numbers + +ALIGN 16 +L__hsb_below_52: + neg rcx + mov rax,r9 + shl r10,cl + shl r9,cl + sub rcx,64 + neg rcx + shr rax,cl + or r10,rax + +ALIGN 16 +L__form_numbers: + add r11,1023 + btr r10,52 ;remove the implied bit + mov rcx,r11 + or r10,rdx ;put the sign + shl rcx,52 + or r10,rcx ;x is in r10 + + movd xmm0,r10 ;xmm0 = x + mulsd xmm0,L__piby2 + movsd QWORD PTR[rsi],xmm0 + RestoreReg rsi,save_rsi + RestoreReg rdi,save_rdi + StackDeallocate stack_size + ret + +fname endp + +.const +ALIGN 16 +L__piby2 DQ 03ff921fb54442d18h + +ALIGN 16 +L__2_by_pi_bits DB 224 + DB 241 + DB 27 + DB 193 + DB 12 + DB 88 + DB 33 + DB 116 + DB 53 + DB 126 + DB 196 + DB 126 + DB 237 + DB 175 + DB 169 + DB 75 + DB 74 + DB 41 + DB 222 + DB 231 + DB 28 + DB 244 + DB 236 + DB 197 + DB 151 + DB 175 + DB 31 + DB 235 + DB 158 + DB 212 + DB 181 + DB 168 + DB 127 + DB 121 + DB 154 + DB 253 + DB 24 + DB 61 + DB 221 + DB 38 + DB 44 + DB 159 + DB 60 + DB 251 + DB 217 + DB 180 + DB 125 + DB 180 + DB 41 + DB 104 + DB 45 + DB 70 + DB 188 + DB 188 + DB 63 + DB 96 + DB 22 + DB 120 + DB 255 + DB 95 + DB 226 + DB 127 + DB 236 + DB 160 + DB 228 + DB 247 + DB 46 + DB 126 + DB 17 + DB 114 + DB 210 + DB 231 + DB 76 + DB 13 + DB 230 + DB 88 + DB 71 + DB 230 + DB 4 + DB 249 + DB 125 + DB 209 + DB 154 + DB 192 + DB 113 + DB 166 + DB 19 + DB 18 + DB 237 + DB 186 + DB 212 + DB 215 + DB 8 + DB 162 + DB 251 + DB 156 + DB 166 + DB 196 + DB 114 + DB 172 + DB 119 + DB 248 + DB 115 + DB 72 + DB 70 + DB 39 + DB 168 + DB 187 + DB 36 + DB 25 + DB 128 + DB 75 + DB 55 + DB 9 + DB 233 + DB 184 + DB 145 + DB 220 + DB 134 + DB 21 + DB 239 + DB 122 + DB 175 + DB 142 + DB 69 + DB 249 + DB 7 + DB 65 + DB 14 + DB 241 + DB 100 + DB 86 + DB 138 + DB 109 + DB 3 + DB 119 + DB 211 + DB 212 + DB 71 + DB 95 + DB 157 + DB 240 + DB 167 + DB 84 + DB 16 + DB 57 + DB 185 + DB 13 + DB 230 + DB 139 + DB 2 + DB 0 + DB 0 + DB 0 + DB 0 + DB 0 + DB 0 + DB 0 + +END + diff --git a/sdk/lib/crt/math/libm_sse2/remainderf.c b/sdk/lib/crt/math/libm_sse2/remainderf.c new file mode 100644 index 00000000000..1b389b0d8e4 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/remainderf.c @@ -0,0 +1,247 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_NANF_WITH_FLAGS +#define USE_SCALEDOUBLE_1 +#define USE_GET_FPSW_INLINE +#define USE_SET_FPSW_INLINE +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_NANF_WITH_FLAGS +#undef USE_SCALEDOUBLE_1 +#undef USE_GET_FPSW_INLINE +#undef USE_SET_FPSW_INLINE +#undef USE_HANDLE_ERRORF + +#if !defined(_CRTBLD_C9X) +#define _CRTBLD_C9X +#endif + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. +#pragma warning(disable:4163) +#pragma function(remainderf,fmodf) + + +#undef _FUNCNAME +#if defined(COMPILING_FMOD) +float fmodf(float x, float y) +#define _FUNCNAME "fmodf" +#define _OPERATION OP_FMOD +#else +float remainderf(float x, float y) +#define _FUNCNAME "remainderf" +#define _OPERATION OP_REM +#endif +{ + double dx, dy, scale, w, t; + int i, ntimes, xexp, yexp; + unsigned long ux, uy, ax, ay; + + unsigned int sw; + + dx = x; + dy = y; + + + GET_BITS_DP64(dx, ux); + GET_BITS_DP64(dy, uy); + ax = ux & ~SIGNBIT_DP64; + ay = uy & ~SIGNBIT_DP64; + xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + + if (xexp < 1 || xexp > BIASEDEMAX_DP64 || + yexp < 1 || yexp > BIASEDEMAX_DP64) + { + /* x or y is zero, NaN or infinity (neither x nor y can be + denormalized because we promoted from float to double) */ + if (xexp > BIASEDEMAX_DP64) + { + /* x is NaN or infinity */ + if (ux & MANTBITS_DP64) + { + /* x is NaN */ + unsigned int ufx; + GET_BITS_SP32(x, ufx); + return _handle_errorf(_FUNCNAME, _OPERATION, ufx|0x00400000, _DOMAIN, 0, + EDOM, x, y, 2); + } + else + { + /* x is infinity; result is NaN */ + return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); + } + } + else if (yexp > BIASEDEMAX_DP64) + { + /* y is NaN or infinity */ + if (uy & MANTBITS_DP64) + { + /* y is NaN */ + unsigned int ufy; + GET_BITS_SP32(y, ufy); + return _handle_errorf(_FUNCNAME, _OPERATION, ufy|0x00400000, _DOMAIN, 0, + EDOM, x, y, 2); + } + else + { +#ifdef _CRTBLD_C9X + /* C99 return for y = +-inf is x */ + return x; +#else + /* y is infinity; result is indefinite */ + return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); +#endif + } + } + else if (xexp < 1) + { + /* x must be zero (cannot be denormalized) */ + if (yexp < 1) + { + /* y must be zero (cannot be denormalized) */ + return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); + } + else + /* C99 return for x = 0 must preserve sign */ + return x; + } + else + { + /* y must be zero */ + return _handle_errorf(_FUNCNAME, _OPERATION, INDEFBITPATT_SP32, _DOMAIN, + AMD_F_INVALID, EDOM, x, y, 2); + } + } + else if (ax == ay) + { + /* abs(x) == abs(y); return zero with the sign of x */ + PUT_BITS_DP64(ux & SIGNBIT_DP64, dx); + return (float)dx; + } + + /* Set dx = abs(x), dy = abs(y) */ + PUT_BITS_DP64(ax, dx); + PUT_BITS_DP64(ay, dy); + + if (ax < ay) + { + /* abs(x) < abs(y) */ +#if !defined(COMPILING_FMOD) + if (dx > 0.5*dy) + dx -= dy; +#endif + return (float)(x < 0.0? -dx : dx); + } + + /* Save the current floating-point status word. We need + to do this because the remainder function is always + exact for finite arguments, but our algorithm causes + the inexact flag to be raised. 
We therefore need to + restore the entry status before exiting. */ + sw = get_fpsw_inline(); + + /* Set ntimes to the number of times we need to do a + partial remainder. If the exponent of x is an exact multiple + of 24 larger than the exponent of y, and the mantissa of x is + less than the mantissa of y, ntimes will be one too large + but it doesn't matter - it just means that we'll go round + the loop below one extra time. */ + if (xexp <= yexp) + { + ntimes = 0; + w = dy; + scale = 1.0; + } + else + { + ntimes = (xexp - yexp) / 24; + + /* Set w = y * 2^(24*ntimes) */ + PUT_BITS_DP64((unsigned long)(ntimes * 24 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, + scale); + w = scale * dy; + /* Set scale = 2^(-24) */ + PUT_BITS_DP64((unsigned long)(-24 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, + scale); + } + + + /* Each time round the loop we compute a partial remainder. + This is done by subtracting a large multiple of w + from x each time, where w is a scaled up version of y. + The subtraction can be performed exactly when performed + in double precision, and the result at each stage can + fit exactly in a single precision number. */ + for (i = 0; i < ntimes; i++) + { + /* t is the integer multiple of w that we will subtract. + We use a truncated value for t. */ + t = (double)((int)(dx / w)); + dx -= w * t; + /* Scale w down by 2^(-24) for the next iteration */ + w *= scale; + } + + /* One more time */ +#if defined(COMPILING_FMOD) + t = (double)((int)(dx / w)); + dx -= w * t; +#else + { + unsigned int todd; + /* Variable todd says whether the integer t is odd or not */ + t = (double)((int)(dx / w)); + todd = ((int)(dx / w)) & 1; + dx -= w * t; + + /* At this point, dx lies in the range [0,dy) */ + /* For the remainder function, we need to adjust dx + so that it lies in the range (-y/2, y/2] by carefully + subtracting w (== dy == y) if necessary. */ + if (dx > 0.5 * w || ((dx == 0.5 * w) && todd)) + dx -= w; + } +#endif + + /* **** N.B. for some reason this breaks the 32 bit version + of remainder when compiling with optimization. */ + /* Restore the entry status flags */ + set_fpsw_inline(sw); + + /* Set the result sign according to input argument x */ + return (float)(x < 0.0? -dx : dx); + +} diff --git a/sdk/lib/crt/math/libm_sse2/simd.h b/sdk/lib/crt/math/libm_sse2/simd.h new file mode 100644 index 00000000000..c5f93ff610f --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/simd.h @@ -0,0 +1,369 @@ +/***********************************************************************************/ +/** MIT License **/ +/** ----------- **/ +/** **/ +/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/ +/** **/ +/** Permission is hereby granted, free of charge, to any person obtaining a copy **/ +/** of this Software and associated documentaon files (the "Software"), to deal **/ +/** in the Software without restriction, including without limitation the rights **/ +/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ +/** copies of the Software, and to permit persons to whom the Software is **/ +/** furnished to do so, subject to the following conditions: **/ +/** **/ +/** The above copyright notice and this permission notice shall be included in **/ +/** all copies or substantial portions of the Software. **/ +/** **/ +/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ +/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ +/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE **/ +/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ +/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ +/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ +/** THE SOFTWARE. **/ +/***********************************************************************************/ + +/* +****************************************************************************** + * Source File : simd.h + * Archive File : $Archive: $ + * Date : 6/04/01 + * Description : The include file for the AMD SIMD exception filter routine + * for Microsoft Structured Exception Handling + * + * +$Revision:$ +$Name:$ +$Date:$ +$Author:$ +$History: simd.h $ + * + */ + +#include + +// simd.h +// This file contains structure definitions to provide +// convenient access to SIMD and MMX data as unsigned +// integer data. + +// change the following define to a 1 to print terse output +#define DO_PRINT 0 + +// can't use the 3DNOW SDK as written with 64 bit tools +#if !defined (_AMD64_) +#define USE_3DNOW_SDK 1 +#define SUPPORTS_FTZ 1 +#endif + + +/*****************************************************************/ + +// Basic type definitions + +typedef UINT_PTR AWORD; // x86-64 safe + +typedef union +{ + float f; + unsigned long l; +} LFLOAT; + +//typedef struct +//{ +// DWORD dw[2]; +//} +typedef unsigned _int64 QWORD; + +typedef union +{ + double f; + unsigned long l[2]; +} LDOUBLE; + +typedef __declspec(align(16)) struct +{ + LFLOAT f0,f1,f2,f3; +} SSESINGLE; + +typedef __declspec(align(16)) struct +{ + LDOUBLE d0,d1; +} SSEDOUBLE; + + +// this is the key data structure type used by the filter +// and the test program. It will be aligned, since +// the __m128 types are all aligned. It allows the +// use of one variable to carry all the needed data +// types. +typedef union +{ + __m128 m; + __m128d md; + __m128i mi; + __m64 m64[2]; + DWORD l[4]; + int i[4]; + LFLOAT f[4]; + QWORD q[2]; + LDOUBLE d[2]; +} ML128; + +// this defined to provide a MMX type for the FXSTOR structure. +typedef union +{ + unsigned short mmx[4]; // mmx regs are 64 bits + unsigned short fp[5]; // floating point regs are 80 bits +} MMX80; + +/*****************************************************************/ + +// define constants used by SIMD + +// define MXCSR rounding control bits. 
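+// Bits 13-14 of MXCSR form the rounding-control (RC) field; the masks below
+// select and test that field, e.g. (mxcsr & SDIMCW_RC) == SDIRC_UP means
+// round toward +infinity.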
+#define SDIMCW_RC 0x6000 +#define SDIRC_NEAR 0x0000 +#define SDIRC_DOWN 0x2000 +#define SDIRC_UP 0x4000 +#define SDIRC_CHOP 0x6000 + +// define other MXCSR control bits +#define SDDAZ 0x0040 +#define SDFTZ 0x8000 + +#define opADD 0x58 +#define opAND 0x54 +#define opANDN 0x55 +#define opCMP 0xC2 +#define opCOMISS 0x2F +#define opCVTPI2PS 0x2A +#define opCVTTPS2PI 0x2C +#define opCVTPS2PI 0x2D +#define opCVTPS2PD 0x5A +#define opCVTDQ2PS 0x5B +#define opCVTTPD2DQ 0xE6 +#define opDIV 0x5E +#define opMAX 0x5F +#define opMIN 0x5D +#define opMUL 0x59 +#define opSQRT 0x51 +#define opSUB 0x5C +#define opUCOMISS 0x2E + +// define EFlags bits +#define ZF (1 << 6) +#define PF (1 << 2) +#define CF (1 << 0) + +// define the REX prefix bits +#define REX_PREFIX 0x40 +#define REX_W 0x8 +#define REX_R 0x4 +#define REX_X 0x2 +#define REX_B 0x1 + + +// define the exception information record + +// constants for the status bits +#define IEM_INEXACT 0x20 +#define IEM_UNDERFLOW 0x10 +#define IEM_OVERFLOW 0x08 +#define IEM_ZERODIVIDE 0x04 +#define IEM_DENORMAL 0x02 +#define IEM_INVALID 0x01 +#define IEM_MASK 0x3F + +#define IMM_INEXACT 0x1000 +#define IMM_UNDERFLOW 0x0800 +#define IMM_OVERFLOW 0x0400 +#define IMM_ZERODIVIDE 0x0200 +#define IMM_DENORMAL 0x0100 +#define IMM_INVALID 0x0080 +#define IMM_MASK 0x1F80 + +/*****************************************************************/ + +// Instruction forms + +// Type enumerations +// + +typedef enum +{ + fGdWsd, + fGdWss, + fQqWpd, + fQqWps, + fVpdQq, + fVpdWpd, + fVpdWpdIb, + fVpdWpdi, + fVpdWps, + fVpdiWpd, + fVpdiWps, + fVpsQq, + fVpsWpd, + fVpsWpdi, + fVpsWps, + fVpsWpsIb, + fVsdEd, + fVsdWsd, + fVsdWsdIb, + fVsdWss, + fVssEd, + fVssWsd, + fVssWss, + fVssWssIb +} InstType; + +// operand types +typedef enum +{ + oEd, //General register dword mod R/M + oGd, //General register dword + oQq, // MMX quadword mod R/M + oVpd, // XMM register + oVpdi, + oVps, + oVsd, + oVss, + oWpd, // XMM mod R/M + oWpdi, + oWps, + oWsd, + oWss +} OpType; + +// operand class +typedef enum +{ + oXMMreg, + oXMMmrm, + oMMXreg, + oMMXmrm, + oGENreg, + oGENmrm, +} OpClass; + +// data types +typedef enum +{ + dDW, // integer DWORD + dPD, // packed double precision + dPDI, // packed integer DWORD + dPS, // packed single precision + dQ, // integer quadword + dSD, // scalar double precision + dSS // scalar single precision +} DataType; + +/*****************************************************************/ + +// Structure definitions +// + + +// define the format of the data used by +// the FXSAVE and FXRSTOR commands +typedef struct +{ + MMX80 mmx; // the mmx/fp register + unsigned short reserved[3]; // floating point regs are 80 bits +} FPMMX; + +#if defined (_AMD64_) +// x86-64 version +typedef struct _FXMM_SAVE_AREA { + WORD ControlWord; + WORD StatusWord; + WORD TagWord; + WORD OpCode; + QWORD ErrorOffset; + QWORD DataOffset; + DWORD Mxcsr; + DWORD reserved3; + FPMMX FMMXreg[8]; + ML128 XMMreg[16]; +} FXMM_SAVE_AREA; +#else +// 32 bit x86 version +typedef struct _FXMM_SAVE_AREA { + WORD ControlWord; + WORD StatusWord; + WORD TagWord; + WORD OpCode; + DWORD ErrorOffset; + WORD ErrorSelector; + WORD reserved1; + DWORD DataOffset; + WORD DataSelector; + WORD reserved2; + DWORD Mxcsr; + DWORD reserved3; + FPMMX FMMXreg[8]; + ML128 XMMreg[8]; +} FXMM_SAVE_AREA; +#endif +typedef FXMM_SAVE_AREA *PFXMM_SAVE_AREA; + +/* This structure is used to access the excepting opcode */ +typedef struct { + unsigned char opcode; + unsigned char rmbyte; + union { + unsigned long offset; // this will need work 
for x86-64 + unsigned char imm8; + } data; + +} SIMD_OP, *PSIMD_OP; + +// Define a SIMD exception flag type. +// This is just like the _FPIEEE_EXCEPTION_FLAGS +// except that it adds the denormal field. +typedef struct { + unsigned int Inexact : 1; + unsigned int Underflow : 1; + unsigned int Overflow : 1; + unsigned int ZeroDivide : 1; + unsigned int InvalidOperation : 1; + unsigned int Denormal : 1; +} _SIMD_EXCEPTION_FLAGS; + + +/* define the local simd record structures */ +typedef struct { + unsigned int RoundingMode; + _SIMD_EXCEPTION_FLAGS Cause; + _SIMD_EXCEPTION_FLAGS Enable; + _SIMD_EXCEPTION_FLAGS Status; + PSIMD_OP opaddress; // points to 0F xx opcode + int curAddr; // used when parsing mod R/M byte + unsigned char prefix; + unsigned char opcode; + unsigned char rmbyte; + unsigned char immediate8; + // add a rex field for x86-64 + unsigned char rex; + int eopcode; // encoded opcode (index for tables) + int op_form; + int op1_class; // XMM, MMX, or gen register + int op1_type; // data format + int op2_class; + int op2_type; + int is_commiss; + int commiss_val; + unsigned int mxcsr; // value of mscsr from context record. + ML128 op1_value; + ML128 op2_value; + ML128 *op2_ptr; + +} _SIMD_RECORD, *_PSIMD_RECORD; + +/* define a record for the operand form table */ +typedef struct { + int op1; // form of operand 1 + int op2; // form of operand 2 +} _OPERAND_RECORD; + diff --git a/sdk/lib/crt/math/libm_sse2/sin.asm b/sdk/lib/crt/math/libm_sse2/sin.asm new file mode 100644 index 00000000000..0c07d44455d --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/sin.asm @@ -0,0 +1,511 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; +; An implementation of the sin function. +; +; Prototype: +; +; double sin(double x); +; +; Computes sin(x). +; It will provide proper C99 return values, +; but may not raise floating point status bits properly. +; Based on the NAG C implementation. +; +; If FMA3 hardware is available, an FMA3 implementation of sin will be used. 
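+;
+; Outline of the computation below (summary; the sign of x and special cases
+; are handled separately, and very large |x| uses the stored bits of 2/pi
+; for the reduction instead):
+;   n = (int)(|x| * 2/pi + 0.5)            ; nearest multiple of pi/2
+;   r = |x| - n*piby2_1 - n*piby2_1tail    ; extra-precision remainder, |r| <= pi/4
+;   n mod 4 = 0:  sin(|x|) ~  sin_poly(r)
+;   n mod 4 = 1:  sin(|x|) ~  cos_poly(r)
+;   n mod 4 = 2:  sin(|x|) ~ -sin_poly(r)
+;   n mod 4 = 3:  sin(|x|) ~ -cos_poly(r)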
+ + +.const +ALIGN 16 +L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1 + DQ 0 +L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail + DQ 0 +L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2 + DQ 0 +L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail + DQ 0 +ALIGN 16 + +L_one DQ 03FF0000000000000h, 03FF0000000000000h +L_signbit DQ 08000000000000000h, 00000000000000000h +L_int_one DQ 00000000000000001h, 00000000000000000h +L_int_two DQ 00000000000000002h, 00000000000000000h +L_int_three DQ 00000000000000003h, 00000000000000000h + +L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi +L_one_half DQ 03FE0000000000000h ; .5 +L_one_sixth DQ 03FC5555555555555h ; .1666... +L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27 +L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 +L_piby4 DQ 03FE921FB54442D18h ; pi/4 +L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW +L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL + +L__inf_mask_64 DQ 07FF0000000000000h ; +Inf + +EXTRN __Lcosarray:QWORD +EXTRN __Lsinarray:QWORD +EXTRN __use_fma3_lib:DWORD + +; define local variable storage offsets +p_temp EQU 030h +p_temp1 EQU 040h +save_r10 EQU 050h +dummy_space EQU 060h +stack_size EQU 078h + +include fm.inc + +fname TEXTEQU +fname_special TEXTEQU <_sin_special> + +;Define name and any external functions being called +EXTERN __remainder_piby2_forAsm : PROC +EXTERN __remainder_piby2_fma3 : PROC +EXTERN __remainder_piby2_fma3_bdl : PROC +EXTERN fname_special : PROC + +.code + +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + cmp DWORD PTR __use_fma3_lib, 0 + jne Lsin_fma3 + +Lsin_sse2: + movd rdx, xmm0 + xorpd xmm2, xmm2 ; zeroed out for later use + + mov r10,rdx + mov r8d, 1 ; for determining region later on + btr r10,63 ; r10 <-- |x| + cmp r10,L_piby4 + jb Lsin_sse2_absx_lt_piby4 + +Lsin_sse2_absx_nlt_piby4: ; common case + mov r11,rdx + shr r11,63 + movd xmm0,r10 ; xmm0 <-- |x| + cmp r10, QWORD PTR L_small_arg_cw + jae Lsin_reduce_precise ; Note NaN/Inf will branch + +; At this point we have |x| < L_small_arg_cw, which is currently 500000. +; Note that if |x| were too large, conversion of npi2 to integer would fail. +; We reduce the argument to be in a range from -pi/4 to +pi/4 +; by subtracting multiples of pi/2 + movapd xmm2, xmm0 + mulsd xmm2, L_2_by_pi + movapd xmm4, xmm0 + +; xexp = ax >> EXPSHIFTBITS_DP64; + mov r9, r10 + shr r9, 52 ; >>EXPSHIFTBITS_DP64 + +; How many pi/2 is |x| a multiple of? 
+; npi2 = (int)(x * twobypi + 0.5); + addsd xmm2, L_one_half ; npi2 + + movsd xmm3, L_real_piby2_1 + cvttpd2dq xmm0, xmm2 ; convert npi2 to integer + movsd xmm1, L_real_piby2_1tail + cvtdq2pd xmm2, xmm0 ; npi2 back to double + +; Subtract the multiple from x to get an extra-precision remainder +; rhead = x - npi2 * piby2_1; + mulsd xmm3, xmm2 + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_1tail; + mulsd xmm1, xmm2 ; rtail + movd eax, xmm0 ; eax <-- npi2 + +; GET_BITS_DP64(rhead-rtail, uy); +; originally only rhead + movapd xmm0, xmm4 + subsd xmm0, xmm1 + + movsd xmm3, L_real_piby2_2 + movd rcx, xmm0 ; rcx <-- rhead - rtail + movsd xmm5, L_real_piby2_2tail ; piby2_2tail + +; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, +; xmm4=rhead, xmm5= temp for calc +; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); +; expdiff measures how close rhead - rtail is to |x| +; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer) + shl rcx, 1 ; strip any sign bit + shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 + sub r9, rcx ; expdiff + +;; if (expdiff > 15) + cmp r9, 15 + jle Lsin_sse2_cw_reduction_done + +; Here the remainder is pretty small compared with x, which +; implies that x is a near multiple of pi/2 +; (x matches the multiple to at least 15 bits) +; So we do another stage of argument reduction. + +; t = rhead; + movapd xmm1, xmm4 + +; rtail = npi2 * piby2_2; + mulsd xmm3, xmm2 + +; rhead = t - rtail; + mulsd xmm5, xmm2 ; npi2 * piby2_2tail + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); + subsd xmm1, xmm4 ; t - rhead + subsd xmm1, xmm3 ; -rtail + subsd xmm5, xmm1 ; rtail + +; r = rhead - rtail; + movapd xmm0, xmm4 + +;HARSHA +;xmm1=rtail + movapd xmm1, xmm5 ; xmm1 <-- copy of rtail + subsd xmm0, xmm5 + + +; xmm0=r, xmm4=rhead, xmm1=rtail +Lsin_sse2_cw_reduction_done: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; if the input was close to a pi/2 multiple +; The original NAG code missed this trick. +; If the input is very close to n*pi/2 after reduction, so r < 2^-27, +; then the sin is either ~ 1.0 or ~r, to within 53 bits. + +; Note: Unfortunately this introduces two jcc instructions close to each +; other and to other branches. As r < 2^-13 should be rather uncommon, it +; almost certainly costs more than it saves. 
- WAT +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region = npi2 & 3; + + subsd xmm4, xmm0 ; rhead-r + subsd xmm4, xmm1 ; rr = (rhead-r) - rtail + +Lsin_piby4: +; perform taylor series to calc sinx, sinx for |x| <= pi/4 +; x2 = r * r; + +;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path +;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path + movapd xmm3, xmm0 + movapd xmm2, xmm0 + mulsd xmm2, xmm0 ;x2 + + bt eax,0 + jc Lsin_sse2_calc_cos + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region 0 or 2 do a sin calculation + movsd xmm3, __Lsinarray+50h ; s6 + mulsd xmm3, xmm2 ; x2s6 + movsd xmm5, __Lsinarray+20h ; s3 + movsd QWORD PTR p_temp[rsp], xmm4 ; store xx + movapd xmm1, xmm2 ; move for x4 + mulsd xmm1, xmm2 ; x4 + movsd QWORD PTR p_temp1[rsp], xmm0 ; store x + mulsd xmm5, xmm2 ; x2s3 + movapd xmm4, xmm0 ; move for x3 + addsd xmm3, __Lsinarray+40h ; s5+x2s6 + mulsd xmm1, xmm2 ; x6 + mulsd xmm3, xmm2 ; x2(s5+x2s6) + mulsd xmm4, xmm2 ; x3 + addsd xmm5, __Lsinarray+10h ; s2+x2s3 + mulsd xmm5, xmm2 ; x2(s2+x2s3) + addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) + mulsd xmm2, L_one_half ; 0.5 *x2 + movsd xmm0, QWORD PTR p_temp[rsp] ; load xx + mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) + addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) + mulsd xmm2, xmm0 ; 0.5 * x2 *xx + addsd xmm3, xmm5 ; zs + mulsd xmm4, xmm3 ; *x3 + subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx + addsd xmm0, xmm4 ; +xx + addsd xmm0, QWORD PTR p_temp1[rsp] ; +x + + jmp Lsin_sse2_adjust_region + +ALIGN 16 +Lsin_sse2_calc_cos: +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region 1 or 3 - do a cos calculation +; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6)))); + mulsd xmm4, xmm0 ; x*xx + movsd xmm5, L_one_half + movsd xmm1, __Lcosarray+50h ; c6 + movsd xmm0, __Lcosarray+20h ; c3 + mulsd xmm5, xmm2 ; r = 0.5 *x2 + movapd xmm3, xmm2 ; copy of x2 + movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx + mulsd xmm1, xmm2 ; c6*x2 + mulsd xmm0, xmm2 ; c3*x2 + subsd xmm5, L_one ; -t=r-1.0, trash r + mulsd xmm3, xmm2 ; x4 + addsd xmm1, __Lcosarray+40h ; c5+x2c6 + addsd xmm0, __Lcosarray+10h ; c2+x2C3 + addsd xmm5, L_one ; 1 + (-t), trash t + mulsd xmm3, xmm2 ; x6 + mulsd xmm1, xmm2 ; x2(c5+x2c6) + mulsd xmm0, xmm2 ; x2(c2+x2C3) + movapd xmm4, xmm2 ; copy of x2 + mulsd xmm4, L_one_half ; r recalculate + addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6) + addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3) + mulsd xmm2, xmm2 ; x4 recalculate + subsd xmm5, xmm4 ; (1 + (-t)) - r + mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6)) + addsd xmm0, xmm1 ; zc + subsd xmm4, L_one ; t relaculate + subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx + mulsd xmm0, xmm2 ; x4 * zc + addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx) + subsd xmm0, xmm4 ; result - (-t) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +Lsin_sse2_adjust_region: +; positive or negative +; switch (region) + shr eax, 1 + mov ecx, eax + and eax, r11d + + not ecx + not r11d + and ecx, r11d + + or eax, ecx + and eax, 1 + jnz Lsin_sse2_cleanup + +;; if the original region 0, 1 and arg is negative, then we negate the result. +;; if the original region 2, 3 and arg is positive, then we negate the result. 
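+;; The arithmetic above jumps past the negation when bit 1 of the region and
+;; the sign of the original x agree; we fall through and negate exactly when
+;; they differ.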
+ movapd xmm2, xmm0 + xorpd xmm0, xmm0 + subsd xmm0, xmm2 + +ALIGN 16 +Lsin_sse2_cleanup: + StackDeallocate stack_size + ret + +ALIGN 16 +Lsin_sse2_absx_lt_piby4: +; sin = sin_piby4(x, 0.0); + +; x2 = r * r; + movapd xmm2, xmm0 + mulsd xmm2, xmm0 ; x2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region 0 - do a sin calculation +; zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6)))); + movsd xmm3, __Lsinarray+50h ; s6 + mulsd xmm3, xmm2 ; x2s6 + movsd xmm5, __Lsinarray+20h ; s3 + movapd xmm1, xmm2 ; move for x4 + mulsd xmm1, xmm2 ; x4 + mulsd xmm5, xmm2 ; x2s3 + movapd xmm4, xmm0 ; move for x3 + addsd xmm3, __Lsinarray+40h ; s5+x2s6 + mulsd xmm1, xmm2 ; x6 + mulsd xmm3, xmm2 ; x2(s5+x2s6) + mulsd xmm4, xmm2 ; x3 + addsd xmm5, __Lsinarray+10h ; s2+x2s3 + mulsd xmm5, xmm2 ; x2(s2+x2s3) + addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) + mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) + addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) + addsd xmm3, xmm5 ; zs + mulsd xmm4, xmm3 ; *x3 + addsd xmm0, xmm4 ; +x + jmp Lsin_sse2_cleanup + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +ALIGN 16 +Lsin_reduce_precise: +; Reduce x into range [-pi/4, pih/4] + cmp r10,L__inf_mask_64 + jae Lsin_x_naninf + mov QWORD PTR p_temp[rsp], r11 + call __remainder_piby2_forAsm + mov r11, QWORD PTR p_temp[rsp] + + ; At this point xmm0 has r, xmm1 has rr, rax has region + + movapd xmm4, xmm1 ; xmm4 <-- rr + jmp Lsin_piby4 + +; xmm0 = x, xmm4 = xx, eax= region + + +ALIGN 16 +Lsin_x_naninf: + call fname_special + StackDeallocate stack_size + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; From this point we assume that FMA3 and AVX hardware are present. + +ALIGN 16 +Lsin_fma3: + vmovq r9,xmm0 + mov r10,r9 ; save x to get sign later + btr r9,63 ; r9 <-- |x| + cmp r9,L_piby4 + jae Lsin_fma3_absx_nlt_piby4 ; Note that NaN will branch + cmp r9,L_two_to_neg_13 + jae Lsin_fma3_calc_sin_for_absx_lt_piby4 + cmp r9,L_two_to_neg_27 + jae Lsin_fma3_compute_x_xxx_0_1666 + StackDeallocate stack_size + ret ; sin x ~= x for |x| < 2^-27 + +ALIGN 16 +Lsin_fma3_compute_x_xxx_0_1666: ; |x| in [2^-27,2^-13] + vmulsd xmm1,xmm0,xmm0 ; xmm1l <-- x*x + vmulsd xmm1,xmm1,xmm0 ; xmm1l <-- x*x*x + vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6) + StackDeallocate stack_size + ret + +ALIGN 16 +Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4] + vmovsd xmm5,__Lsinarray+050h + vmulsd xmm3,xmm0,xmm0 ; xmm3l <-- x^2 + + vfmadd213sd xmm5,xmm3,__Lsinarray+040h + vfmadd213sd xmm5,xmm3,__Lsinarray+030h + vfmadd213sd xmm5,xmm3,__Lsinarray+020h + vfmadd213sd xmm5,xmm3,__Lsinarray+010h + + vmulsd xmm4,xmm0,xmm3 ; xmm4l <-- x^3 + vfmadd213sd xmm5,xmm3,__Lsinarray + vfmadd231sd xmm0,xmm4,xmm5 ; xmm0l <-- x + x^3 p(x^2) + + StackDeallocate stack_size + ret + +ALIGN 16 +Lsin_fma3_absx_nlt_piby4: ; !(|x| < pi/4) + ; here r9 has |x| + cmp r9,L__inf_mask_64 + jae Lsin_x_naninf +;Lrange_reduce: ;; unused label + + vmovq xmm0,r9 ; xmm0 <-- |x| + cmp r9,L_small_arg_bdl + jae Lsin_fma3_do_general_arg_reduction + + ; Note that __remainder_piby2_fma3 conventions are + ; on input + ; |x| is in xmm0 + ; on output + ; r is in xmm0 + ; rr is in xmm1 + ; region of |x| is in rax + + ; Boldo-Daumas-Li reduction for reasonably small |x| + call __remainder_piby2_fma3_bdl +Lsin_fma3_exit_s: + bt rax,0 + vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x + jc Lsin_fma3_calc_cos + +Lsin_fma3_calc_sin: ;; unused label + ; region 0 or 2 + ; compute the sine of 
r+rr, where this sum is in [-pi/4,pi/4] + vmovsd xmm5,__Lsinarray+050h + vfmadd213sd xmm5,xmm3,__Lsinarray+040h + vfmadd213sd xmm5,xmm3,__Lsinarray+030h + vfmadd213sd xmm5,xmm3,__Lsinarray+020h + vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r + + vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x + vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r + vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x + vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r + vmulsd xmm2,xmm3,xmm2 + vsubsd xmm2,xmm2,xmm1 + vfnmadd231sd xmm2, xmm4,__Lsinarray + vsubsd xmm0,xmm0,xmm2 + jmp Lsin_fma3_exit_s_1 + +ALIGN 16 +Lsin_fma3_calc_cos: + ; region 1 or 3 + ; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4] + vmovapd xmm2,L_one + vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r + vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5 + vsubsd xmm2,xmm2,xmm4 ; 1-t + vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r + vmovsd xmm5,__Lcosarray+050h + vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2 + vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1 + vfmadd213sd xmm5,xmm3,__Lcosarray+040h + vfmadd213sd xmm5,xmm3,__Lcosarray+030h + vfmadd213sd xmm5,xmm3,__Lcosarray+020h + vfmadd213sd xmm5,xmm3,__Lcosarray+010h + vfmadd213sd xmm5,xmm3,__Lcosarray + vfmadd213sd xmm5,xmm1,xmm2 + vaddsd xmm0,xmm5,xmm4 + +Lsin_fma3_exit_s_1: + xor r8,r8 ; prepare r8 for cmov + and r10,L_signbit ; isolate original sign of x + bt eax,1 + cmovc r8,L_signbit + xor r8,r10 + vmovq xmm3,r8 + vxorpd xmm0,xmm0,xmm3 + + StackDeallocate stack_size + ret + +ALIGN 16 +Lsin_fma3_do_general_arg_reduction: + ; argument reduction for general x + + ; NOTE: the BDL argument reduction routine does not touch r10, + ; but the general-purpose reduction does. + mov QWORD PTR [save_r10+rsp], r10 + call __remainder_piby2_fma3 + mov r10, QWORD PTR [save_r10+rsp] + jmp Lsin_fma3_exit_s + +fname endp +END + diff --git a/sdk/lib/crt/math/libm_sse2/sincos_special.c b/sdk/lib/crt/math/libm_sse2/sincos_special.c new file mode 100644 index 00000000000..c67a5ba9ccd --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/sincos_special.c @@ -0,0 +1,130 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "libm_new.h" + +double _sincos_special(double x, char *name, unsigned int operation) +{ + UT64 xu; + unsigned int is_snan; + + xu.f64 = x; + + if((xu.u64 & INF_POS_64) == INF_POS_64) + { + // x is Inf or NaN + if((xu.u64 & MANTISSA_MASK_64) == 0x0) + { + // x is Inf + xu.u64 = IND_64; + _handle_error(name, operation, xu.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0, 1); + } + else + { + // x is NaN + is_snan = (((xu.u64 & QNAN_MASK_64) == QNAN_MASK_64) ? 0 : 1); + if(is_snan) + { + xu.u64 |= QNAN_MASK_64; + } + _handle_error(name, operation, xu.u64, _DOMAIN, 0, EDOM, x, 0, 1); + } + } + + return xu.f64; +} + +float _sincosf_special(float x, char *name, unsigned int operation) +{ + UT64 xu; + unsigned int is_snan; + + xu.u64 = 0; + xu.f32[0] = x; + + if((xu.u32[0] & INF_POS_32) == INF_POS_32) + { + // x is Inf or NaN + if((xu.u32[0] & MANTISSA_MASK_32) == 0x0) + { + // x is Inf + xu.u32[0] = IND_32; + _handle_errorf(name, operation, xu.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0, 1); + } + else + { + // x is NaN + is_snan = (((xu.u32[0] & QNAN_MASK_32) == QNAN_MASK_32) ? 0 : 1); + if(is_snan) + { + xu.u32[0] |= QNAN_SET_32; + _handle_errorf(name, operation, xu.u64, _DOMAIN, AMD_F_INVALID, EDOM, x, 0, 1); + } + else + { + _handle_errorf(name, operation, xu.u64, _DOMAIN, 0, EDOM, x, 0, 1); + } + } + } + + return xu.f32[0]; +} + +float _sinf_special(float x) +{ + return _sincosf_special(x, "sinf", _FpCodeSin); +} + +double _sin_special(double x) +{ + return _sincos_special(x, "sin", _FpCodeSin); +} + +float _cosf_special(float x) +{ + return _sincosf_special(x, "cosf", _FpCodeCos); +} + +double _cos_special(double x) +{ + return _sincos_special(x, "cos", _FpCodeCos); +} + +double _tan_special(double x) +{ + return _sincos_special(x, "tan",_FpCodeTan); +} + +float _tanf_special(float x) +{ + return _sincosf_special(x, "tanf",_FpCodeTan); +} diff --git a/sdk/lib/crt/math/libm_sse2/sinf.asm b/sdk/lib/crt/math/libm_sse2/sinf.asm new file mode 100644 index 00000000000..acfda812a7a --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/sinf.asm @@ -0,0 +1,664 @@ +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; +; An implementation of the sinf function. +; +; Prototype +; +; float sinf(float x); +; +; Computes sinf(x). 
+; It will provide proper C99 return values, +; but may not raise floating point status bits properly. +; Based on the NAG C implementation. +; + +.const +ALIGN 16 +L_signbit DQ 08000000000000000h + DQ 08000000000000000h +L_sign_mask DQ 07FFFFFFFFFFFFFFFh + DQ 07FFFFFFFFFFFFFFFh +L_one DQ 03FF0000000000000h + DQ 03FF0000000000000h +L_int_three DQ 00000000000000003h + DQ 00000000000000003h +L_one_half DQ 03FE0000000000000h + DQ 03FE0000000000000h +L_twobypi DQ 03FE45F306DC9C883h + DQ 03FE45F306DC9C883h +L_piby2_1 DQ 03FF921FB54400000h + DQ 03FF921FB54400000h +L_one_sixth DQ 03FC5555555555555h + DQ 03FC5555555555555h +L_piby2_1tail DQ 03DD0B4611A626331h + DQ 03DD0B4611A626331h +L_piby2_2 DQ 03dd0b4611a600000h + DQ 03dd0b4611a600000h +L_piby2_2tail DQ 03ba3198a2e037073h + DQ 03ba3198a2e037073h +L_inf_mask_32 DD 07F800000h + DD 07F800000h + DQ 07F8000007F800000h +L_int_two DQ 00000000000000002h + DQ 00000000000000002h +L_piby2_lead DQ 03ff921fb54442d18h + DQ 03ff921fb54442d18h +L_piby4 DQ 03fe921fb54442d18h + DQ 03fe921fb54442d18h +L_mask_3f2 DQ 03f20000000000000h + DQ 03f20000000000000h +L_mask_3f8 DQ 03f80000000000000h + DQ 03f80000000000000h + +; Do these really need to be different? +L_large_x_fma3 DQ 04170008AC0000000h ; 16779436 +L_large_x_sse2 DQ 0416E848000000000h ; 16000000 + +EXTRN __Lcosfarray:QWORD +EXTRN __Lsinfarray:QWORD +EXTRN __use_fma3_lib:DWORD +EXTRN __L_2_by_pi_bits:BYTE + +; define local variable storage offsets +p_temp EQU 010h ; temporary for get/put bits operation +p_temp1 EQU 018h ; temporary for get/put bits operation +region EQU 020h ; pointer to region for remainder_piby2 +r EQU 028h ; pointer to r for remainder_piby2 +dummy_space EQU 040h + +stack_size EQU 058h + +include fm.inc + +fname TEXTEQU +fname_special TEXTEQU <_sinf_special> + +;Define name and any external functions being called +EXTRN __remainder_piby2d2f_forC : PROC ; NEAR +EXTERN fname_special : PROC + +.code +ALIGN 16 +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + cmp DWORD PTR __use_fma3_lib, 0 + jne Lsinf_fma3 + +Lsinf_sse2: + + xorpd xmm2, xmm2 ; zeroed out for later use + +;; if NaN or inf + movd edx, xmm0 + mov eax, 07f800000h + mov r10d, eax + and r10d, edx + cmp r10d, eax + jz Lsinf_sse2_naninf + +; GET_BITS_DP64(x, ux); +; get the input value to an integer register. + cvtss2sd xmm0, xmm0 ; convert input to double. 
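+; The float argument is promoted to double so the reduction and polynomial
+; below keep plenty of guard bits; the cvtsd2ss at Lsinf_sse2_cleanup does
+; the one rounding back to float.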
+ movd rdx, xmm0 ; rdx is ux + +; ax = (ux & ~SIGNBIT_DP64); + mov r10, rdx + btr r10, 63 ; r10 is ax + mov r8d, 1 ; for determining region later on + +;; if (ax <= 0x3fe921fb54442d18) abs(x) <= pi/4 + mov rax, 03fe921fb54442d18h + cmp r10, rax + jg Lsinf_absx_gt_piby4 + +;; if (ax < 0x3f80000000000000) abs(x) < 2.0^(-7) + mov rax, 3f80000000000000h + cmp r10, rax + jge Lsinf_sse2_small + +;; if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13) + mov rax, 3f20000000000000h + cmp r10, rax + jge Lsinf_sse2_smaller + +; sinf = x; + jmp Lsinf_sse2_cleanup + +ALIGN 16 +Lsinf_sse2_smaller: +; sinf = x - x^3 * 0.1666666666666666666; + movsd xmm2, xmm0 + movsd xmm4, QWORD PTR L_one_sixth ; 0.1666666666666666666 + mulsd xmm2, xmm2 ; x^2 + mulsd xmm2, xmm0 ; x^3 + mulsd xmm2, xmm4 ; x^3 * 0.1666666666666666666 + subsd xmm0, xmm2 ; x - x^3 * 0.1666666666666666666 + jmp Lsinf_sse2_cleanup + +ALIGN 16 +Lsinf_sse2_small: + movsd xmm2, xmm0 ; x2 = r * r; + mulsd xmm2, xmm0 ; x2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; region 0 or 2 - do a sinf calculation +; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4)); + movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 + mulsd xmm1, xmm2 ; s4x2 + movsd xmm4, xmm2 ; move for x4 + movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 + mulsd xmm4, xmm2 ; x4 + movsd xmm3, xmm0 ; move for x3 + mulsd xmm5, xmm2 ; s2x2 + mulsd xmm3, xmm2 ; x3 + addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 + mulsd xmm1, xmm4 ; s3x4+s4x6 + addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 + addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 + mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) + addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) + jmp Lsinf_sse2_cleanup + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +ALIGN 16 +Lsinf_absx_gt_piby4: +; xneg = (ax != ux); + cmp rdx, r10 + mov r11d, 0 +;; if (xneg) x = -x; + jz Lsinf_sse2_reduce_moderate + + mov r11d, 1 + subsd xmm2, xmm0 + movsd xmm0, xmm2 + +Lsinf_sse2_reduce_moderate: +;; if (x < 5.0e6) + cmp r10, QWORD PTR L_large_x_sse2 + jae Lsinf_sse2_reduce_large + +; reduce the argument to be in a range from -pi/4 to +pi/4 +; by subtracting multiples of pi/2 + movsd xmm2, xmm0 + movsd xmm3, QWORD PTR L_twobypi + movsd xmm4, xmm0 + movsd xmm5, QWORD PTR L_one_half ; .5 + mulsd xmm2, xmm3 + +;/* How many pi/2 is x a multiple of? */ +; xexp = ax >> EXPSHIFTBITS_DP64; + mov r9, r10 + shr r9, 52 ; >>EXPSHIFTBITS_DP64 + +; npi2 = (int)(x * twobypi + 0.5); + addsd xmm2, xmm5 ; npi2 + + movsd xmm3, QWORD PTR L_piby2_1 + cvttpd2dq xmm0, xmm2 ; convert to integer + movsd xmm1, QWORD PTR L_piby2_1tail + cvtdq2pd xmm2, xmm0 ; and back to double. 
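+; This is a Cody-Waite style reduction: pi/2 is represented as
+; piby2_1 + piby2_1tail (plus piby2_2/piby2_2tail for a second stage),
+; where piby2_1 has trailing zero bits so that npi2 * piby2_1 and the
+; subtraction from x below do not lose precision.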
+ +; /* Subtract the multiple from x to get an extra-precision remainder */ +; rhead = x - npi2 * piby2_1; + mulsd xmm3, xmm2 + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_1tail; + mulsd xmm1, xmm2 + movd eax, xmm0 + +; GET_BITS_DP64(rhead-rtail, uy); +; originally only rhead + movsd xmm0, xmm4 + subsd xmm0, xmm1 + + movsd xmm3, QWORD PTR L_piby2_2 + movd rcx, xmm0 + movsd xmm5, QWORD PTR L_piby2_2tail + +; xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc +; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + shl rcx, 1 ; strip any sign bit + shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 + sub r9, rcx ; expdiff + +;; if (expdiff > 15) + cmp r9, 15 + jle Lsinf_sse2_expdiff_le_15 + +; The remainder is pretty small compared with x, which +; implies that x is a near multiple of pi/2 +; (x matches the multiple to at least 15 bits) +; t = rhead; + movsd xmm1, xmm4 + +; rtail = npi2 * piby2_2; + mulsd xmm3, xmm2 + +; rhead = t - rtail; + mulsd xmm5, xmm2 ; npi2 * piby2_2tail + subsd xmm4, xmm3 ; rhead + +; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); + subsd xmm1, xmm4 ; t - rhead + subsd xmm1, xmm3 ; -rtail + subsd xmm5, xmm1 ; rtail + +; r = rhead - rtail; + movsd xmm0, xmm4 + +;HARSHA +;xmm1=rtail + movsd xmm1, xmm5 + subsd xmm0, xmm5 + +; xmm0=r, xmm4=rhead, xmm1=rtail + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +Lsinf_sse2_expdiff_le_15: + cmp rcx, 03f2h ; is r < 2^-13 ? + jge Lsinf_sse2_calc_sincosf_piby4 ; use taylor series if not + cmp rcx, 03deh ; if r really small. + jle Lsinf_sse2_r_very_small ; then sinf(r) ~ r or 1 + + movsd xmm2, xmm0 + mulsd xmm2, xmm0 ; xmm2 <-- r^2 + +;; if region is 0 or 2 do a sinf calc. + and r8d, eax + jnz Lsinf_sse2_small_calc_sin + +; region 0 or 2 do a sinf calculation +; use simply polynomial +; x - x*x*x*0.166666666666666666; + movsd xmm3, QWORD PTR L_one_sixth + mulsd xmm3, xmm0 ; * x + mulsd xmm3, xmm2 ; * x^2 + subsd xmm0, xmm3 ; xs + jmp Lsinf_sse2_adjust_region + +ALIGN 16 +Lsinf_sse2_small_calc_sin: +; region 1 or 3 do a cosf calculation +; use simply polynomial +; 1.0 - x*x*0.5; + movsd xmm0, QWORD PTR L_one ; 1.0 + mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2 + subsd xmm0, xmm2 ; xc + jmp Lsinf_sse2_adjust_region + +ALIGN 16 +Lsinf_sse2_r_very_small: +;; if region is 0 or 2 do a sinf calc. (sinf ~ x) + and r8d, eax + jz Lsinf_sse2_adjust_region + + movsd xmm0, QWORD PTR L_one ; cosf(r) is a 1 + jmp Lsinf_sse2_adjust_region + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +ALIGN 16 +Lsinf_sse2_reduce_large: +; Reduce x into range [-pi/4, pi/4] +; __remainder_piby2d2f_forC(x, &r, ®ion); + + mov QWORD PTR p_temp[rsp], r11 + lea rdx, QWORD PTR r[rsp] + lea r8, QWORD PTR region[rsp] + movd rcx, xmm0 + call __remainder_piby2d2f_forC + mov r11, QWORD PTR p_temp[rsp] + mov r8d, 1 ; for determining region later on + movsd xmm1, QWORD PTR r[rsp] ; x + mov eax, DWORD PTR region[rsp] ; region + +; xmm0 = x, xmm4 = xx, r8d = 1, eax= region +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; perform taylor series to calc sinfx, cosfx +Lsinf_sse2_calc_sincosf_piby4: +; x2 = r * r; + movsd xmm2, xmm0 + mulsd xmm2, xmm0 ; x2 + +;; if region is 1 or 3, do a cosf calc. + and r8d, eax + jnz Lsinf_sse2_do_cosf_calc + +; region is 0 or 2: do a sinf calc. 
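+; (s1..s4 below are approximately the sine Taylor coefficients
+;  -1/3!, 1/5!, -1/7!, 1/9!)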
+; zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4)); +Lsinf_sse2_do_sinf_calc: + movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 + mulsd xmm1, xmm2 ; s4x2 + movsd xmm4, xmm2 ; move for x4 + mulsd xmm4, xmm2 ; x4 + movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 + mulsd xmm5, xmm2 ; s2x2 + movsd xmm3, xmm0 ; move for x3 + mulsd xmm3, xmm2 ; x3 + addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 + mulsd xmm1, xmm4 ; s3x4+s4x6 + addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 + addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 + mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) + addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) + jmp Lsinf_sse2_adjust_region + +ALIGN 16 +Lsinf_sse2_do_cosf_calc: + +; region 1 or 3 - do a cosf calculation +; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8; +; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision + movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4 + movsd xmm4, xmm2 ; move for x4 + mulsd xmm1, xmm2 ; c4x2 + movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2 + mulsd xmm4, xmm2 ; x4 + movsd xmm0, QWORD PTR __Lcosfarray ; c0 + mulsd xmm3, xmm2 ; c2x2 + mulsd xmm0, xmm2 ; c0x2 (=-0.5x2) + addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2 + mulsd xmm1, xmm4 ; c3x4 + c4x6 + addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2 + addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6 + mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + addsd xmm0, QWORD PTR L_one ; 1 - 0.5x2 + addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +Lsinf_sse2_adjust_region: +; positive or negative +; switch (region) + shr eax, 1 + mov ecx, eax + and eax, r11d + + not ecx + not r11d + and ecx, r11d + + or eax, ecx + and eax, 1 + jnz Lsinf_sse2_cleanup + +;; if the original region 0, 1 and arg is negative, then we negate the result. +;; if the original region 2, 3 and arg is positive, then we negate the result. 
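+;; (This is the same parity test as in sin.asm: negate when bit 1 of the
+;;  region differs from the sign of the original argument.)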
+ movsd xmm2, xmm0 + xorpd xmm0, xmm0 + subsd xmm0, xmm2 + + +Lsinf_sse2_cleanup: + cvtsd2ss xmm0, xmm0 + StackDeallocate stack_size + ret + +ALIGN 16 +Lsinf_sse2_naninf: + call fname_special + StackDeallocate stack_size + ret + +ALIGN 16 +Lsinf_fma3: + vmovd eax,xmm0 + mov r8d,L_inf_mask_32 + and eax,r8d + cmp eax, r8d + jz Lsinf_fma3_naninf + + vcvtss2sd xmm5,xmm0,xmm0 + vmovq r9,xmm5 + btr r9,63 ; r9 <-- |x| + cmp r9,L_piby4 + jg Lsinf_fma3_range_reduce + + cmp r9,L_mask_3f8 + jge Lsinf_fma3_compute_sinf_piby_4 + + cmp r9,L_mask_3f2 + jge Lsinf_fma3_compute_x_xxx_0_1666 + + ; Here |x| < 2^-13; just return sin x ~ x + StackDeallocate stack_size + ret + +ALIGN 16 +Lsinf_fma3_compute_x_xxx_0_1666: + ; Here |x| < 2^-7; return sin x ~ x + 1/6 x^3 + vmulsd xmm1,xmm5,xmm5 + vmulsd xmm0,xmm1,xmm5 ; xmm1 <-- x^3 + vfnmadd132sd xmm0,xmm5,L_one_sixth ; x - x*x*x*0.166666666666666666 + jmp Lsinf_fma3_return_sinf_s + +ALIGN 16 +Lsinf_fma3_compute_sinf_piby_4: + vmovapd xmm0,xmm5 + vmovsd xmm1,__Lsinfarray+010h + vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x^2 + vfmadd231sd xmm1,xmm3,__Lsinfarray+018h + vfmadd213sd xmm1,xmm3,__Lsinfarray+08h + vfmadd213sd xmm1,xmm3,__Lsinfarray + vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 + vfmadd231sd xmm0,xmm1,xmm3 + jmp Lsinf_fma3_return_sinf_s + +ALIGN 16 +Lsinf_fma3_range_reduce: + vmovq xmm0,r9 ; xmm0 <-- |x| + cmp r9,L_large_x_fma3 + jge Lsinf_fma3_reduce_large + +Lsinf_fma3_sinf_reduce_moderate: + vandpd xmm1,xmm0,L_sign_mask ; xmm1 <-- |x| mov should suffice WAT + vmovapd xmm2,L_twobypi + vfmadd213sd xmm2,xmm1,L_one_half + vcvttpd2dq xmm2,xmm2 + vpmovsxdq xmm1,xmm2 + vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region + vshufps xmm1 ,xmm1,xmm1,8 + vcvtdq2pd xmm1,xmm1 + vmovdqa xmm2,xmm0 + vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 <-- rhead + vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 <-- rtail + vsubsd xmm0,xmm2,xmm3 ; xmm0 <-- r_1 + vsubsd xmm2,xmm2,xmm0 + vsubsd xmm1,xmm2,xmm3 ; xmm4 <-- rr_1 + jmp Lsinf_fma3_exit_s + +ALIGN 16 +Lsinf_fma3_reduce_large: + lea r9,__L_2_by_pi_bits + ;xexp = (x >> 52) 1023 + vmovq r11,xmm0 + mov rcx,r11 + shr r11,52 + sub r11,1023 ; r11 <-- xexp = exponent of input x + ;calculate the last byte from which to start multiplication + ;last = 134 (xexp >> 3) + mov r10,r11 + shr r10,3 + sub r10,134 ;r10 = last + neg r10 ;r10 = last + ;load 64 bits of 2_by_pi + mov rax,[r9+r10] + ;mantissa of x = ((x << 12) >> 12) | implied bit + shl rcx,12 + shr rcx,12 ;rcx = mantissa part of input x + bts rcx,52 ;add the implied bit as well + ;load next 128 bits of 2_by_pi + add r10,8 ;increment to next 8 bytes of 2_by_pi + vmovdqu xmm0,XMMWORD PTR[r9+r10] + ;do three 64bit multiplications with mant of x + mul rcx + mov r8,rax ; r8 <-- last 64 bits of mul = res1[2] + mov r10,rdx ; r10 <-- carry + vmovq rax,xmm0 + mul rcx + ;resexp = xexp & 7 + and r11,7 ; r11 <-- resexp = last 3 bits + psrldq xmm0,8 + add rax,r10 ; add the previous carry + adc rdx,0 + mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] + mov r10,rdx ; r10 <-- carry + vmovq rax,xmm0 + mul rcx + add r10,rax ; r10 = most sig 64 bits = res1[0] + ;find the region + ;last three bits ltb = most sig bits >> (54 resexp)) + ; decimal point in last 18 bits == 8 lsb's in first 64 bits + ; and 8 msb's in next 64 bits + ;point_five = ltb & 01h; + ;region = ((ltb >> 1) + point_five) & 3; + mov rcx,54 + mov rax,r10 + sub rcx,r11 + xor rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi) + shr rax,cl + jnc Lsinf_fma3_no_point_five_f + ;;if there is carry.. 
then negate the result of multiplication + not r10 + not r9 + not r8 + mov rdx,08000000000000000h + +Lsinf_fma3_no_point_five_f: + adc rax,0 + and rax,3 + vmovd xmm4,eax ;store region to xmm4 + ;calculate the number of integer bits and zero them out + mov rcx,r11 + add rcx,10 ; rcx <-- no. of integer bits + shl r10,cl + shr r10,cl ; r10 contains only mant bits + sub rcx,64 ; form the exponent + mov r11,rcx + ;find the highest set bit + bsr rcx,r10 + jnz Lsinf_fma3_form_mantissa_f + mov r10,r9 + mov r9,r8 + mov r8,0 + bsr rcx,r10 ; rcx <-- hsb + sub r11,64 + +Lsinf_fma3_form_mantissa_f: + add r11,rcx ;for exp of x + sub rcx,52 ;rcx = no. of bits to shift in r10 + cmp rcx,0 + jl Lsinf_fma3_hsb_below_52_f + je Lsinf_fma3_form_numbers_f + ;hsb above 52 + mov r8,r10 ; previous contents of r8 not required + shr r10,cl ; r10 = mantissa of x with hsb at 52 + shr r9,cl ; make space for bits from r10 + sub rcx,64 + neg rcx + shl r8,cl + or r9,r8 ; r9 = mantissa bits of xx + jmp Lsinf_fma3_form_numbers_f + +ALIGN 16 +Lsinf_fma3_hsb_below_52_f: + neg rcx + mov rax,r9 + shl r10,cl + shl r9,cl + sub rcx,64 + neg rcx + shr rax,cl + or r10,rax + shr r8,cl + or r9,r8 + +ALIGN 16 +Lsinf_fma3_form_numbers_f: + add r11,1023 + btr r10,52 ; remove the implied bit + mov rcx,r11 + or r10,rdx ; put the sign + shl rcx,52 + or r10,rcx ; r10 <-- x + vmovq xmm0,r10 ; xmm0 <-- x + vmulsd xmm0,xmm0,L_piby2_lead +Lsinf_fma3_exit_s: + vmovq rax,xmm4 + and rax,01h + cmp rax,01h + jz Lsinf_fma3_cos_piby4_compute + +Lsinf_fma3_sin_piby4_compute: +;; vmovapd xmm1,__Lsinfarray+010h + vmovsd xmm1,__Lsinfarray+010h + vmulsd xmm3,xmm0,xmm0 + vfmadd231sd xmm1,xmm3,__Lsinfarray+018h + vfmadd213sd xmm1,xmm3,__Lsinfarray+008h + vfmadd213sd xmm1,xmm3,__Lsinfarray + vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 + vfmadd231sd xmm0,xmm1,xmm3 + jmp Lsinf_fma3_exit_s_1 + +ALIGN 16 +Lsinf_fma3_cos_piby4_compute: + vmovapd xmm2,L_one + vmulsd xmm3,xmm0,xmm0 + vfmadd231sd xmm2,xmm3,__Lcosfarray ; xmm2 <-- 1 + c0 x^2 + ; would simple Horner's be slower? + vmovsd xmm1,__Lcosfarray+018h ; xmm1 <-- c3 + vfmadd231sd xmm1,xmm3,__Lcosfarray+020h ; xmm1 <-- c4 x^2+ c3 + vfmadd213sd xmm1,xmm3,__Lcosfarray+010h ; xmm1 <-- (c4 x^2+ c3)x^2 + c2 + vfmadd213sd xmm1,xmm3,__Lcosfarray+008h ; xmm1 <-- ((c4 x^2+ c3)x^2 + c2)x^2 + c1 + vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 + vmovdqa xmm0,xmm2 + vfmadd231sd xmm0,xmm1,xmm3 +Lsinf_fma3_exit_s_1: + ; assuming FMA3 ==> AVX ==> SSE4.1 + vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two + vpcmpeqq xmm3,xmm4,XMMWORD PTR L_int_three + vorpd xmm3,xmm2,xmm3 + vandnpd xmm3,xmm3,L_signbit + vxorpd xmm0,xmm0,xmm3 + + vandnpd xmm1,xmm5,L_signbit + vxorpd xmm0,xmm1,xmm0 +Lsinf_fma3_return_sinf_s: + vcvtsd2ss xmm0,xmm0,xmm0 + StackDeallocate stack_size + ret + +Lsinf_fma3_naninf: + call fname_special + StackDeallocate stack_size + ret + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/sinh.c b/sdk/lib/crt/math/libm_sse2/sinh.c new file mode 100644 index 00000000000..5b628dfbc19 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/sinh.c @@ -0,0 +1,340 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_SPLITEXP +#define USE_SCALEDOUBLE_1 +#define USE_SCALEDOUBLE_2 +#define USE_INFINITY_WITH_FLAGS +#define USE_VAL_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_SPLITEXP +#undef USE_SCALEDOUBLE_1 +#undef USE_SCALEDOUBLE_2 +#undef USE_INFINITY_WITH_FLAGS +#undef USE_VAL_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + + +#pragma function(sinh) +double sinh(double x) +{ + /* + After dealing with special cases the computation is split into + regions as follows: + + abs(x) >= max_sinh_arg: + sinh(x) = sign(x)*Inf + + abs(x) >= small_threshold: + sinh(x) = sign(x)*exp(abs(x))/2 computed using the + splitexp and scaleDouble functions as for exp_amd(). + + abs(x) < small_threshold: + compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + sinh(x) is then sign(x)*z. */ + + static const double + max_sinh_arg = 7.10475860073943977113e+02, /* 0x408633ce8fb9f87e */ + thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */ + log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */ + log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */ + small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889; + /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */ + + /* Lead and tail tabulated values of sinh(i) and cosh(i) + for i = 0,...,36. The lead part has 26 leading bits. 
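+       Keeping only 26 bits in the lead values means that products such as
+       cosh_lead[ind]*sdy1 below, where sdy1 is also truncated to 26
+       significant bits, are exact in double precision.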
*/ + + static const double sinh_lead[37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.17520117759704589844e+00, /* 0x3ff2cd9fc0000000 */ + 3.62686038017272949219e+00, /* 0x400d03cf60000000 */ + 1.00178747177124023438e+01, /* 0x40240926e0000000 */ + 2.72899169921875000000e+01, /* 0x403b4a3800000000 */ + 7.42032089233398437500e+01, /* 0x40528d0160000000 */ + 2.01713153839111328125e+02, /* 0x406936d228000000 */ + 5.48316116333007812500e+02, /* 0x4081228768000000 */ + 1.49047882080078125000e+03, /* 0x409749ea50000000 */ + 4.05154187011718750000e+03, /* 0x40afa71570000000 */ + 1.10132326660156250000e+04, /* 0x40c5829dc8000000 */ + 2.99370708007812500000e+04, /* 0x40dd3c4488000000 */ + 8.13773945312500000000e+04, /* 0x40f3de1650000000 */ + 2.21206695312500000000e+05, /* 0x410b00b590000000 */ + 6.01302140625000000000e+05, /* 0x412259ac48000000 */ + 1.63450865625000000000e+06, /* 0x4138f0cca8000000 */ + 4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */ + 1.20774762500000000000e+07, /* 0x4167093488000000 */ + 3.28299845000000000000e+07, /* 0x417f4f2208000000 */ + 8.92411500000000000000e+07, /* 0x419546d8f8000000 */ + 2.42582596000000000000e+08, /* 0x41aceb0888000000 */ + 6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */ + 1.79245641600000000000e+09, /* 0x41dab5adb8000000 */ + 4.87240166400000000000e+09, /* 0x41f226af30000000 */ + 1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */ + 3.60024494080000000000e+10, /* 0x4220c3d390000000 */ + 9.78648043520000000000e+10, /* 0x4236c93268000000 */ + 2.66024116224000000000e+11, /* 0x424ef822f0000000 */ + 7.23128516608000000000e+11, /* 0x42650bba30000000 */ + 1.96566712320000000000e+12, /* 0x427c9aae40000000 */ + 5.34323724288000000000e+12, /* 0x4293704708000000 */ + 1.45244246507520000000e+13, /* 0x42aa6b7658000000 */ + 3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */ + 1.07321789251584000000e+14, /* 0x42d866f348000000 */ + 2.91730863685632000000e+14, /* 0x42f0953e28000000 */ + 7.93006722514944000000e+14, /* 0x430689e220000000 */ + 2.15561576592179200000e+15}; /* 0x431ea215a0000000 */ + + static const double sinh_tail[37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.60467555584448807892e-08, /* 0x3e513ae6096a0092 */ + 2.76742892754807136947e-08, /* 0x3e5db70cfb79a640 */ + 2.09697499555224576530e-07, /* 0x3e8c2526b66dc067 */ + 2.04940252448908240062e-07, /* 0x3e8b81b18647f380 */ + 1.65444891522700935932e-06, /* 0x3ebbc1cdd1e1eb08 */ + 3.53116789999998198721e-06, /* 0x3ecd9f201534fb09 */ + 6.94023870987375490695e-06, /* 0x3edd1c064a4e9954 */ + 4.98876893611587449271e-06, /* 0x3ed4eca65d06ea74 */ + 3.19656024605152215752e-05, /* 0x3f00c259bcc0ecc5 */ + 2.08687768377236501204e-04, /* 0x3f2b5a6647cf9016 */ + 4.84668088325403796299e-05, /* 0x3f09691adefb0870 */ + 1.17517985422733832468e-03, /* 0x3f53410fc29cde38 */ + 6.90830086959560562415e-04, /* 0x3f46a31a50b6fb3c */ + 1.45697262451506548420e-03, /* 0x3f57defc71805c40 */ + 2.99859023684906737806e-02, /* 0x3f9eb49fd80e0bab */ + 1.02538800507941396667e-02, /* 0x3f84fffc7bcd5920 */ + 1.26787628407699110022e-01, /* 0x3fc03a93b6c63435 */ + 6.86652479544033744752e-02, /* 0x3fb1940bb255fd1c */ + 4.81593627621056619148e-01, /* 0x3fded26e14260b50 */ + 1.70489513795397629181e+00, /* 0x3ffb47401fc9f2a2 */ + 1.12416073482258713767e+01, /* 0x40267bb3f55634f1 */ + 7.06579578070110514432e+00, /* 0x401c435ff8194ddc */ + 5.91244512999659974639e+01, /* 0x404d8fee052ba63a */ + 1.68921736147050694399e+02, /* 0x40651d7edccde3f6 */ + 2.60692936262073658327e+02, /* 0x40704b1644557d1a */ + 
3.62419382134885609048e+02, /* 0x4076a6b5ca0a9dc4 */ + 4.07689930834187271103e+03, /* 0x40afd9cc72249aba */ + 1.55377375868385224749e+04, /* 0x40ce58de693edab5 */ + 2.53720210371943067003e+04, /* 0x40d8c70158ac6363 */ + 4.78822310734952334315e+04, /* 0x40e7614764f43e20 */ + 1.81871712615542812273e+05, /* 0x4106337db36fc718 */ + 5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */ + 6.41374032312148716301e+05, /* 0x412392bc108b37cc */ + 7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */ + 3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */ + 7.63580561355670914054e+06}; /* 0x415d20d76744835c */ + + static const double cosh_lead[37] = { + 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ + 1.54308062791824340820e+00, /* 0x3ff8b07550000000 */ + 3.76219564676284790039e+00, /* 0x400e18fa08000000 */ + 1.00676617622375488281e+01, /* 0x402422a490000000 */ + 2.73082327842712402344e+01, /* 0x403b4ee858000000 */ + 7.42099475860595703125e+01, /* 0x40528d6fc8000000 */ + 2.01715633392333984375e+02, /* 0x406936e678000000 */ + 5.48317031860351562500e+02, /* 0x4081228948000000 */ + 1.49047915649414062500e+03, /* 0x409749eaa8000000 */ + 4.05154199218750000000e+03, /* 0x40afa71580000000 */ + 1.10132329101562500000e+04, /* 0x40c5829dd0000000 */ + 2.99370708007812500000e+04, /* 0x40dd3c4488000000 */ + 8.13773945312500000000e+04, /* 0x40f3de1650000000 */ + 2.21206695312500000000e+05, /* 0x410b00b590000000 */ + 6.01302140625000000000e+05, /* 0x412259ac48000000 */ + 1.63450865625000000000e+06, /* 0x4138f0cca8000000 */ + 4.44305525000000000000e+06, /* 0x4150f2ebd0000000 */ + 1.20774762500000000000e+07, /* 0x4167093488000000 */ + 3.28299845000000000000e+07, /* 0x417f4f2208000000 */ + 8.92411500000000000000e+07, /* 0x419546d8f8000000 */ + 2.42582596000000000000e+08, /* 0x41aceb0888000000 */ + 6.59407856000000000000e+08, /* 0x41c3a6e1f8000000 */ + 1.79245641600000000000e+09, /* 0x41dab5adb8000000 */ + 4.87240166400000000000e+09, /* 0x41f226af30000000 */ + 1.32445608960000000000e+10, /* 0x4208ab7fb0000000 */ + 3.60024494080000000000e+10, /* 0x4220c3d390000000 */ + 9.78648043520000000000e+10, /* 0x4236c93268000000 */ + 2.66024116224000000000e+11, /* 0x424ef822f0000000 */ + 7.23128516608000000000e+11, /* 0x42650bba30000000 */ + 1.96566712320000000000e+12, /* 0x427c9aae40000000 */ + 5.34323724288000000000e+12, /* 0x4293704708000000 */ + 1.45244246507520000000e+13, /* 0x42aa6b7658000000 */ + 3.94814795284480000000e+13, /* 0x42c1f43fc8000000 */ + 1.07321789251584000000e+14, /* 0x42d866f348000000 */ + 2.91730863685632000000e+14, /* 0x42f0953e28000000 */ + 7.93006722514944000000e+14, /* 0x430689e220000000 */ + 2.15561576592179200000e+15}; /* 0x431ea215a0000000 */ + + static const double cosh_tail[37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 6.89700037027478056904e-09, /* 0x3e3d9f5504c2bd28 */ + 4.43207835591715833630e-08, /* 0x3e67cb66f0a4c9fd */ + 2.33540217013828929694e-07, /* 0x3e8f58617928e588 */ + 5.17452463948269748331e-08, /* 0x3e6bc7d000c38d48 */ + 9.38728274131605919153e-07, /* 0x3eaf7f9d4e329998 */ + 2.73012191010840495544e-06, /* 0x3ec6e6e464885269 */ + 3.29486051438996307950e-06, /* 0x3ecba3a8b946c154 */ + 4.75803746362771416375e-06, /* 0x3ed3f4e76110d5a4 */ + 3.33050940471947692369e-05, /* 0x3f017622515a3e2b */ + 9.94707313972136215365e-06, /* 0x3ee4dc4b528af3d0 */ + 6.51685096227860253398e-05, /* 0x3f11156278615e10 */ + 1.18132406658066663359e-03, /* 0x3f535ad50ed821f5 */ + 6.93090416366541877541e-04, /* 0x3f46b61055f2935c */ + 1.45780415323416845386e-03, /* 0x3f57e2794a601240 */ + 
2.99862082708111758744e-02, /* 0x3f9eb4b45f6aadd3 */ + 1.02539925859688602072e-02, /* 0x3f85000b967b3698 */ + 1.26787669807076286421e-01, /* 0x3fc03a940fadc092 */ + 6.86652631843830962843e-02, /* 0x3fb1940bf3bf874c */ + 4.81593633223853068159e-01, /* 0x3fded26e1a2a2110 */ + 1.70489514001513020602e+00, /* 0x3ffb4740205796d6 */ + 1.12416073489841270572e+01, /* 0x40267bb3f55cb85d */ + 7.06579578098005001152e+00, /* 0x401c435ff81e18ac */ + 5.91244513000686140458e+01, /* 0x404d8fee052bdea4 */ + 1.68921736147088438429e+02, /* 0x40651d7edccde926 */ + 2.60692936262087528121e+02, /* 0x40704b1644557e0e */ + 3.62419382134890611269e+02, /* 0x4076a6b5ca0a9e1c */ + 4.07689930834187453002e+03, /* 0x40afd9cc72249abe */ + 1.55377375868385224749e+04, /* 0x40ce58de693edab5 */ + 2.53720210371943103382e+04, /* 0x40d8c70158ac6364 */ + 4.78822310734952334315e+04, /* 0x40e7614764f43e20 */ + 1.81871712615542812273e+05, /* 0x4106337db36fc718 */ + 5.62892347580489004031e+05, /* 0x41212d98b1f611e2 */ + 6.41374032312148716301e+05, /* 0x412392bc108b37cc */ + 7.57809544070145115256e+06, /* 0x415ce87bdc3473dc */ + 3.64177136406482197344e+06, /* 0x414bc8d5ae99ad14 */ + 7.63580561355670914054e+06}; /* 0x415d20d76744835c */ + + unsigned long ux, aux, xneg; + double y, z, z1, z2; + int m; + + /* Special cases */ + + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + if (aux < 0x3e30000000000000) /* |x| small enough that sinh(x) = x */ + { + if (aux == 0) + /* with no inexact */ + return x; + else + return val_with_flags(x, AMD_F_INEXACT); + } + else if (aux >= 0x7ff0000000000000) /* |x| is NaN or Inf */ + { + if (aux > 0x7ff0000000000000) + /* x is NaN */ + return _handle_error("sinh", OP_SINH, ux|0x0008000000000000, _DOMAIN, + 0, EDOM, x, 0.0, 1); + else + return x + x; + } + + + xneg = (aux != ux); + + y = x; + if (xneg) y = -x; + + if (y >= max_sinh_arg) + { + if (xneg) + return _handle_error("sinh", OP_SINH, NINFBITPATT_DP64, _OVERFLOW, + AMD_F_OVERFLOW, ERANGE, x, 0.0, 1); + else + return _handle_error("sinh", OP_SINH, PINFBITPATT_DP64, _OVERFLOW, + AMD_F_OVERFLOW, ERANGE, x, 0.0, 1); + } + else if (y >= small_threshold) + { + /* In this range y is large enough so that + the negative exponential is negligible, + so sinh(y) is approximated by sign(x)*exp(y)/2. The + code below is an inlined version of that from + exp() with two changes (it operates on + y instead of x, and the division by 2 is + done by reducing m by 1). */ + + splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead, + log2_by_32_tail, &m, &z1, &z2); + m -= 1; + + if (m >= EMIN_DP64 && m <= EMAX_DP64) + z = scaleDouble_1((z1+z2),m); + else + z = scaleDouble_2((z1+z2),m); + } + else + { + /* In this range we find the integer part y0 of y + and the increment dy = y - y0. We then compute + + z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) + + where sinh(y0) and cosh(y0) are tabulated above. 
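+
+       For example, with y = 2.7 the table index is ind = 2 and dy = 0.7, so
+       sinh(2.7) = sinh(2)*cosh(0.7) + cosh(2)*sinh(0.7); sinh(dy) and
+       cosh(dy) come from the short polynomial expansions of sinh(dy)-dy and
+       cosh(dy)-1 evaluated below.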
*/ + + int ind; + double dy, dy2, sdy, cdy, sdy1, sdy2; + + ind = (int)y; + dy = y - ind; + + dy2 = dy*dy; + sdy = dy*dy2*(0.166666666666666667013899e0 + + (0.833333333333329931873097e-2 + + (0.198412698413242405162014e-3 + + (0.275573191913636406057211e-5 + + (0.250521176994133472333666e-7 + + (0.160576793121939886190847e-9 + + 0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + cdy = dy2*(0.500000000000000005911074e0 + + (0.416666666666660876512776e-1 + + (0.138888888889814854814536e-2 + + (0.248015872460622433115785e-4 + + (0.275573350756016588011357e-6 + + (0.208744349831471353536305e-8 + + 0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + /* At this point sinh(dy) is approximated by dy + sdy. + Shift some significant bits from dy to sdy. */ + + GET_BITS_DP64(dy, ux); + ux &= 0xfffffffff8000000; + PUT_BITS_DP64(ux, sdy1); + sdy2 = sdy + (dy - sdy1); + + z = ((((((cosh_tail[ind]*sdy2 + sinh_tail[ind]*cdy) + + cosh_tail[ind]*sdy1) + sinh_tail[ind]) + + cosh_lead[ind]*sdy2) + sinh_lead[ind]*cdy) + + cosh_lead[ind]*sdy1) + sinh_lead[ind]; + } + + if (xneg) z = - z; + return z; +} diff --git a/sdk/lib/crt/math/libm_sse2/sinhf.c b/sdk/lib/crt/math/libm_sse2/sinhf.c new file mode 100644 index 00000000000..ea6f6761d8f --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/sinhf.c @@ -0,0 +1,256 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_SPLITEXP +#define USE_SCALEDOUBLE_1 +#define USE_INFINITY_WITH_FLAGS +#define USE_VALF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_SPLITEXP +#undef USE_SCALEDOUBLE_1 +#undef USE_INFINITY_WITH_FLAGS +#undef USE_VALF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. +#pragma warning(disable:4163) +#pragma function(sinhf) + + +float sinhf(float fx) +{ + /* + After dealing with special cases the computation is split into + regions as follows: + + abs(x) >= max_sinh_arg: + sinh(x) = sign(x)*Inf + + abs(x) >= small_threshold: + sinh(x) = sign(x)*exp(abs(x))/2 computed using the + splitexp and scaleDouble functions as for exp_amd(). 
+ + abs(x) < small_threshold: + compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + sinh(x) is then sign(x)*z. */ + + static const double + /* The max argument of sinhf, but stored as a double */ + max_sinh_arg = 8.94159862922329438106e+01, /* 0x40565a9f84f82e63 */ + thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */ + log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */ + log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */ + small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889; + /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */ + + /* Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. */ + + static const double sinh_lead[37] = { + 0.00000000000000000000e+00, /* 0x0000000000000000 */ + 1.17520119364380137839e+00, /* 0x3ff2cd9fc44eb982 */ + 3.62686040784701857476e+00, /* 0x400d03cf63b6e19f */ + 1.00178749274099008204e+01, /* 0x40240926e70949ad */ + 2.72899171971277496596e+01, /* 0x403b4a3803703630 */ + 7.42032105777887522891e+01, /* 0x40528d0166f07374 */ + 2.01713157370279219549e+02, /* 0x406936d22f67c805 */ + 5.48316123273246489589e+02, /* 0x408122876ba380c9 */ + 1.49047882578955000099e+03, /* 0x409749ea514eca65 */ + 4.05154190208278987484e+03, /* 0x40afa7157430966f */ + 1.10132328747033916443e+04, /* 0x40c5829dced69991 */ + 2.99370708492480553105e+04, /* 0x40dd3c4488cb48d6 */ + 8.13773957064298447222e+04, /* 0x40f3de1654d043f0 */ + 2.21206696003330085659e+05, /* 0x410b00b5916a31a5 */ + 6.01302142081972560845e+05, /* 0x412259ac48bef7e3 */ + 1.63450868623590236530e+06, /* 0x4138f0ccafad27f6 */ + 4.44305526025387924165e+06, /* 0x4150f2ebd0a7ffe3 */ + 1.20774763767876271158e+07, /* 0x416709348c0ea4ed */ + 3.28299845686652474105e+07, /* 0x417f4f22091940bb */ + 8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */ + 2.42582597704895108938e+08, /* 0x41aceb088b68e803 */ + 6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */ + 1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */ + 4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */ + 1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */ + 3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */ + 9.78648047144193725586e+10, /* 0x4236c932696a6b5c */ + 2.66024120300899291992e+11, /* 0x424ef822f7f6731c */ + 7.23128532145737548828e+11, /* 0x42650bba3796379a */ + 1.96566714857202099609e+12, /* 0x427c9aae4631c056 */ + 5.34323729076223046875e+12, /* 0x429370470aec28ec */ + 1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */ + 3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */ + 1.07321789892958031250e+14, /* 0x42d866f34a725782 */ + 2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */ + 7.93006726156715250000e+14, /* 0x430689e221bc8d5a */ + 2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */ + + static const double cosh_lead[37] = { + 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ + 1.54308063481524371241e+00, /* 0x3ff8b07551d9f550 */ + 3.76219569108363138810e+00, /* 0x400e18fa0df2d9bc */ + 1.00676619957777653269e+01, /* 0x402422a497d6185e */ + 2.73082328360164865444e+01, /* 0x403b4ee858de3e80 */ + 7.42099485247878334349e+01, /* 0x40528d6fcbeff3a9 */ + 2.01715636122455890700e+02, /* 0x406936e67db9b919 */ + 5.48317035155212010977e+02, /* 0x4081228949ba3a8b */ + 1.49047916125217807348e+03, /* 0x409749eaa93f4e76 */ + 4.05154202549259389343e+03, /* 0x40afa715845d8894 */ + 1.10132329201033226127e+04, /* 0x40c5829dd053712d */ + 2.99370708659497577173e+04, /* 0x40dd3c4489115627 */ + 8.13773957125740562333e+04, /* 0x40f3de1654d6b543 */ + 
2.21206696005590405548e+05, /* 0x410b00b5916b6105 */ + 6.01302142082804115489e+05, /* 0x412259ac48bf13ca */ + 1.63450868623620807193e+06, /* 0x4138f0ccafad2d17 */ + 4.44305526025399193168e+06, /* 0x4150f2ebd0a8005c */ + 1.20774763767876680940e+07, /* 0x416709348c0ea503 */ + 3.28299845686652623117e+07, /* 0x417f4f22091940bf */ + 8.92411504815936237574e+07, /* 0x419546d8f9ed26e1 */ + 2.42582597704895138741e+08, /* 0x41aceb088b68e804 */ + 6.59407867241607308388e+08, /* 0x41c3a6e1fd9eecfd */ + 1.79245642306579566002e+09, /* 0x41dab5adb9c435ff */ + 4.87240172312445068359e+09, /* 0x41f226af33b1fdc0 */ + 1.32445610649217357635e+10, /* 0x4208ab7fb5475fb7 */ + 3.60024496686929321289e+10, /* 0x4220c3d3920962c8 */ + 9.78648047144193725586e+10, /* 0x4236c932696a6b5c */ + 2.66024120300899291992e+11, /* 0x424ef822f7f6731c */ + 7.23128532145737548828e+11, /* 0x42650bba3796379a */ + 1.96566714857202099609e+12, /* 0x427c9aae4631c056 */ + 5.34323729076223046875e+12, /* 0x429370470aec28ec */ + 1.45244248326237109375e+13, /* 0x42aa6b765d8cdf6c */ + 3.94814800913403437500e+13, /* 0x42c1f43fcc4b662c */ + 1.07321789892958031250e+14, /* 0x42d866f34a725782 */ + 2.91730871263727437500e+14, /* 0x42f0953e2f3a1ef7 */ + 7.93006726156715250000e+14, /* 0x430689e221bc8d5a */ + 2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */ + + unsigned long ux, aux, xneg; + double x = fx, y, z, z1, z2; + int m; + + /* Special cases */ + + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + if (aux < 0x3f10000000000000) /* |x| small enough that sinh(x) = x */ + { + if (aux == 0) + /* with no inexact */ + return fx; + else + return valf_with_flags(fx, AMD_F_INEXACT); + } + else if (aux >= 0x7ff0000000000000) /* |x| is NaN or Inf */ + { + if (aux > 0x7ff0000000000000) + { + /* x is NaN */ + unsigned int uhx; + GET_BITS_SP32(fx, uhx); + return _handle_errorf("sinhf", OP_SINH, uhx|0x00400000, _DOMAIN, + 0, EDOM, fx, 0.0F, 1); + } + else + return fx + fx; + } + + xneg = (aux != ux); + + y = x; + if (xneg) y = -x; + + if (y >= max_sinh_arg) + { + /* Return infinity with overflow flag. */ + if (xneg) + return _handle_errorf("sinhf", OP_SINH, NINFBITPATT_SP32, _OVERFLOW, + AMD_F_OVERFLOW, ERANGE, fx, 0.0F, 1); + else + return _handle_errorf("sinhf", OP_SINH, PINFBITPATT_SP32, _OVERFLOW, + AMD_F_OVERFLOW, ERANGE, fx, 0.0F, 1); + } + else if (y >= small_threshold) + { + /* In this range y is large enough so that + the negative exponential is negligible, + so sinh(y) is approximated by sign(x)*exp(y)/2. The + code below is an inlined version of that from + exp() with two changes (it operates on + y instead of x, and the division by 2 is + done by reducing m by 1). */ + + splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead, + log2_by_32_tail, &m, &z1, &z2); + m -= 1; + /* scaleDouble_1 is always safe because the argument x was + float, rather than double */ + z = scaleDouble_1((z1+z2),m); + } + else + { + /* In this range we find the integer part y0 of y + and the increment dy = y - y0. We then compute + + z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) + + where sinh(y0) and cosh(y0) are tabulated above. 
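+
+       Compared with the double-precision sinh above, only the _lead tables
+       are kept here: the result is rounded to float, so the _tail
+       corrections and the splitting of dy are not needed.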
*/ + + int ind; + double dy, dy2, sdy, cdy; + + ind = (int)y; + dy = y - ind; + + dy2 = dy*dy; + + sdy = dy + dy*dy2*(0.166666666666666667013899e0 + + (0.833333333333329931873097e-2 + + (0.198412698413242405162014e-3 + + (0.275573191913636406057211e-5 + + (0.250521176994133472333666e-7 + + (0.160576793121939886190847e-9 + + 0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + cdy = 1 + dy2*(0.500000000000000005911074e0 + + (0.416666666666660876512776e-1 + + (0.138888888889814854814536e-2 + + (0.248015872460622433115785e-4 + + (0.275573350756016588011357e-6 + + (0.208744349831471353536305e-8 + + 0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2); + + z = sinh_lead[ind]*cdy + cosh_lead[ind]*sdy; + } + + if (xneg) z = - z; + return (float)z; +} diff --git a/sdk/lib/crt/math/libm_sse2/sqrt.c b/sdk/lib/crt/math/libm_sse2/sqrt.c new file mode 100644 index 00000000000..ddadadaedc8 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/sqrt.c @@ -0,0 +1,88 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#if USE_SOFTWARE_SQRT +#define USE_SQRT_AMD_INLINE +#endif +#define USE_NAN_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#if USE_SOFTWARE_SQRT +#undef USE_SQRT_AMD_INLINE +#endif +#undef USE_NAN_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + +#pragma function(sqrt) + +double sqrt(double x) +{ +#if USE_SOFTWARE_SQRT + return sqrt_amd_inline(x); +#else + double r; + unsigned long ux; + GET_BITS_DP64(x, ux); + + /* Check for special cases for Microsoft error handling */ + if ((ux & PINFBITPATT_DP64) == PINFBITPATT_DP64) + { + /* x is infinity, or NaN */ + if (ux & MANTBITS_DP64) + { + /* NaN of some sort */ + /* If it's a signaling NaN, convert to QNaN */ + return _handle_error("sqrt", OP_SQRT, ux|0x0008000000000000, + _DOMAIN, 0,EDOM, x, 0.0, 1); + } + else + { + /* +/-infinity */ + if (ux & SIGNBIT_DP64) + { + /* - infinity */ + return _handle_error("sqrt", OP_SQRT, INDEFBITPATT_DP64, + _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1); + } + /* positive infinite is not a problem */ + } + } + if ((ux & SIGNBIT_DP64)&&(ux & ~SIGNBIT_DP64)) /* if x < zero */ + { + return _handle_error("sqrt", OP_SQRT, INDEFBITPATT_DP64, + _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0, 1); + } + + /* VC++ intrinsic call */ + _mm_store_sd(&r, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&x))); + return r; +#endif +} diff --git a/sdk/lib/crt/math/libm_sse2/sqrtf.c b/sdk/lib/crt/math/libm_sse2/sqrtf.c new file mode 100644 index 00000000000..1cfb7fb906c --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/sqrtf.c @@ -0,0 +1,91 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#if USE_SOFTWARE_SQRT +#define USE_SQRTF_AMD_INLINE +#endif +#define USE_NANF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#if USE_SOFTWARE_SQRT +#undef USE_SQRTF_AMD_INLINE +#endif +#undef USE_NANF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. 
+#pragma warning(disable:4163) +#pragma function(sqrtf) + + +float sqrtf(float x) +{ +#if USE_SOFTWARE_SQRT + return sqrtf_amd_inline(x); +#else + float r; + unsigned int ux; + GET_BITS_SP32(x, ux); + /* Check for special cases for Microsoft error handling */ + if ((ux & PINFBITPATT_SP32) == PINFBITPATT_SP32) + { + /* x is infinity, or NaN */ + if (ux & MANTBITS_SP32) + { + /* NaN of some sort */ + /* If it's a signaling NaN, convert to QNaN */ + return _handle_errorf("sqrtf", OP_SQRT, ux|0x00400000, _DOMAIN, 0, + EDOM, x, 0.0F, 1); + } + else + { + /* +/-infinity */ + if (ux & SIGNBIT_SP32) + { + /* - infinity */ + return _handle_errorf("sqrtf", OP_SQRT, INDEFBITPATT_SP32, + _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0F, 1); + } + /* positive infinite is not a problem */ + } + } + if ((ux & SIGNBIT_SP32)&&(ux & ~SIGNBIT_SP32)) /* if x < zero */ + { + return _handle_errorf("sqrtf", OP_SQRT, INDEFBITPATT_SP32, + _DOMAIN, AMD_F_INVALID, EDOM, x, 0.0F, 1); + } + + /* VC++ intrinsic call */ + _mm_store_ss(&r, _mm_sqrt_ss(_mm_load_ss(&x))); + return r; +#endif +} diff --git a/sdk/lib/crt/math/libm_sse2/tan.asm b/sdk/lib/crt/math/libm_sse2/tan.asm new file mode 100644 index 00000000000..4742fbd479f --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/tan.asm @@ -0,0 +1,762 @@ +; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; An implementation of the tan function. +; +; Prototype: +; +; double tan(double x); +; +; Computes tan(x). +; It will provide proper C99 return values, +; but may not raise floating point status bits properly. +; Based on the NAG C implementation. +; +; If FMA3 hardware is present, it will be used for the calculation. 
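+;
+; The choice between the SSE2 and FMA3 code paths is made at run time by
+; testing the __use_fma3_lib flag (see the cmp/jne at the start of the
+; function body below).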
+; + +.const +ALIGN 16 +L_signbit DQ 08000000000000000h + DQ 08000000000000000h ; duplicate for pd + +L_sign_mask DQ 07FFFFFFFFFFFFFFFh + DQ 07FFFFFFFFFFFFFFFh ; duplicate for pd + +L_int_one DQ 00000000000000001h + DQ 00000000000000001h ; duplicate for pd + +L_twobypi DQ 03FE45F306DC9C883h + DQ 03FE45F306DC9C883h ; duplicate for pd + +L_point_333 DQ 03FD5555555555555h; 1/3 + DQ 03FD5555555555555h ; duplicate for pd + +L_tan_p0 DQ 03FD7D50F6638564Ah ; 0.372379159759792203640806338901e0 + DQ 03FD7D50F6638564Ah ; duplicate for pd + +L_tan_p2 DQ 0BF977C24C7569ABBh ; -0.229345080057565662883358588111e-1 + DQ 0BF977C24C7569ABBh ; duplicate for pd + +L_tan_p4 DQ 03F2D5DAF289C385Ah ; 0.224044448537022097264602535574e-3 + DQ 03F2D5DAF289C385Ah ; duplicate for pd + +L_tan_q0 DQ 03FF1DFCB8CAA40B8h ; 0.111713747927937668539901657944e1 + DQ 03FF1DFCB8CAA40B8h ; duplicate for pd + +L_tan_q2 DQ 0BFE08046499EB90Fh ; -0.515658515729031149329237816945e0 + DQ 0BFE08046499EB90Fh ; duplicate for pd + +L_tan_q4 DQ 03F9AB0F4F80A0ACFh ; 0.260656620398645407524064091208e-1 + DQ 03F9AB0F4F80A0ACFh ; duplicate for pd + +L_tan_q6 DQ 0BF2E7517EF6D98F8h ; -0.232371494088563558304549252913e-3 + DQ 0BF2E7517EF6D98F8h ; duplicate for pd + +L_half_mask DQ 0ffffffff00000000h + DQ 0ffffffff00000000h ; duplicate for pd + +L_piby4_lead DQ 03FE921FB54442D18h ; pi/4, high part + DQ 03FE921FB54442D18h ; duplicate for pd + +L_piby4_tail DQ 03C81A62633145C06h ; pi/4, low parft + DQ 03C81A62633145C06h ; duplicate for pd + +; Different parts of argument reduction need different versions of pi/2 + +L_piby2_1 DQ 03FF921FB54400000h ; pi/2, high 33 bits +L_piby2_1tail DQ 03DD0B4611A626331h ; pi/2, second 53 bits, overlaps... +L_piby2_2 DQ 03DD0B4611A600000h ; pi/2, second 33 bits +L_piby2_2tail DQ 03BA3198A2E037073h ; pi/2, third 53 bits, overlaps... 
+L_piby2_3 DQ 03BA3198A2E000000h ; pi/2, third 33 bits +L_piby2_3tail DQ 0397B839A252049C1h ; pi/2, fourth 53 bits + +; end of pi/2 versions + +L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27 +L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 + +L_inf_mask_64 DQ 07FF0000000000000h +L_point_five DQ 03FE0000000000000h +L_point_68 DQ 03FE5C28F5C28F5C3h ; .68 +L_n_point_68 DQ 0BFE5C28F5C28F5C3h ; -.68 + +L_zero DQ -0000000000000000h ; 0.0 +L_one DQ 03FF0000000000000h ; 1.0 +L_n_one DQ 0BFF0000000000000h ; -1.0 +L_two DQ 04000000000000000h ; 2.0 + +L_moderate_arg_cw DQ 0411E848000000000h ; 5.e5 +L_moderate_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL + +fname TEXTEQU +fname_special TEXTEQU <_tan_special> + +; local storage offsets +save_xmm6 EQU 020h +save_xmm7 EQU 030h +store_input EQU 040h +save_r10 EQU 050h +dummy_space EQU 060h +stack_size EQU 088h + +include fm.inc + +EXTERN __use_fma3_lib:DWORD +EXTERN fname_special : PROC +EXTERN __remainder_piby2_fma3 : PROC +EXTERN __remainder_piby2_fma3_bdl : PROC +EXTERN __remainder_piby2_forAsm : PROC +EXTERN _set_statfp : PROC + +.code +ALIGN 16 +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + SaveXmm xmm6, save_xmm6 + SaveXmm xmm7, save_xmm7 + .ENDPROLOG + cmp DWORD PTR __use_fma3_lib, 0 + jne Ltan_fma3 + +Ltan_sse2: + movd rdx, xmm0 ; really movq + movaps xmm6, xmm0 + mov rcx, rdx + btr rcx, 63 ; rcx <-- |x| + + cmp rcx, L_piby4_lead + ja Ltan_abs_x_nle_pio4 ; branch if > pi/4 or NaN + + + cmp rcx, L_two_to_neg_13 + jae Ltan_abs_x_ge_two_to_neg_13 + + cmp rcx, L_two_to_neg_27 + jae Labs_x_ge_two_to_neg_27 + + ; At this point tan(x) ~= x; if it's not exact, set the inexact flag + + test rcx, rcx + je Ltan_return + + mov ecx, 20h ; ecx <-- AMD_F_INEXACT + call _set_statfp + movaps xmm0, xmm6 ; may be redundant, but xmm0 <-- x + + RestoreXmm xmm7, save_xmm7 + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret 0 + +Labs_x_ge_two_to_neg_27: + + mulsd xmm0, xmm0 + mulsd xmm0, xmm6 + mulsd xmm0, QWORD PTR L_point_333 + + addsd xmm0, xmm6 + + RestoreXmm xmm7, save_xmm7 + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret 0 + +Ltan_abs_x_ge_two_to_neg_13: + xorps xmm1, xmm1 ; xmm1 <-- xx = 0 + xor r8d, r8d ; r8 <-- recip flag = 0 + call _tan_piby4 + +Ltan_return: + RestoreXmm xmm7, save_xmm7 + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret 0 + +Ltan_abs_x_nle_pio4: + + cmp rcx, L_inf_mask_64 ; |x| uint >= +inf as uint ? 
+ jnae Ltan_x_is_finite + + call fname_special + RestoreXmm xmm7, save_xmm7 + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Ltan_x_is_finite: + xor r8d, r8d + xor r10, r10 + cmp rcx, rdx + setne r10b ; r10 <-- x was negative flag + andpd xmm6, L_sign_mask + + movsd xmm0, QWORD PTR L_moderate_arg_cw ; currently 5e5 + comisd xmm0, xmm6 + jbe Ltan_x_is_very_large + +Ltan_x_is_moderate: ; unused label + + ; For these arguments we do a Cody-Waite reduction, subtracting the + ; appropriate multiple of pi/2, using extra precision where x is close + ; to an exact multiple of pi/2 + ; We special-case region setting for |x| <= 9pi/4 + ; It seems strange that this speeds things up, but it does + + mov rdx, rcx + + mov rax, 4616025215990052958 ; 400f6a7a2955385eH (5pi/4) + shr rdx, 52 ; rdx <-- xexp + cmp rcx, rax + ja Labs_x_gt_5pio4 + + mov rax, 4612488097114038738 ; 4002d97c7f3321d2H (3pi/4) + cmp rcx, rax + seta r8b + inc r8d ; r8d <-- region (1 or 2) + jmp Lhave_region + +Labs_x_gt_5pio4: + mov rax, 4619644535898419899 ; 401c463abeccb2bbH (9pi/4) + cmp rcx, rax + ja Lneed_region_computation + mov rax, 4617875976460412789 ; 4015fdbbe9bba775H (7pi/4) + cmp rcx, rax + seta r8b + add r8d, 3 ; r8d <-- region (3 or 4) + jmp Lhave_region + +ALIGN 16 +Lneed_region_computation: + movaps xmm0, xmm6 + mulsd xmm0, QWORD PTR L_twobypi + addsd xmm0, QWORD PTR L_point_five + cvttsd2si r8d, xmm0 ; r8d <-- region + +Lhave_region: + movd xmm3, r8d + cvtdq2pd xmm3, xmm3 + + movaps xmm2, xmm3 + movaps xmm0, xmm3 + mulsd xmm0, QWORD PTR L_piby2_1 + mulsd xmm2, QWORD PTR L_piby2_1tail ; xmm2 < rtail = npi2 * piby2_1tail + subsd xmm6, xmm0 ; xmm6 <-- rhead = x - npi2*piby2_1 + + ; If x is not too close to multiple of pi/2, + ; we're essentially done with reduction + ; If the exponent of rhead is not close to that of x, + ; then most of x has been subtracted away in computing rhead; + ; i.e., x is close to a multiple of pi/2. + + movd rax, xmm6 + shr rax, 52 + and eax, 2047 + sub rdx, rax ; rdx <-- exp diff of x vs rhead + + cmp rdx, 15 + jbe Ltan_have_rhead_rtail + + ; Oops, x is almost a multiple of pi/2. Compute more bits of reduced x + + ; t = rhead; + ; rtail = npi2 * piby2_2; + ; rhead = t - rtail; + ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); + + movaps xmm1, xmm6 + movaps xmm0, xmm3 + + movaps xmm2, xmm3 + mulsd xmm0, QWORD PTR L_piby2_2 + mulsd xmm2, QWORD PTR L_piby2_2tail + subsd xmm6, xmm0 + subsd xmm1, xmm6 + subsd xmm1, xmm0 + subsd xmm2, xmm1 + + cmp rdx, 48 + jbe Ltan_have_rhead_rtail ; We've done enough + + ; Wow, x is REALLY close to a multiple of pi/2. Compute more bits. 
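+
+ ; This is the third and last stage of the Cody-Waite reduction, using the
+ ; piby2_3/piby2_3tail constants defined above.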
+ + ; t = rhead; + ; rtail = npi2 * piby2_3; + ; rhead = t - rtail; + ; rtail = npi2 * piby2_3tail - ((t - rhead) - rtail); + + movaps xmm1, xmm6 + movaps xmm0, xmm3 + movaps xmm2, xmm3 + mulsd xmm0, QWORD PTR L_piby2_3 + mulsd xmm2, QWORD PTR L_piby2_3tail + subsd xmm6, xmm0 ; xmm6 <-- rhead = t - rtail + subsd xmm1, xmm6 ; xmm1 <-- t - rhead + subsd xmm1, xmm0 ; xmm1 <-- ((t - rhead) - rtail) + subsd xmm2, xmm1 ; xmm2 <-- final rtail + +Ltan_have_rhead_rtail: + + ; At this point xmm6 has a suitable rhead, xmm2 a suitable rtail + movaps xmm0, xmm6 ; xmm0 <-- copy of rhead + + ; r = rhead - rtail + ; rr = (rhead - r) - rtail; + ; region = npi2 & 3; + + and r8d, 3 ; r8d <-- region + subsd xmm0, xmm2 ; xmm0 <-- r = rhead - rtail + subsd xmm6, xmm0 ; xmm6 <-- rhead - r + subsd xmm6, xmm2 ; xmm6 <-- rr = (rhead - r) - rtail + +Ltan_do_tan_computation: + and r8d, 1 ; r8d <-- region & 1 + movaps xmm1, xmm6 + call _tan_piby4 + test r10d, r10d + je Ltan_pos_return + xorpd xmm0, QWORD PTR L_signbit +Ltan_pos_return: + RestoreXmm xmm7, save_xmm7 + RestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret 0 + +ALIGN 16 +Ltan_x_is_very_large: + ; Reduce x into range [-pi/4,pi/4] (general case) + movaps xmm0, xmm6 + mov QWORD PTR [rsp+save_r10], r10 + call __remainder_piby2_forAsm ; this call clobbers r10 + mov r10, QWORD PTR [rsp+save_r10] + movapd xmm6,xmm1 ; xmm6 <-- rr + mov r8d,eax ; r8d <-- region + jmp Ltan_do_tan_computation + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; From here on, it is assumed that the hardware supports FMA3 (and AVX). + +ALIGN 16 +Ltan_fma3: + vmovq r9,xmm0 + mov rdx,r9 ; rdx <-- x + btr r9,63 ; r9 <-- |x| + cmp r9,L_piby4_lead + jae Ltan_fma3_absx_gt_pio4 ; Note that NaN will branch + +Ltan_fma3_absx_le_pio4: + ; no argument reduction is needed, so recip is 0, xx is 0. + ; Note that this routine is not special-casing very small |x| + vmovsd xmm5,L_piby4_lead + vmovsd xmm6,L_piby4_tail + vxorpd xmm1,xmm1,xmm1 ; xx <-- 0. + vxorpd xmm7,xmm7,xmm7 ; transform <-- 0 + comisd xmm0,L_point_68 + jbe Ltan_fma3_small_x_le_point_68 +Ltan_fma3_x_small_gt_point_68: + vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0 + vsubsd xmm0,xmm5,xmm0 ; x = piby4_lead - x + vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail + jmp Ltan_fma3_compute_Remez_for_small_x +ALIGN 16 +Ltan_fma3_small_x_le_point_68: + comisd xmm0,L_n_point_68 + jae Ltan_fma3_compute_Remez_for_small_x +Ltan_fma3_small_x_lt_neg_point_68: + vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0 + vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x + vaddsd xmm0,xmm0,xmm6 ; xmm0 <-- x = x + xl = x + piby4_tail +Ltan_fma3_compute_Remez_for_small_x: + ; At this point xmm0 holds x, possibly transformed + + ; now do core Remez rational approximation for x in [0,0.68] + vmovsd xmm4,L_tan_q6 + vmovsd xmm3,L_tan_p4 + vmulsd xmm2,xmm0,xmm0 ; xx is 0, so xmm2 <-- r = x*x + vfmadd213sd xmm4,xmm2,L_tan_q4 + vfmadd213sd xmm3,xmm2,L_tan_p2 + vfmadd213sd xmm4,xmm2,L_tan_q2 + vfmadd213sd xmm3,xmm2,L_tan_p0 ; xmm3 <-- p2 (polynomial) + vfmadd213sd xmm4,xmm2,L_tan_q0 ; xmm4 <-- q3 (polynomial) + vdivsd xmm3,xmm3,xmm4 ; xmm3 <-- r3 = p2/q3 + vmulsd xmm3,xmm3,xmm2 ; xmm3 <-- r * r3 + vfmadd132sd xmm0,xmm0,xmm3 ; xx = 0, so xmm0 <-- t = x + x*(r*r3) + comisd xmm7,L_zero ; did we transform x? + ; if x was transformed, we need to transform t to get answer; + ; if not, the answer is just t. + je Ltan_fma3_ext_piby4_zero + + ; x was transformed, so answer is +- (1. 
- 2.*t/(1.+t)) + ; (remember recip is 0 here) + vmovsd xmm3,L_one + vaddsd xmm4,xmm0,L_one ; xmm4 <-- 1. + t + vdivsd xmm6,xmm0,xmm4 ; xmm6 <-- t / (1.+t) + vfnmadd231sd xmm3,xmm6,L_two ; xmm3 <-- 1. - 2.*t/(1.+t) + vmulsd xmm0,xmm3,xmm7 ; multiply by +- 1. + +Ltan_fma3_ext_piby4_zero: + ; restore volatile registers + AVXRestoreXmm xmm7, save_xmm7 + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret 0 + +ALIGN 16 +Ltan_fma3_absx_gt_pio4: ;;; come here if |x| > pi/4 + cmp r9, L_inf_mask_64 + jae Ltan_fma3_naninf + +;Ltan_fma3_range_reduce: + vmovapd [store_input + rsp],xmm0 ; save copy of x + vmovq xmm0,r9 ; xmm0l <-- |x| + cmp r9,L_moderate_arg_bdl + jge Ltan_fma3_remainder_piby2 ; go elsewhere if |x| > 500000. + + ; Note that __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl + ; have calling conventions that differ from the C routine + ; on input + ; |x| is in xmm0 + ; on output + ; z is in xmm0 + ; zz is in xmm1 + ; where z + zz = arg reduced |x| and zz is small compared to z + ; region of |x| is in rax + + Ltan_fma3_remainder_piby2_small: + ; Boldo-Daumas-Li reduction for reasonably small |x| + call __remainder_piby2_fma3_bdl + + +Ltan_fma3_full_computation: + ; we have done argument reduction; recip and xx may be nonzero + ; x is in xmm0, xx is in xmm1 + ; recip is region & 1, and region is in rax. + + vmovsd xmm5,L_piby4_lead + vmovsd xmm6,L_piby4_tail + + vxorpd xmm7,xmm7,xmm7 ; transform <-- 0 + vcomisd xmm0,L_point_68 + jbe Ltan_fma3_full_x_le_point_68 +Ltan_fma3_full_x_gt_point_68: + vmovsd xmm7,L_one ; xmm7 <-- transform = 1.0 + vsubsd xmm0,xmm5,xmm0 ; xmm0 <-- x = piby4_lead - x + vsubsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = pibi4_tail - xx + vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl + vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0 + jmp Ltan_fma3_compute_Remez +ALIGN 16 +Ltan_fma3_full_x_le_point_68: + vcomisd xmm0,L_n_point_68 + jae Ltan_fma3_compute_Remez +Ltan_fma3_full_x_lt_neg_point_68: + vmovsd xmm7,L_n_one ; xmm7 <-- transform = -1.0 + vaddsd xmm0,xmm5,xmm0 ; x = piby4_lead + x + vaddsd xmm2,xmm6,xmm1 ; xmm2 <-- xl = piby4_tail + xx + vaddsd xmm0,xmm0,xmm2 ; xmm0 <-- x = x + xl + vxorps xmm1,xmm1,xmm1 ; xmm1 <-- xx = 0 + +Ltan_fma3_compute_Remez: + vmulsd xmm2,xmm0,xmm0 ; xmm2 <-- x*x + vmulsd xmm5,xmm1,xmm0 ; xmm5 <-- x*xx + vfmadd132sd xmm5,xmm2,L_two ; xmm5 <-- r = x*x + 2.*x*xx + vmovsd xmm2,L_tan_p4 + vfmadd213sd xmm2,xmm5,L_tan_p2 ; xmm2 <-- p4*r+p2 + vfmadd213sd xmm2,xmm5,L_tan_p0 ; xmm2 <-- p = (p4*r+p2)*r+p0 + vmovsd xmm4,L_tan_q6 + vfmadd213sd xmm4,xmm5,L_tan_q4 ; xmm4 <-- q6*r+q4 + vfmadd213sd xmm4,xmm5,L_tan_q2 ; xmm4 <-- (q6*r+q4)*r+q2 + vfmadd213sd xmm4,xmm5,L_tan_q0 ; xmm4 <-- q = ((q6*r+q4)*r+q2)*r+q0 + vdivsd xmm2,xmm2,xmm4 ; xmm2 <-- p/q + vmulsd xmm2,xmm2,xmm5 ; xmm2 <-- r*p/q + vfmadd213sd xmm2,xmm0,xmm1 ; xmm2 <-- t2 = xx + x*r*(p/q) + vaddsd xmm1,xmm0,xmm2 ; xmm1 <-- t = (t1=x) + t2 + + ; If |x| > .68 we transformed, and t is an approximation of + ; tan(pi/4 +- (x+xx)) + ; otherwise, t is just tan(x+xx) + vxorpd xmm6,xmm6,xmm6 + vcomisd xmm7,xmm6 ; did we transform? (|x| > .68) ? + jz Ltan_fma3_if_recip_set ; if not, go check recip + +Ltan_fma3_if_transfor_set: + ; Because we transformed x+xx, we have to transform t before returning + ; let transform be 1 for x > .68, -1 for x < -.68, then we return + ; transform * (recip ? (2.*t/(t-1.) - 1.) : (1. 
- 2.*t/(1.+t))) + vaddsd xmm6,xmm1,xmm1 ; xmm6 <-- 2.*t + vmovsd xmm4,L_one + vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1 + vsubsd xmm5,xmm1,xmm4 ; xmm5 <-- t-1 + bt rax,0 + jc Ltan_fma3_transform_and_recip_set + ; here recip is not set + vaddsd xmm2,xmm1,xmm4 ; xmm2 <-- t+1 + vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t+1) + vfnmadd132sd xmm2,xmm4,L_two ; xmm2 <-- 1 - 2*t/(t+1) + vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(1 - 2*t/(t+1)) + jmp Ltan_fma3_exit_piby4 +ALIGN 16 +Ltan_fma3_transform_and_recip_set: + ; here recip is set + vsubsd xmm2,xmm1,xmm4 ; xmm2 <-- t-1 + vdivsd xmm2,xmm1,xmm2 ; xmm2 <-- t/(t-1) + vfmsub132sd xmm2,xmm4,L_two ; xmm2 <-- 2*t/(t-1) - 1 + vmulsd xmm1,xmm2,xmm7 ; xmm1 <-- transform*(2*t/(t-1) - 1) + jmp Ltan_fma3_exit_piby4 + +ALIGN 16 +Ltan_fma3_if_recip_set: + ; Here we did not transform x and xx, but if we are in an odd quadrant + ; we will need to return -1./(t1+t2), computed accurately + ; (t=t1 is in xmm1, t2 is in xmm2) + bt rax,0 + jnc Ltan_fma3_exit_piby4 + + vandpd xmm7,xmm1,L_half_mask ; xmm7 <-- z1 = high bits of t + vsubsd xmm4,xmm7,xmm0 ; xmm4 <-- z1 - t1 + vsubsd xmm4,xmm2,xmm4 ; xmm4 <-- z2 = t2 - (z1-t1) + vmovsd xmm2,L_n_one + vdivsd xmm2,xmm2,xmm1 ; xmm2 <-- trec = -1./t + vandpd xmm5,xmm2,L_half_mask ; xmm5 <-- trec_top=high bits of trec + vfmadd213sd xmm7,xmm5,L_one ; xmm7 <-- trec_top*z1 + 1. + vfmadd231sd xmm7 ,xmm4,xmm5 ; xmm7 <-- z2*trec_top + (trec_top*z1 + 1.) + vfmadd213sd xmm7,xmm2,xmm5 ; xmm7 <-- u = trec_top + trec*(z2*trec_top + (trec_top*z1+1.)) + vmovapd xmm1,xmm7 ; xmm1 <-- u + +Ltan_fma3_exit_piby4: + vmovapd xmm0,xmm1 ; xmm0 <-- t, u, or v, as needed + + vmovapd xmm1,[store_input + rsp] + vandpd xmm1,xmm1,L_signbit + vxorpd xmm0,xmm0,xmm1 ; tan(-x) = -tan(x) + + ; restore volatile registers + AVXRestoreXmm xmm7, save_xmm7 + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +ALIGN 16 +Ltan_fma3_remainder_piby2: + ; argument reduction for general x + + call __remainder_piby2_fma3 + jmp Ltan_fma3_full_computation + + +Ltan_fma3_naninf: ; here argument is +-Inf or NaN. Special case. + call fname_special + AVXRestoreXmm xmm7, save_xmm7 + AVXRestoreXmm xmm6, save_xmm6 + StackDeallocate stack_size + ret + +fname endp + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.const +tan_piby4_save_xmm6 EQU 030h +tan_piby4_stack_size EQU 048h +.code +ALIGN 16 +_tan_piby4 PROC PRIVATE FRAME + StackAllocate tan_piby4_stack_size + SaveXmm xmm6, tan_piby4_save_xmm6 + .ENDPROLOG + + ; Compute tangent for x+xx in [-pi/4,pi/4]. + ; xmm0 has x + ; xmm1 has xx + ; r8d has recip. If recip is true, return -1/tan(x+xx) else tan(x+xx) + + xor eax, eax + + comisd xmm0, QWORD PTR L_point_68 + movaps xmm3, xmm1 + movaps xmm6, xmm0 + jbe Ltan_piby4_x_le_point_68 + + ; Here x > .68, so we transform x using the identity + ; tan(pi/4-x) = (1-tan(x))/(1+tan(x)) + + movsd xmm2, QWORD PTR L_piby4_lead + mov eax, 1 ; eax <-- transform = 1 + subsd xmm2, xmm0 ; xmm2 <-- x = piby4_lead - x + movsd xmm0, QWORD PTR L_piby4_tail + subsd xmm0, xmm1 ; xmm0 <-- xl = piby4_tail - xx + movaps xmm6, xmm2 + addsd xmm6, xmm0 ; xmm6 <-- x = x + xl + xorps xmm3,xmm3 ; xmm3 <-- xx = 0. 
+ jmp Ltan_piby4_do_remez + +Ltan_piby4_x_le_point_68: +; 43 : else if (x < -0.68) + + movsd xmm0, QWORD PTR L_n_point_68 + comisd xmm0, xmm6 + jbe Ltan_piby4_do_remez ; jump if x >= -.68 + + ; Here x < -.68, so we transform x using the identity + ; tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) + + addsd xmm6, QWORD PTR L_piby4_lead ; xmm6 <-- x = piby4_lead + x + addsd xmm3, QWORD PTR L_piby4_tail ; xmm3 <-- xl = piby4_tail + xx + or eax, -1 ; eax <-- transform = -1 + addsd xmm6, xmm3 ; xmm6 <-- x = x + xl + xorps xmm3, xmm3 ; xmm3 <-- xx = 0 + +Ltan_piby4_do_remez: + + ; Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. + movaps xmm0, xmm6 + movaps xmm2, xmm6; +; An implementation of the tan function. +; +; Prototype: +; +; double tan(double x); +; +; Computes tan(x). +; It will provide proper C99 return values, +; but may not raise floating point status bits properly. +; Based on the NAG C implementation. +; +; + + mulsd xmm0, xmm6 ; xmm0 <-- x*x + addsd xmm2, xmm2 ; xmm2 <-- 2*x + mulsd xmm2, xmm3 ; xmm2 <-- 2*x*xx + addsd xmm2, xmm0 ; xmm2 <-- r = x*x + 2*x*xx + + ; Magic Remez approximation + movaps xmm0, xmm2 + movaps xmm5, xmm2 + movaps xmm1, xmm2 + mulsd xmm5, QWORD PTR L_tan_p4 + mulsd xmm1, QWORD PTR L_tan_q6 + mulsd xmm0, xmm6 + addsd xmm5, QWORD PTR L_tan_p2 + mulsd xmm5, xmm2 + addsd xmm5, QWORD PTR L_tan_p0 + mulsd xmm5, xmm0 + movsd xmm0, QWORD PTR L_tan_q4 + addsd xmm0, xmm1 + mulsd xmm0, xmm2 + addsd xmm0, QWORD PTR L_tan_q2 + mulsd xmm0, xmm2 + addsd xmm0, QWORD PTR L_tan_q0 + divsd xmm5, xmm0 + addsd xmm5, xmm3 ; xmm5 <-- t2 + + test eax, eax + je Ltan_piby4_transform_false + + addsd xmm5, xmm6 ; xmm5 <-- t = t1 + t2 = x + t2 + + test r8d, r8d + je Ltan_piby4_transform_true_recip_false + + ; Here transform and recip are both true. + ; return transform*(2*t/(t-1) - 1.0); + + movaps xmm0, xmm5 + subsd xmm5, QWORD PTR L_one + movd xmm1, eax + addsd xmm0, xmm0 + divsd xmm0, xmm5 + cvtdq2pd xmm1, xmm1 + subsd xmm0, QWORD PTR L_one + mulsd xmm0, xmm1 + RestoreXmm xmm6, tan_piby4_save_xmm6 + StackDeallocate tan_piby4_stack_size + ret 0 + +Ltan_piby4_transform_true_recip_false: + ; Here return transform*(1.0 - 2*t/(1+t)); + movsd xmm0, QWORD PTR L_one + movaps xmm1, xmm5 + addsd xmm5, xmm0 + addsd xmm1, xmm1 + divsd xmm1, xmm5 + subsd xmm0, xmm1 + movd xmm1, eax + cvtdq2pd xmm1, xmm1 + mulsd xmm0, xmm1 + RestoreXmm xmm6, tan_piby4_save_xmm6 + StackDeallocate tan_piby4_stack_size + ret 0 + +Ltan_piby4_transform_false: + test r8d, r8d + je Ltan_piby4_atransform_false_recip_false + + ; Here transform is false but recip is true + ; We return an accurate computation of -1.0/(t1 + t2). 
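+
+ ; The upper halves of t and of -1/t are isolated with the
+ ; ffffffff00000000h mask so that a correction term for the division can be
+ ; computed and added back; this mirrors the recip branch of tan_piby4 in
+ ; tan.c.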
+ + movsd xmm4, QWORD PTR L_n_one + movaps xmm0, xmm5 + mov rcx, -4294967296 ; ffffffff00000000H + addsd xmm0, xmm6 + movd rax, xmm0 ; really movq + divsd xmm4, xmm0 + and rax, rcx + movd xmm3, rax ; really movq + movaps xmm1, xmm3 + subsd xmm1, xmm6 + + movd rax, xmm4 ; really movq + subsd xmm5, xmm1 + + and rax, rcx + movd xmm2, rax ; really movq + + ; return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2); + + movaps xmm0, xmm2 + mulsd xmm5, xmm2 + mulsd xmm0, xmm3 + addsd xmm0, QWORD PTR L_one + addsd xmm0, xmm5 + mulsd xmm0, xmm4 + addsd xmm0, xmm2 + + RestoreXmm xmm6, tan_piby4_save_xmm6 + StackDeallocate tan_piby4_stack_size + ret 0 + +Ltan_piby4_atransform_false_recip_false: + ; Here both transform and recip are false; we just return t1 + t2 + addsd xmm5, xmm6 + movaps xmm0, xmm5 + RestoreXmm xmm6, tan_piby4_save_xmm6 + StackDeallocate tan_piby4_stack_size + ret 0 + +_tan_piby4 endp +END diff --git a/sdk/lib/crt/math/libm_sse2/tan.c b/sdk/lib/crt/math/libm_sse2/tan.c new file mode 100644 index 00000000000..0a86a2ef606 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/tan.c @@ -0,0 +1,242 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_NAN_WITH_FLAGS +#define USE_VAL_WITH_FLAGS +#define USE_HANDLE_ERROR +#include "libm_inlines.h" +#undef USE_NAN_WITH_FLAGS +#undef USE_VAL_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + +/* tan(x + xx) approximation valid on the interval [-pi/4,pi/4]. + If recip is true return -1/tan(x + xx) instead. */ +static inline double tan_piby4(double x, double xx, int recip) +{ + double r, t1, t2, xl; + int transform = 0; + static const double + piby4_lead = 7.85398163397448278999e-01, /* 0x3fe921fb54442d18 */ + piby4_tail = 3.06161699786838240164e-17; /* 0x3c81a62633145c06 */ + + /* In order to maintain relative precision transform using the identity: + tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4. + Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. 
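+
+       Both identities are instances of
+       tan(a-b) = (tan(a) - tan(b))/(1 + tan(a)*tan(b)) with tan(pi/4) = 1,
+       so only the tangent of the small transformed argument has to be
+       approximated by the Remez rational polynomial below.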
*/ + + if (x > 0.68) + { + transform = 1; + x = piby4_lead - x; + xl = piby4_tail - xx; + x += xl; + xx = 0.0; + } + else if (x < -0.68) + { + transform = -1; + x = piby4_lead + x; + xl = piby4_tail + xx; + x += xl; + xx = 0.0; + } + + /* Core Remez [2,3] approximation to tan(x+xx) on the + interval [0,0.68]. */ + + r = x*x + 2.0 * x * xx; + t1 = x; + t2 = xx + x*r* + (0.372379159759792203640806338901e0 + + (-0.229345080057565662883358588111e-1 + + 0.224044448537022097264602535574e-3*r)*r)/ + (0.111713747927937668539901657944e1 + + (-0.515658515729031149329237816945e0 + + (0.260656620398645407524064091208e-1 - + 0.232371494088563558304549252913e-3*r)*r)*r); + + /* Reconstruct tan(x) in the transformed case. */ + + if (transform) + { + double t; + t = t1 + t2; + if (recip) + return transform*(2*t/(t-1) - 1.0); + else + return transform*(1.0 - 2*t/(1+t)); + } + + if (recip) + { + /* Compute -1.0/(t1 + t2) accurately */ + double trec, trec_top, z1, z2, t; + unsigned long u; + t = t1 + t2; + GET_BITS_DP64(t, u); + u &= 0xffffffff00000000; + PUT_BITS_DP64(u, z1); + z2 = t2 - (z1 - t1); + trec = -1.0 / t; + GET_BITS_DP64(trec, u); + u &= 0xffffffff00000000; + PUT_BITS_DP64(u, trec_top); + return trec_top + trec * ((1.0 + trec_top * z1) + trec_top * z2); + + } + else + return t1 + t2; +} + +#pragma function(tan) + +double tan(double x) +{ + double r, rr; + int region, xneg; + + unsigned long ux, ax; + GET_BITS_DP64(x, ux); + ax = (ux & ~SIGNBIT_DP64); + if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */ + { + if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */ + { + if (ax < 0x3e40000000000000) /* abs(x) < 2.0^(-27) */ + { + if (ax == 0x0000000000000000) return x; + else return val_with_flags(x, AMD_F_INEXACT); + } + else + { + /* Using a temporary variable prevents 64-bit VC++ from + rearranging + x + x*x*x*0.333333333333333333; + into + x * (1 + x*x*0.333333333333333333); + The latter results in an incorrectly rounded answer. */ + double tmp; + tmp = x*x*x*0.333333333333333333; + return x + tmp; + } + } + else + return tan_piby4(x, 0.0, 0); + } + else if ((ux & EXPBITS_DP64) == EXPBITS_DP64) + { + /* x is either NaN or infinity */ + if (ux & MANTBITS_DP64) + /* x is NaN */ + return _handle_error("tan", OP_TAN, ux|0x0008000000000000, _DOMAIN, 0, + EDOM, x, 0.0, 1); + else + /* x is infinity. Return a NaN */ + return _handle_error("tan", OP_TAN, INDEFBITPATT_DP64, _DOMAIN, AMD_F_INVALID, + EDOM, x, 0.0, 1); + } + xneg = (ax != ux); + + + if (xneg) + x = -x; + + if (x < 5.0e5) + { + /* For these size arguments we can just carefully subtract the + appropriate multiple of pi/2, using extra precision where + x is close to an exact multiple of pi/2 */ + static const double + twobypi = 6.36619772367581382433e-01, /* 0x3fe45f306dc9c883 */ + piby2_1 = 1.57079632673412561417e+00, /* 0x3ff921fb54400000 */ + piby2_1tail = 6.07710050650619224932e-11, /* 0x3dd0b4611a626331 */ + piby2_2 = 6.07710050630396597660e-11, /* 0x3dd0b4611a600000 */ + piby2_2tail = 2.02226624879595063154e-21, /* 0x3ba3198a2e037073 */ + piby2_3 = 2.02226624871116645580e-21, /* 0x3ba3198a2e000000 */ + piby2_3tail = 8.47842766036889956997e-32; /* 0x397b839a252049c1 */ + double t, rhead, rtail; + int npi2; + unsigned long uy, xexp, expdiff; + xexp = ax >> EXPSHIFTBITS_DP64; + /* How many pi/2 is x a multiple of? 
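+           For |x| <= 9pi/4 the nearest multiple npi2 is found by direct
+           comparison against 3pi/4, 5pi/4 and 7pi/4; for larger |x| it is
+           computed as (int)(x*2/pi + 0.5).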
*/ + if (ax <= 0x400f6a7a2955385e) /* 5pi/4 */ + { + if (ax <= 0x4002d97c7f3321d2) /* 3pi/4 */ + npi2 = 1; + else + npi2 = 2; + } + else if (ax <= 0x401c463abeccb2bb) /* 9pi/4 */ + { + if (ax <= 0x4015fdbbe9bba775) /* 7pi/4 */ + npi2 = 3; + else + npi2 = 4; + } + else + npi2 = (int)(x * twobypi + 0.5); + /* Subtract the multiple from x to get an extra-precision remainder */ + rhead = x - npi2 * piby2_1; + rtail = npi2 * piby2_1tail; + GET_BITS_DP64(rhead, uy); + expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + if (expdiff > 15) + { + /* The remainder is pretty small compared with x, which + implies that x is a near multiple of pi/2 + (x matches the multiple to at least 15 bits) */ + t = rhead; + rtail = npi2 * piby2_2; + rhead = t - rtail; + rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); + if (expdiff > 48) + { + /* x matches a pi/2 multiple to at least 48 bits */ + t = rhead; + rtail = npi2 * piby2_3; + rhead = t - rtail; + rtail = npi2 * piby2_3tail - ((t - rhead) - rtail); + } + } + r = rhead - rtail; + rr = (rhead - r) - rtail; + region = npi2 & 3; + } + else + { + /* Reduce x into range [-pi/4,pi/4] */ + __remainder_piby2(x, &r, &rr, ®ion); + } + + if (xneg) + return -tan_piby4(r, rr, region & 1); + else + return tan_piby4(r, rr, region & 1); +} diff --git a/sdk/lib/crt/math/libm_sse2/tanf.asm b/sdk/lib/crt/math/libm_sse2/tanf.asm new file mode 100644 index 00000000000..9dd421356d7 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/tanf.asm @@ -0,0 +1,551 @@ +; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +; An implementation of the tanf function using the fma3 instruction. +; +; Prototype: +; +; float tanf(float x); +; +; Computes tanf(x). +; It will provide proper C99 return values, +; but may not raise floating point status bits properly. +; Based on the NAG C implementation. +; +.const +ALIGN 16 +L_sign_mask DQ 07FFFFFFFFFFFFFFFh + DQ 07FFFFFFFFFFFFFFFh +L_twobypi DQ 03FE45F306DC9C883h + DQ 03FE45F306DC9C883h +L_int_three DQ 00000000000000003h + DQ 00000000000000003h +L_int_one DQ 00000000000000001h + DQ 00000000000000001h +L_signbit DQ 08000000000000000h + DQ 08000000000000000h + +L_tanf DQ 03FD8A8B0DA56CB17h ; c0 + DQ 0BF919DBA6EFD6AADh ; c1 + DQ 03FF27E84A3E73A2Eh ; d0 + DQ 0BFE07266D7B3511Bh ; d1 + DQ 03F92E29003C692D9h ; d2 + +L_large_x_sse2 DQ 04160000000000000h ; 8388608. 
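+; (arguments whose magnitude reaches L_large_x_sse2/L_large_x_fma3 are sent
+;  to the bit-by-bit 2/pi reduction that uses __L_2_by_pi_bits)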
+L_large_x_fma3 DQ 041E921FB40000000h ; 3.373259264e9 +L_point_333 DQ 03FD5555555555555h +L_mask_3e4 DQ 03e40000000000000h +L_mask_3f2 DQ 03f20000000000000h +L_point_five DQ 03FE0000000000000h +L_piby2_1 DQ 03FF921FB54400000h +L_piby2_1tail DQ 03DD0B4611A626331h +L_piby2_lead DQ 03ff921fb54442d18h +L_n_one DQ 0BFF0000000000000h +L_piby4 DQ 03fe921fb54442d18h +L_min_norm DQ 00010000000000000h + + +L_inf_mask_32 DD 07F800000h + DD 07F800000h + +EXTRN __use_fma3_lib:DWORD +EXTRN __L_2_by_pi_bits:BYTE + +fname TEXTEQU +fname_special TEXTEQU <_tanf_special> + +; define local variable storage offsets +; actually there aren't any, but we need to leave room for _tanf_special. +dummy_space EQU 20h +stack_size EQU 38h + +include fm.inc + +;Define name and any external functions being called +EXTERN fname_special : PROC + +.code +PUBLIC fname +fname PROC FRAME + StackAllocate stack_size + .ENDPROLOG + cmp DWORD PTR __use_fma3_lib, 0 + jne Ltanf_fma3 + +Ltanf_sse2: + movd eax,xmm0 + mov r8d,L_inf_mask_32 + and eax,r8d + cmp eax, r8d + jz Ltanf_sse2_naninf + + cvtss2sd xmm5,xmm0 + movd r9,xmm5 + btr r9,63 ; r9 <-- |x| + + cmp r9,L_piby4 + jg Ltanf_sse2_range_reduce + cmp r9,L_mask_3f2 ; compare to 2^-13 = 0.0001220703125 + jge Ltanf_sse2_compute_tanf_piby_4 + cmp r9,L_mask_3e4 ; compare to 2^-27 = 7.4505805969238281e-009 + jge Ltanf_sse2_compute_x_xxx_0_333 + ; At this point tan(x) ~= x; if it's not exact, set the inexact flag. + + test r9, r9 + je Ltanf_sse2_exact_return + movsd xmm1, L_n_one + addsd xmm1, L_min_norm ; set inexact + +Ltanf_sse2_exact_return: + StackDeallocate stack_size + ret + +ALIGN 16 +Ltanf_sse2_compute_x_xxx_0_333: + movapd xmm2,xmm5 + mulsd xmm2,xmm2 ; xmm2 <-- x^2 + movapd xmm0,xmm2 + mulsd xmm0,xmm5 ; xmm0 <-- x^3 + mulsd xmm0,L_point_333 + addsd xmm0,xmm5 ; x + x*x*x*0.3333333333333333; + jmp Ltanf_sse2_return_s + +ALIGN 16 +Ltanf_sse2_compute_tanf_piby_4: + movapd xmm0,xmm5 ; xmm0 <-- x (as double) + + movapd xmm1,xmm0 + mulsd xmm1,xmm0 ; xmm1 <-- x*x + + movsd xmm3,L_tanf+008h ; xmm3 <-- c1 + mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2 + addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0 + + movsd xmm2,L_tanf+020h ; xmm2 <-- d2 + mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2 + addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1 + mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2 + addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0 + divsd xmm3,xmm2 ; xmm3 <-- c/d + mulsd xmm1,xmm0 ; xmm1 <-- x^3 + mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d + addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d + jmp Ltanf_sse2_return_s + +Ltanf_sse2_range_reduce: + movd xmm0,r9 + cmp r9,L_large_x_sse2 + jge Ltanf_sse2_tanf_reduce_large + +Ltanf_sse2_tanf_reduce_moderate: + movapd xmm1,xmm0 + andpd xmm1,L_sign_mask + movapd xmm2,L_twobypi + mulsd xmm2,xmm1 + addsd xmm2,L_point_five + cvttpd2dq xmm4,xmm2 + cvtdq2pd xmm1,xmm4 + andpd xmm4,L_int_three ; xmm4 <-- region + movapd xmm2,xmm0 + + movapd xmm3,xmm1 + mulsd xmm1,L_piby2_1 + subsd xmm2,xmm1 + mulsd xmm3,L_piby2_1tail ; xmm3 rtail + movapd xmm0,xmm2 + subsd xmm0,xmm3 + subsd xmm2,xmm0 + movapd xmm1,xmm2 + subsd xmm1,xmm3 + jmp Ltanf_sse2_exit_s + +Ltanf_sse2_tanf_reduce_large: + lea r9,__L_2_by_pi_bits + ;xexp = (x >> 52) 1023 + movd r11,xmm0 + mov rcx,r11 + shr r11,52 + sub r11,1023 ; r11 <-- xexp = exponent of input x + ;calculate the last byte from which to start multiplication + ;last = 134 (xexp >> 3) + mov r10,r11 + shr r10,3 + sub r10,134 ; r10 <-- -last + neg r10 ; r10 <-- last + ;load 64 bits of 2_by_pi + mov rax,[r9+r10] + ;mantissa of x = ((x << 12) >> 12) | implied bit + 
shl rcx,12 + shr rcx,12 ; rcx <-- mantissa part of input x + bts rcx,52 ; add the implied bit as well + ;load next 128 bits of 2_by_pi + add r10,8 ; increment to next 8 bytes of 2_by_pi + movdqu xmm0,[r9+r10] + ;do three 64bit multiplications with mant of x + mul rcx + mov r8,rax ; r8 = last 64 bits of mul = res1[2] + mov r10,rdx ; r10 = carry + vmovq rax,xmm0 + mul rcx + ;resexp = xexp & 7 + and r11,7 ; r11 <-- resexp = last 3 bits of xexp + psrldq xmm0,8 + add rax,r10 ; add the previous carry + adc rdx,0 + mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] + mov r10,rdx ; r10 <-- carry + movd rax,xmm0 + mul rcx + add r10,rax ;r10 = most sig 64 bits = res1[0] + ;find the region + ;last three bits ltb = most sig bits >> (54 resexp)) + ; decimal point in last 18 bits == 8 lsb's in first 64 bits + ; and 8 msb's in next 64 bits + ;point_five = ltb & 01h; + ;region = ((ltb >> 1) + point_five) & 3; + mov rcx,54 + mov rax,r10 + sub rcx,r11 + xor rdx,rdx ;rdx = sign of x + shr rax,cl + jnc Ltanf_sse2_no_point_five_f + ;;if there is carry.. then negate the result of multiplication + not r10 + not r9 + not r8 + mov rdx,08000000000000000h +ALIGN 16 +Ltanf_sse2_no_point_five_f: + adc rax,0 + and rax,3 + movd xmm4,eax ; xmm4 <-- region + ;calculate the number of integer bits and zero them out + mov rcx,r11 + add rcx,10 ; rcx = no. of integer bits + shl r10,cl + shr r10,cl ; r10 contains only mant bits + sub rcx,64 ; form the exponent + mov r11,rcx + ;find the highest set bit + bsr rcx,r10 + jnz Ltanf_sse2_form_mantissa_f + mov r10,r9 + mov r9,r8 + mov r8,0 + bsr rcx,r10 ;rcx = hsb + sub r11,64 +ALIGN 16 +Ltanf_sse2_form_mantissa_f: + add r11,rcx ; for exp of x + sub rcx,52 ; rcx = no. of bits to shift in r10 + cmp rcx,0 + jl Ltanf_sse2_hsb_below_52_f + je Ltanf_sse2_form_numbers_f + ;hsb above 52 + mov r8,r10 + shr r10,cl ; r10 = mantissa of x with hsb at 52 + shr r9,cl ; make space for bits from r10 + sub rcx,64 + neg rcx ; rcx = no of bits to shift r10 + shl r8,cl + or r9,r8 ; r9 = mantissa bits of xx + jmp Ltanf_sse2_form_numbers_f + +ALIGN 16 +Ltanf_sse2_hsb_below_52_f: + neg rcx + mov rax,r9 + shl r10,cl + shl r9,cl + sub rcx,64 + neg rcx + shr rax,cl + or r10,rax + shr r8,cl + or r9,r8 +ALIGN 16 +Ltanf_sse2_form_numbers_f: + add r11,1023 + btr r10,52 ; remove the implied bit + mov rcx,r11 + or r10,rdx ; put the sign + shl rcx,52 + or r10,rcx ; x is in r10 + movd xmm0,r10 ; xmm0 <-- x + mulsd xmm0,L_piby2_lead + +Ltanf_sse2_exit_s: + movd eax,xmm4 + and eax,1 ; eax <-- region & 1 + movapd xmm1,xmm0 + mulsd xmm1,xmm0 ; xmm1 <-- x*x + + movsd xmm3,L_tanf+008h ; xmm3 <-- c1 + mulsd xmm3,xmm1 ; xmm3 <-- c1*x^2 + addsd xmm3,L_tanf ; xmm3 <-- c = c1*x^2 + c0 + + movsd xmm2,L_tanf+020h ; xmm2 <-- d2 + mulsd xmm2,xmm1 ; xmm2 <-- d2*x^2 + addsd xmm2,L_tanf+018h ; xmm2 <-- d2*x^2 + d1 + mulsd xmm2,xmm1 ; xmm2 <-- (d2*x^2 + d1)*x^2 + addsd xmm2,L_tanf+010h ; xmm2 <-- d = (d2*x^2 + d1)*x^2 + d0 + divsd xmm3,xmm2 ; xmm3 <-- c/d + mulsd xmm1,xmm0 ; xmm1 <-- x^3 + mulsd xmm1,xmm3 ; xmm1 <-- x^3 * c/d + addsd xmm0,xmm1 ; xmm0 <-- x + x^3 * c/d + cmp eax,01h + jne Ltanf_sse2_exit_tanpiby4 +Ltanf_sse2_recip : + movd xmm3,L_n_one + divsd xmm3,xmm0 + movsd xmm0,xmm3 +Ltanf_sse2_exit_tanpiby4 : + andpd xmm5,L_signbit + xorpd xmm0,xmm5 + +Ltanf_sse2_return_s: + cvtsd2ss xmm0,xmm0 +Ltanf_sse2_return_c: + StackDeallocate stack_size + ret + +Ltanf_sse2_naninf: + call fname_special + StackDeallocate stack_size + ret + +ALIGN 16 +Ltanf_fma3: + vmovd eax,xmm0 + mov r8d,L_inf_mask_32 + and eax,r8d + cmp eax, r8d + jz 
Ltanf_fma3_naninf + + vcvtss2sd xmm5,xmm0,xmm0 + vmovq r9,xmm5 + btr r9,63 ; r9 <-- |x| + + cmp r9,L_piby4 + jg Ltanf_fma3_range_reduce + cmp r9,L_mask_3f2 + jge Ltanf_fma3_compute_tanf_piby_4 + cmp r9,L_mask_3e4 + jge Ltanf_fma3_compute_x_xxx_0_333 + jmp Ltanf_fma3_return_c + +Ltanf_fma3_compute_x_xxx_0_333: + vmulsd xmm2,xmm5,xmm5 + vmulsd xmm0,xmm2,xmm5 + vfmadd132sd xmm0,xmm5,L_point_333 ; x + x*x*x*0.3333333333333333; + jmp Ltanf_fma3_return_s + +Ltanf_fma3_compute_tanf_piby_4: + vmovsd xmm0,xmm5,xmm5 + vmulsd xmm1,xmm0,xmm0 + vmovsd xmm3,L_tanf+008h + vfmadd213sd xmm3,xmm1,L_tanf + vmovsd xmm2,L_tanf+020h + vfmadd213sd xmm2,xmm1,L_tanf+018h + vfmadd213sd xmm2,xmm1,L_tanf+010h + vdivsd xmm3,xmm3,xmm2 + vmulsd xmm1,xmm1,xmm0 + vfmadd231sd xmm0,xmm1,xmm3 + jmp Ltanf_fma3_return_s + +Ltanf_fma3_range_reduce: + vmovq xmm0,r9 + cmp r9,L_large_x_fma3 + jge Ltanf_fma3_tanf_reduce_large + +Ltanf_fma3_tanf_reduce_moderate: + vandpd xmm1,xmm0,L_sign_mask + vmovapd xmm2,L_twobypi + vfmadd213sd xmm2,xmm1,L_point_five + vcvttpd2dq xmm2,xmm2 + vpmovsxdq xmm1,xmm2 + vandpd xmm4,xmm1,L_int_three ; xmm4 <-- region + vshufps xmm1 ,xmm1,xmm1,8 + vcvtdq2pd xmm1,xmm1 + vmovdqa xmm2,xmm0 + vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead + vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail + vsubsd xmm0,xmm2,xmm3 + vsubsd xmm2,xmm2,xmm0 + vsubsd xmm1,xmm2,xmm3 + jmp Ltanf_fma3_exit_s + +Ltanf_fma3_tanf_reduce_large: + lea r9,__L_2_by_pi_bits + ;xexp = (x >> 52) 1023 + vmovq r11,xmm0 + mov rcx,r11 + shr r11,52 + sub r11,1023 ; r11 <-- xexp = exponent of input x + ;calculate the last byte from which to start multiplication + ;last = 134 (xexp >> 3) + mov r10,r11 + shr r10,3 + sub r10,134 ; r10 <-- -last + neg r10 ; r10 <-- last + ;load 64 bits of 2_by_pi + mov rax,[r9+r10] + ;mantissa of x = ((x << 12) >> 12) | implied bit + shl rcx,12 + shr rcx,12 ; rcx <-- mantissa part of input x + bts rcx,52 ; add the implied bit as well + ;load next 128 bits of 2_by_pi + add r10,8 ; increment to next 8 bytes of 2_by_pi + vmovdqu xmm0,XMMWORD PTR[r9+r10] + ;do three 64bit multiplications with mant of x + mul rcx + mov r8,rax ; r8 = last 64 bits of mul = res1[2] + mov r10,rdx ; r10 = carry + vmovq rax,xmm0 + mul rcx + ;resexp = xexp & 7 + and r11,7 ; r11 <-- resexp = last 3 bits of xexp + vpsrldq xmm0,xmm0,8 + add rax,r10 ; add the previous carry + adc rdx,0 + mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] + mov r10,rdx ; r10 <-- carry + vmovq rax,xmm0 + mul rcx + add r10,rax ;r10 = most sig 64 bits = res1[0] + ;find the region + ;last three bits ltb = most sig bits >> (54 resexp)) + ; decimal point in last 18 bits == 8 lsb's in first 64 bits + ; and 8 msb's in next 64 bits + ;point_five = ltb & 01h; + ;region = ((ltb >> 1) + point_five) & 3; + mov rcx,54 + mov rax,r10 + sub rcx,r11 + xor rdx,rdx ;rdx = sign of x + shr rax,cl + jnc Ltanf_fma3_no_point_five_f + ;;if there is carry.. then negate the result of multiplication + not r10 + not r9 + not r8 + mov rdx,08000000000000000h +ALIGN 16 +Ltanf_fma3_no_point_five_f: + adc rax,0 + and rax,3 + vmovd xmm4,eax ; xmm4 <-- region + ;calculate the number of integer bits and zero them out + mov rcx,r11 + add rcx,10 ; rcx = no. of integer bits + shl r10,cl + shr r10,cl ; r10 contains only mant bits + sub rcx,64 ; form the exponent + mov r11,rcx + ;find the highest set bit + bsr rcx,r10 + jnz Ltanf_fma3_form_mantissa_f + mov r10,r9 + mov r9,r8 + mov r8,0 + bsr rcx,r10 ;rcx = hsb + sub r11,64 +ALIGN 16 +Ltanf_fma3_form_mantissa_f: + add r11,rcx ; for exp of x + sub rcx,52 ; rcx = no. 
of bits to shift in r10 + cmp rcx,0 + jl Ltanf_fma3_hsb_below_52_f + je Ltanf_fma3_form_numbers_f + ;hsb above 52 + mov r8,r10 + shr r10,cl ; r10 = mantissa of x with hsb at 52 + shr r9,cl ; make space for bits from r10 + sub rcx,64 + neg rcx ; rcx = no of bits to shift r10 + shl r8,cl + or r9,r8 ; r9 = mantissa bits of xx + jmp Ltanf_fma3_form_numbers_f + +ALIGN 16 +Ltanf_fma3_hsb_below_52_f: + neg rcx + mov rax,r9 + shl r10,cl + shl r9,cl + sub rcx,64 + neg rcx + shr rax,cl + or r10,rax + shr r8,cl + or r9,r8 +ALIGN 16 +Ltanf_fma3_form_numbers_f: + add r11,1023 + btr r10,52 ; remove the implied bit + mov rcx,r11 + or r10,rdx ; put the sign + shl rcx,52 + or r10,rcx ; x is in r10 + vmovq xmm0,r10 ; xmm0 <-- x + vmulsd xmm0,xmm0,L_piby2_lead + +Ltanf_fma3_exit_s: + vandpd xmm2,xmm4,XMMWORD PTR L_int_one + vmovd eax,xmm2 + vmulsd xmm1,xmm0,xmm0 + vmovsd xmm3,L_tanf+008h + vfmadd213sd xmm3,xmm1,L_tanf + vmovsd xmm2,L_tanf+020h + vfmadd213sd xmm2,xmm1,L_tanf+018h + vfmadd213sd xmm2,xmm1,L_tanf+010h + vdivsd xmm3,xmm3,xmm2 + vmulsd xmm1,xmm1,xmm0 + vfmadd231sd xmm0,xmm1,xmm3 + cmp eax,01h + je Ltanf_fma3_recip + jmp Ltanf_fma3_exit_tanpiby4 + +Ltanf_fma3_recip : + vmovq xmm3,L_n_one + vdivsd xmm0,xmm3,xmm0 + +Ltanf_fma3_exit_tanpiby4 : + vandpd xmm5,xmm5,L_signbit + vxorpd xmm0,xmm0,xmm5 + +Ltanf_fma3_return_s: + vcvtsd2ss xmm0,xmm0,xmm0 +Ltanf_fma3_return_c: + StackDeallocate stack_size + ret + +Ltanf_fma3_naninf: + call fname_special + StackDeallocate stack_size + ret + +fname endp +END diff --git a/sdk/lib/crt/math/libm_sse2/tanf.c b/sdk/lib/crt/math/libm_sse2/tanf.c new file mode 100644 index 00000000000..8a86a2d2b73 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/tanf.c @@ -0,0 +1,193 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_REMAINDER_PIBY2F_INLINE +#define USE_VALF_WITH_FLAGS +#define USE_NANF_WITH_FLAGS +#define USE_HANDLE_ERRORF +#include "libm_inlines.h" +#undef USE_VALF_WITH_FLAGS +#undef USE_NANF_WITH_FLAGS +#undef USE_REMAINDER_PIBY2F_INLINE +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. 
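+// #pragma function(tanf) below disables the compiler's tanf intrinsic so
+// that this source-level definition can be provided and actually called.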
+#pragma warning(disable:4163) +#pragma function(tanf) + +/* tan(x) approximation valid on the interval [-pi/4,pi/4]. + If recip is true return -1/tan(x) instead. */ +static inline double tanf_piby4(double x, int recip) +{ + double r, t; + + /* Core Remez [1,2] approximation to tan(x) on the + interval [0,pi/4]. */ + r = x*x; + t = x + x*r* + (0.385296071263995406715129e0 - + 0.172032480471481694693109e-1 * r) / + (0.115588821434688393452299e+1 + + (-0.51396505478854532132342e0 + + 0.1844239256901656082986661e-1 * r) * r); + + if (recip) + return -1.0 / t; + else + return t; +} + + +float tanf(float x) +{ + double r, dx; + int region, xneg; + + unsigned long ux, ax; + + dx = x; + + GET_BITS_DP64(dx, ux); + ax = (ux & ~SIGNBIT_DP64); + + if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */ + { + if (ax < 0x3f80000000000000) /* abs(x) < 2.0^(-7) */ + { + if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */ + { + if (ax == 0x0000000000000000) + return x; + else + return valf_with_flags(x, AMD_F_INEXACT); + } + else + return (float)(dx + dx*dx*dx*0.333333333333333333); + } + else + return (float)tanf_piby4(x, 0); + } + else if ((ux & EXPBITS_DP64) == EXPBITS_DP64) + { + /* x is either NaN or infinity */ + if (ux & MANTBITS_DP64) + { + /* x is NaN */ + unsigned int ufx; + GET_BITS_SP32(x, ufx); + return _handle_errorf("tanf", OP_TAN, ufx|0x00400000, _DOMAIN, 0, + EDOM, x, 0.0F, 1); + } + else + { + /* x is infinity. Return a NaN */ + return _handle_errorf("tanf", OP_TAN, INDEFBITPATT_SP32, _DOMAIN, AMD_F_INVALID, + EDOM, x, 0.0F, 1); + } + } + + xneg = (int)(ux >> 63); + + if (xneg) + dx = -dx; + + if (dx < 5.0e5) + { + /* For these size arguments we can just carefully subtract the + appropriate multiple of pi/2, using extra precision where + dx is close to an exact multiple of pi/2 */ + static const double + twobypi = 6.36619772367581382433e-01, /* 0x3fe45f306dc9c883 */ + piby2_1 = 1.57079632673412561417e+00, /* 0x3ff921fb54400000 */ + piby2_1tail = 6.07710050650619224932e-11, /* 0x3dd0b4611a626331 */ + piby2_2 = 6.07710050630396597660e-11, /* 0x3dd0b4611a600000 */ + piby2_2tail = 2.02226624879595063154e-21, /* 0x3ba3198a2e037073 */ + piby2_3 = 2.02226624871116645580e-21, /* 0x3ba3198a2e000000 */ + piby2_3tail = 8.47842766036889956997e-32; /* 0x397b839a252049c1 */ + double t, rhead, rtail; + int npi2; + unsigned long uy, xexp, expdiff; + xexp = ax >> EXPSHIFTBITS_DP64; + /* How many pi/2 is dx a multiple of? 
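+           For |dx| up to 9*pi/4, npi2 is found below by comparing ax
+           against the bit patterns of 3pi/4, 5pi/4, 7pi/4 and 9pi/4; for
+           larger (but still moderate) dx it is computed directly as
+           (int)(dx * twobypi + 0.5).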
*/ + if (ax <= 0x400f6a7a2955385e) /* 5pi/4 */ + { + if (ax <= 0x4002d97c7f3321d2) /* 3pi/4 */ + npi2 = 1; + else + npi2 = 2; + } + else if (ax <= 0x401c463abeccb2bb) /* 9pi/4 */ + { + if (ax <= 0x4015fdbbe9bba775) /* 7pi/4 */ + npi2 = 3; + else + npi2 = 4; + } + else + npi2 = (int)(dx * twobypi + 0.5); + /* Subtract the multiple from dx to get an extra-precision remainder */ + rhead = dx - npi2 * piby2_1; + rtail = npi2 * piby2_1tail; + GET_BITS_DP64(rhead, uy); + expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); + if (expdiff > 15) + { + /* The remainder is pretty small compared with dx, which + implies that dx is a near multiple of pi/2 + (dx matches the multiple to at least 15 bits) */ + t = rhead; + rtail = npi2 * piby2_2; + rhead = t - rtail; + rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); + if (expdiff > 48) + { + /* dx matches a pi/2 multiple to at least 48 bits */ + t = rhead; + rtail = npi2 * piby2_3; + rhead = t - rtail; + rtail = npi2 * piby2_3tail - ((t - rhead) - rtail); + } + } + r = rhead - rtail; + region = npi2 & 3; + } + else + { + /* Reduce x into range [-pi/4,pi/4] */ + __remainder_piby2f_inline(ax, &r, ®ion); + } + + if (xneg) + return (float)-tanf_piby4(r, region & 1); + else + return (float)tanf_piby4(r, region & 1); +} diff --git a/sdk/lib/crt/math/libm_sse2/tanh.c b/sdk/lib/crt/math/libm_sse2/tanh.c new file mode 100644 index 00000000000..49385275c26 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/tanh.c @@ -0,0 +1,137 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_HANDLE_ERROR +#define USE_SPLITEXP +#define USE_SCALEDOUBLE_2 +#define USE_VAL_WITH_FLAGS +#include "libm_inlines.h" +#undef USE_SPLITEXP +#undef USE_SCALEDOUBLE_2 +#undef USE_VAL_WITH_FLAGS +#undef USE_HANDLE_ERROR + +#include "libm_errno.h" + + +#pragma function(tanh) +double tanh(double x) +{ + /* + The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + to the following three formulae: + 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + 2. (1 - (2/(exp(2*x) + 1 ))) + 3. (exp(2*x) - 1)/(exp(2*x) + 1) + but computationally, some formulae are better on some ranges. 
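+    Below, formula 2 is the one actually evaluated for moderate arguments,
+    since it needs only a single call of the exp kernel.  For |x| above
+    large_threshold the term 2/(exp(2*x)+1) is already smaller than half an
+    ulp of 1.0, so tanh(x) is simply +/-1.0, and for |x| <= 1 a rational
+    (Remez) approximation is used instead of any exponential.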
+ */ + static const double + thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */ + log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */ + log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */ + large_threshold = 20.0; /* 0x4034000000000000 */ + + unsigned long ux, aux, xneg; + double y, z, p, z1, z2; + int m; + + /* Special cases */ + + GET_BITS_DP64(x, ux); + aux = ux & ~SIGNBIT_DP64; + if (aux < 0x3e30000000000000) /* |x| small enough that tanh(x) = x */ + { + if (aux == 0) + return x; /* with no inexact */ + else + return val_with_flags(x, AMD_F_INEXACT); + } + else if (aux > 0x7ff0000000000000) /* |x| is NaN */ + return _handle_error("tanh", OP_TANH, ux|0x0008000000000000, _DOMAIN, + 0, EDOM, x, 0.0, 1); +// return x + x; + + xneg = (aux != ux); + + y = x; + if (xneg) y = -x; + + if (y > large_threshold) + { + /* If x is large then exp(-x) is negligible and + formula 1 reduces to plus or minus 1.0 */ + z = 1.0; + } + else if (y <= 1.0) + { + double y2; + y2 = y*y; + if (y < 0.9) + { + /* Use a [3,3] Remez approximation on [0,0.9]. */ + z = y + y*y2* + (-0.274030424656179760118928e0 + + (-0.176016349003044679402273e-1 + + (-0.200047621071909498730453e-3 - + 0.142077926378834722618091e-7*y2)*y2)*y2)/ + (0.822091273968539282568011e0 + + (0.381641414288328849317962e0 + + (0.201562166026937652780575e-1 + + 0.2091140262529164482568557e-3*y2)*y2)*y2); + } + else + { + /* Use a [3,3] Remez approximation on [0.9,1]. */ + z = y + y*y2* + (-0.227793870659088295252442e0 + + (-0.146173047288731678404066e-1 + + (-0.165597043903549960486816e-3 - + 0.115475878996143396378318e-7*y2)*y2)*y2)/ + (0.683381611977295894959554e0 + + (0.317204558977294374244770e0 + + (0.167358775461896562588695e-1 + + 0.173076050126225961768710e-3*y2)*y2)*y2); + } + } + else + { + /* Compute p = exp(2*y) + 1. The code is basically inlined + from exp_amd. */ + + splitexp(2*y, 1.0, thirtytwo_by_log2, log2_by_32_lead, + log2_by_32_tail, &m, &z1, &z2); + p = scaleDouble_2(z1 + z2, m) + 1.0; + + /* Now reconstruct tanh from p. */ + z = (1.0 - 2.0/p); + } + + if (xneg) z = - z; + return z; +} diff --git a/sdk/lib/crt/math/libm_sse2/tanhf.c b/sdk/lib/crt/math/libm_sse2/tanhf.c new file mode 100644 index 00000000000..ab0ddc70a4a --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/tanhf.c @@ -0,0 +1,136 @@ + +/******************************************************************************* +MIT License +----------- + +Copyright (c) 2002-2019 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this Software and associated documentaon files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "libm.h" +#include "libm_util.h" + +#define USE_HANDLE_ERRORF +#define USE_SPLITEXPF +#define USE_SCALEFLOAT_2 +#define USE_VALF_WITH_FLAGS +#include "libm_inlines.h" +#undef USE_SPLITEXPF +#undef USE_SCALEFLOAT_2 +#undef USE_VALF_WITH_FLAGS +#undef USE_HANDLE_ERRORF + +#include "libm_errno.h" + +// Disable "C4163: not available as intrinsic function" warning that older +// compilers may issue here. +#pragma warning(disable:4163) +#pragma function(tanhf) + +float tanhf(float x) +{ + /* + The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + to the following three formulae: + 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + 2. (1 - (2/(exp(2*x) + 1 ))) + 3. (exp(2*x) - 1)/(exp(2*x) + 1) + but computationally, some formulae are better on some ranges. + */ + static const float + thirtytwo_by_log2 = 4.6166240692e+01F, /* 0x4238aa3b */ + log2_by_32_lead = 2.1659851074e-02F, /* 0x3cb17000 */ + log2_by_32_tail = 9.9831822808e-07F, /* 0x3585fdf4 */ + large_threshold = 10.0F; /* 0x41200000 */ + + unsigned int ux, aux; + float y, z, p, z1, z2, xneg; + int m; + + /* Special cases */ + + GET_BITS_SP32(x, ux); + aux = ux & ~SIGNBIT_SP32; + if (aux < 0x39000000) /* |x| small enough that tanh(x) = x */ + { + if (aux == 0) + return x; /* with no inexact */ + else + return valf_with_flags(x, AMD_F_INEXACT); + } + else if (aux > 0x7f800000) /* |x| is NaN */ + { + unsigned int ufx; + GET_BITS_SP32(x, ufx); + return _handle_errorf("tanhf", OP_TANH, ufx|0x00400000, _DOMAIN, 0, + EDOM, x, 0.0F, 1); + } +// return x + x; + + xneg = 1.0F - 2.0F * (aux != ux); + + y = xneg * x; + + if (y > large_threshold) + { + /* If x is large then exp(-x) is negligible and + formula 1 reduces to plus or minus 1.0 */ + z = 1.0F; + } + else if (y <= 1.0F) + { + float y2; + y2 = y*y; + + if (y < 0.9F) + { + /* Use a [2,1] Remez approximation on [0,0.9]. */ + z = y + y*y2* + (-0.28192806108402678e0F + + (-0.14628356048797849e-2F + + 0.4891631088530669873e-4F*y2)*y2)/ + (0.845784192581041099e0F + + 0.3427017942262751343e0F*y2); + } + else + { + /* Use a [2,1] Remez approximation on [0.9,1]. */ + z = y + y*y2* + (-0.24069858695196524e0F + + (-0.12325644183611929e-2F + + 0.3827534993599483396e-4F*y2)*y2)/ + (0.72209738473684982e0F + + 0.292529068698052819e0F*y2); + } + } + else + { + /* Compute p = exp(2*y) + 1. The code is basically inlined + from exp_amd. */ + + splitexpf(2*y, 1.0F, thirtytwo_by_log2, log2_by_32_lead, + log2_by_32_tail, &m, &z1, &z2); + p = scaleFloat_2(z1 + z2, m) + 1.0F; + /* Now reconstruct tanh from p. */ + z = (1.0F - 2.0F/p); + } + + return xneg * z; +} diff --git a/sdk/lib/crt/math/libm_sse2/two_to_jby64_head_tail_table.asm b/sdk/lib/crt/math/libm_sse2/two_to_jby64_head_tail_table.asm new file mode 100644 index 00000000000..7b95c6e91b8 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/two_to_jby64_head_tail_table.asm @@ -0,0 +1,165 @@ +;; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 
+; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. +; +;; Defines __two_to_jby64_head_table and __two_to_jby64_tail_table tables +;; Used in exp and pow +;; + +.const + +ALIGN 16 +PUBLIC __two_to_jby64_head_table +__two_to_jby64_head_table DQ 3ff0000000000000h + DQ 3ff02c9a30000000h + DQ 3ff059b0d0000000h + DQ 3ff0874510000000h + DQ 3ff0b55860000000h + DQ 3ff0e3ec30000000h + DQ 3ff11301d0000000h + DQ 3ff1429aa0000000h + DQ 3ff172b830000000h + DQ 3ff1a35be0000000h + DQ 3ff1d48730000000h + DQ 3ff2063b80000000h + DQ 3ff2387a60000000h + DQ 3ff26b4560000000h + DQ 3ff29e9df0000000h + DQ 3ff2d285a0000000h + DQ 3ff306fe00000000h + DQ 3ff33c08b0000000h + DQ 3ff371a730000000h + DQ 3ff3a7db30000000h + DQ 3ff3dea640000000h + DQ 3ff4160a20000000h + DQ 3ff44e0860000000h + DQ 3ff486a2b0000000h + DQ 3ff4bfdad0000000h + DQ 3ff4f9b270000000h + DQ 3ff5342b50000000h + DQ 3ff56f4730000000h + DQ 3ff5ab07d0000000h + DQ 3ff5e76f10000000h + DQ 3ff6247eb0000000h + DQ 3ff6623880000000h + DQ 3ff6a09e60000000h + DQ 3ff6dfb230000000h + DQ 3ff71f75e0000000h + DQ 3ff75feb50000000h + DQ 3ff7a11470000000h + DQ 3ff7e2f330000000h + DQ 3ff8258990000000h + DQ 3ff868d990000000h + DQ 3ff8ace540000000h + DQ 3ff8f1ae90000000h + DQ 3ff93737b0000000h + DQ 3ff97d8290000000h + DQ 3ff9c49180000000h + DQ 3ffa0c6670000000h + DQ 3ffa5503b0000000h + DQ 3ffa9e6b50000000h + DQ 3ffae89f90000000h + DQ 3ffb33a2b0000000h + DQ 3ffb7f76f0000000h + DQ 3ffbcc1e90000000h + DQ 3ffc199bd0000000h + DQ 3ffc67f120000000h + DQ 3ffcb720d0000000h + DQ 3ffd072d40000000h + DQ 3ffd5818d0000000h + DQ 3ffda9e600000000h + DQ 3ffdfc9730000000h + DQ 3ffe502ee0000000h + DQ 3ffea4afa0000000h + DQ 3ffefa1be0000000h + DQ 3fff507650000000h + DQ 3fffa7c180000000h + +ALIGN 16 +PUBLIC __two_to_jby64_tail_table +__two_to_jby64_tail_table DQ 0000000000000000h + DQ 3e6cef00c1dcdef9h + DQ 3e48ac2ba1d73e2ah + DQ 3e60eb37901186beh + DQ 3e69f3121ec53172h + DQ 3e469e8d10103a17h + DQ 3df25b50a4ebbf1ah + DQ 3e6d525bbf668203h + DQ 3e68faa2f5b9bef9h + DQ 3e66df96ea796d31h + DQ 3e368b9aa7805b80h + DQ 3e60c519ac771dd6h + DQ 3e6ceac470cd83f5h + DQ 3e5789f37495e99ch + DQ 3e547f7b84b09745h + DQ 3e5b900c2d002475h + DQ 3e64636e2a5bd1abh + DQ 3e4320b7fa64e430h + DQ 3e5ceaa72a9c5154h + DQ 3e53967fdba86f24h + DQ 3e682468446b6824h + DQ 3e3f72e29f84325bh + DQ 3e18624b40c4dbd0h + DQ 3e5704f3404f068eh + DQ 3e54d8a89c750e5eh + DQ 3e5a74b29ab4cf62h + DQ 3e5a753e077c2a0fh + DQ 3e5ad49f699bb2c0h + DQ 3e6a90a852b19260h + DQ 3e56b48521ba6f93h + DQ 3e0d2ac258f87d03h + DQ 3e42a91124893ecfh + DQ 
3e59fcef32422cbeh + DQ 3e68ca345de441c5h + DQ 3e61d8bee7ba46e1h + DQ 3e59099f22fdba6ah + DQ 3e4f580c36bea881h + DQ 3e5b3d398841740ah + DQ 3e62999c25159f11h + DQ 3e668925d901c83bh + DQ 3e415506dadd3e2ah + DQ 3e622aee6c57304eh + DQ 3e29b8bc9e8a0387h + DQ 3e6fbc9c9f173d24h + DQ 3e451f8480e3e235h + DQ 3e66bbcac96535b5h + DQ 3e41f12ae45a1224h + DQ 3e55e7f6fd0fac90h + DQ 3e62b5a75abd0e69h + DQ 3e609e2bf5ed7fa1h + DQ 3e47daf237553d84h + DQ 3e12f074891ee83dh + DQ 3e6b0aa538444196h + DQ 3e6cafa29694426fh + DQ 3e69df20d22a0797h + DQ 3e640f12f71a1e45h + DQ 3e69f7490e4bb40bh + DQ 3e4ed9942b84600dh + DQ 3e4bdcdaf5cb4656h + DQ 3e5e2cffd89cf44ch + DQ 3e452486cc2c7b9dh + DQ 3e6cc2b44eee3fa4h + DQ 3e66dc8a80ce9f09h + DQ 3e39e90d82e90a7eh +END diff --git a/sdk/lib/crt/math/libm_sse2/two_to_jby64_table.asm b/sdk/lib/crt/math/libm_sse2/two_to_jby64_table.asm new file mode 100644 index 00000000000..fa9519bc7e4 --- /dev/null +++ b/sdk/lib/crt/math/libm_sse2/two_to_jby64_table.asm @@ -0,0 +1,99 @@ +;; +; +; MIT License +; ----------- +; +; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. +; +; Permission is hereby granted, free of charge, to any person obtaining a copy +; of this Software and associated documentaon files (the "Software"), to deal +; in the Software without restriction, including without limitation the rights +; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +; copies of the Software, and to permit persons to whom the Software is +; furnished to do so, subject to the following conditions: +; +; The above copyright notice and this permission notice shall be included in +; all copies or substantial portions of the Software. +; +; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +; THE SOFTWARE. 
+; +;; Defines __two_to_jby64_table table +;; Used by exp and expf +;; + +.const + +ALIGN 16 +PUBLIC __two_to_jby64_table +__two_to_jby64_table DQ 3ff0000000000000h + DQ 3ff02c9a3e778061h + DQ 3ff059b0d3158574h + DQ 3ff0874518759bc8h + DQ 3ff0b5586cf9890fh + DQ 3ff0e3ec32d3d1a2h + DQ 3ff11301d0125b51h + DQ 3ff1429aaea92de0h + DQ 3ff172b83c7d517bh + DQ 3ff1a35beb6fcb75h + DQ 3ff1d4873168b9aah + DQ 3ff2063b88628cd6h + DQ 3ff2387a6e756238h + DQ 3ff26b4565e27cddh + DQ 3ff29e9df51fdee1h + DQ 3ff2d285a6e4030bh + DQ 3ff306fe0a31b715h + DQ 3ff33c08b26416ffh + DQ 3ff371a7373aa9cbh + DQ 3ff3a7db34e59ff7h + DQ 3ff3dea64c123422h + DQ 3ff4160a21f72e2ah + DQ 3ff44e086061892dh + DQ 3ff486a2b5c13cd0h + DQ 3ff4bfdad5362a27h + DQ 3ff4f9b2769d2ca7h + DQ 3ff5342b569d4f82h + DQ 3ff56f4736b527dah + DQ 3ff5ab07dd485429h + DQ 3ff5e76f15ad2148h + DQ 3ff6247eb03a5585h + DQ 3ff6623882552225h + DQ 3ff6a09e667f3bcdh + DQ 3ff6dfb23c651a2fh + DQ 3ff71f75e8ec5f74h + DQ 3ff75feb564267c9h + DQ 3ff7a11473eb0187h + DQ 3ff7e2f336cf4e62h + DQ 3ff82589994cce13h + DQ 3ff868d99b4492edh + DQ 3ff8ace5422aa0dbh + DQ 3ff8f1ae99157736h + DQ 3ff93737b0cdc5e5h + DQ 3ff97d829fde4e50h + DQ 3ff9c49182a3f090h + DQ 3ffa0c667b5de565h + DQ 3ffa5503b23e255dh + DQ 3ffa9e6b5579fdbfh + DQ 3ffae89f995ad3adh + DQ 3ffb33a2b84f15fbh + DQ 3ffb7f76f2fb5e47h + DQ 3ffbcc1e904bc1d2h + DQ 3ffc199bdd85529ch + DQ 3ffc67f12e57d14bh + DQ 3ffcb720dcef9069h + DQ 3ffd072d4a07897ch + DQ 3ffd5818dcfba487h + DQ 3ffda9e603db3285h + DQ 3ffdfc97337b9b5fh + DQ 3ffe502ee78b3ff6h + DQ 3ffea4afa2a490dah + DQ 3ffefa1bee615a27h + DQ 3fff50765b6e4540h + DQ 3fffa7c1819e90d8h + +END
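
For context, the table above (and the head/tail pair before it) is consumed by the table-driven exp/expf/pow kernels elsewhere in this library. A minimal C sketch of the usual scheme follows; the function name and the degree-3 polynomial are illustrative only, and the two constants are the standard values of 64/ln(2) and ln(2)/64 rather than anything taken from these files.

    #include <math.h>

    extern const double __two_to_jby64_table[64];   /* 2^(j/64), j = 0..63 (table above) */

    /* Hypothetical sketch of a table-driven exp():
       x = m*ln2 + j*ln2/64 + r, so exp(x) = 2^m * 2^(j/64) * exp(r). */
    static double exp_sketch(double x)
    {
        static const double sixtyfour_by_ln2 = 9.2332482616893657e+01;  /* 64/ln(2) */
        static const double ln2_by_64        = 1.0830424696249145e-02;  /* ln(2)/64 */

        double dn = floor(x * sixtyfour_by_ln2 + 0.5);  /* nearest n with x ~ n*ln2/64 */
        int    n  = (int)dn;
        int    m  = n >> 6;                             /* n / 64: final power of two  */
        int    j  = n & 63;                             /* n % 64: table index         */
        double r  = x - dn * ln2_by_64;                 /* |r| <= ln2/128              */

        /* Low-order polynomial for exp(r); a real kernel uses more terms and
           the head/tail table split to keep the rounding error down. */
        double p = 1.0 + r * (1.0 + r * (0.5 + r * (1.0 / 6.0)));

        return ldexp(__two_to_jby64_table[j] * p, m);   /* 2^m * 2^(j/64) * exp(r) */
    }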