/***********************************************************************************/ /** MIT License **/ /** ----------- **/ /** **/ /** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/ /** **/ /** Permission is hereby granted, free of charge, to any person obtaining a copy **/ /** of this Software and associated documentation files (the "Software"), to deal **/ /** in the Software without restriction, including without limitation the rights **/ /** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/ /** copies of the Software, and to permit persons to whom the Software is **/ /** furnished to do so, subject to the following conditions: **/ /** **/ /** The above copyright notice and this permission notice shall be included in **/ /** all copies or substantial portions of the Software. **/ /** **/ /** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/ /** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/ /** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/ /** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/ /** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/ /** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/ /** THE SOFTWARE. 
**/ /***********************************************************************************/ #ifndef LIBM_INLINES_AMD_H_INCLUDED #define LIBM_INLINES_AMD_H_INCLUDED 1 #include "libm_util.h" /* Set defines for inline functions calling other inlines */ #if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \ defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \ defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \ defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \ defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \ defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \ (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)) #undef USE_RAISE_FPSW_FLAGS #define USE_RAISE_FPSW_FLAGS 1 #endif #if defined(USE_SPLITDOUBLE) /* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. Assumes that x is not zero, denormal, infinity or NaN, but these conditions are not checked */ static inline void splitDouble(double x, int *e, double *m) { unsigned long long ux, uy; GET_BITS_DP64(x, ux); uy = ux; ux &= EXPBITS_DP64; ux >>= EXPSHIFTBITS_DP64; *e = (int)ux - EXPBIAS_DP64 + 1; uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64; PUT_BITS_DP64(uy, x); *m = x; } #endif /* USE_SPLITDOUBLE */ #if defined(USE_SPLITDOUBLE_2) /* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0. Assumes that x is not zero, denormal, infinity or NaN, but these conditions are not checked. Also assumes EXPBIAS_DP is odd. With this assumption, e will be even on exit. 
*/ static inline void splitDouble_2(double x, int *e, double *m) { unsigned long long ux, vx; GET_BITS_DP64(x, ux); vx = ux; ux &= EXPBITS_DP64; ux >>= EXPSHIFTBITS_DP64; if (ux & 1) { /* The exponent is odd */ vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64; PUT_BITS_DP64(vx, x); *m = x; *e = ux - EXPBIAS_DP64; } else { /* The exponent is even */ vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64; PUT_BITS_DP64(vx, x); *m = x; *e = ux - EXPBIAS_DP64 - 1; } } #endif /* USE_SPLITDOUBLE_2 */ #if defined(USE_SPLITFLOAT) /* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. Assumes that x is not zero, denormal, infinity or NaN, but these conditions are not checked */ static inline void splitFloat(float x, int *e, float *m) { unsigned int ux, uy; GET_BITS_SP32(x, ux); uy = ux; ux &= EXPBITS_SP32; ux >>= EXPSHIFTBITS_SP32; *e = (int)ux - EXPBIAS_SP32 + 1; uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32; PUT_BITS_SP32(uy, x); *m = x; } #endif /* USE_SPLITFLOAT */ #if defined(USE_SCALEDOUBLE_1) /* Scales the double x by 2.0**n. Assumes EMIN <= n <= EMAX, though this condition is not checked. */ static inline double scaleDouble_1(double x, int n) { double t; /* Construct the number t = 2.0**n */ PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t); return x*t; } #endif /* USE_SCALEDOUBLE_1 */ #if defined(USE_SCALEDOUBLE_2) /* Scales the double x by 2.0**n. Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ static inline double scaleDouble_2(double x, int n) { double t1, t2; int n1, n2; n1 = n / 2; n2 = n - n1; /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); return (x*t1)*t2; } #endif /* USE_SCALEDOUBLE_2 */ #if defined(USE_SCALEDOUBLE_3) /* Scales the double x by 2.0**n. 
Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ static inline double scaleDouble_3(double x, int n) { double t1, t2, t3; int n1, n2, n3; n1 = n / 3; n2 = (n - n1) / 2; n3 = n - n1 - n2; /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ PUT_BITS_DP64(((long long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); PUT_BITS_DP64(((long long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); PUT_BITS_DP64(((long long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3); return ((x*t1)*t2)*t3; } #endif /* USE_SCALEDOUBLE_3 */ #if defined(USE_SCALEFLOAT_1) /* Scales the float x by 2.0**n. Assumes EMIN <= n <= EMAX, though this condition is not checked. */ static inline float scaleFloat_1(float x, int n) { float t; /* Construct the number t = 2.0**n */ PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t); return x*t; } #endif /* USE_SCALEFLOAT_1 */ #if defined(USE_SCALEFLOAT_2) /* Scales the float x by 2.0**n. Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ static inline float scaleFloat_2(float x, int n) { float t1, t2; int n1, n2; n1 = n / 2; n2 = n - n1; /* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); return (x*t1)*t2; } #endif /* USE_SCALEFLOAT_2 */ #if defined(USE_SCALEFLOAT_3) /* Scales the float x by 2.0**n. Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. 
*/ static inline float scaleFloat_3(float x, int n) { float t1, t2, t3; int n1, n2, n3; n1 = n / 3; n2 = (n - n1) / 2; n3 = n - n1 - n2; /* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3); return ((x*t1)*t2)*t3; } #endif /* USE_SCALEFLOAT_3 */ #if defined(USE_SETPRECISIONDOUBLE) unsigned int setPrecisionDouble(void) { unsigned int cw, cwold = 0; /* There is no precision control on Hammer */ return cwold; } #endif /* USE_SETPRECISIONDOUBLE */ #if defined(USE_RESTOREPRECISION) void restorePrecision(unsigned int cwold) { /* There is no precision control on Hammer */ return; } #endif /* USE_RESTOREPRECISION */ #if defined(USE_RAISE_FPSW_FLAGS) /* Raises floating-point status flags. The argument should be the bitwise or of the flags to be raised, from the list above, e.g. raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID); */ /* ISSUE - wat - 08182010 * These AMD_ISW_* flags are duplicated from trans.h * this is not clean; Mark S. 
did it for targeted fix of 855457 * Eliminate all redundant flags in the next overhaul */ #define AMD_ISW_INVALID 0x0001 #define AMD_ISW_DENORMAL 0x0002 #define AMD_ISW_ZERODIVIDE 0x0004 #define AMD_ISW_OVERFLOW 0x0008 #define AMD_ISW_UNDERFLOW 0x0010 #define AMD_ISW_INEXACT 0x0020 /* use this function from fpctrl.c */ void _set_statfp(uintptr_t); static inline void raise_fpsw_flags(int flags) { unsigned int f = 0; if (flags & AMD_F_OVERFLOW) { f |= AMD_ISW_OVERFLOW; } if (flags & AMD_F_UNDERFLOW) { f |= AMD_ISW_UNDERFLOW; } if (flags & AMD_F_DIVBYZERO) { f |= AMD_ISW_ZERODIVIDE; } if (flags & AMD_F_INVALID) { f |= AMD_ISW_INVALID; } if (flags & AMD_F_INEXACT) { f |= AMD_ISW_INEXACT; } _set_statfp(f); } #endif /* USE_RAISE_FPSW_FLAGS */ #if defined(USE_GET_FPSW_INLINE) /* Return the current floating-point status word */ static inline unsigned int get_fpsw_inline(void) { return _mm_getcsr(); } #endif /* USE_GET_FPSW_INLINE */ #if defined(USE_SET_FPSW_INLINE) /* Set the floating-point status word */ static inline void set_fpsw_inline(unsigned int sw) { _mm_setcsr(sw); } #endif /* USE_SET_FPSW_INLINE */ #if defined(USE_VAL_WITH_FLAGS) /* Returns a double value after raising the given flags, e.g. val_with_flags(AMD_F_INEXACT); */ static inline double val_with_flags(double val, int flags) { raise_fpsw_flags(flags); return val; } #endif /* USE_VAL_WITH_FLAGS */ #if defined(USE_VALF_WITH_FLAGS) /* Returns a float value after raising the given flags, e.g. valf_with_flags(AMD_F_INEXACT); */ static inline float valf_with_flags(float val, int flags) { raise_fpsw_flags(flags); return val; } #endif /* USE_VALF_WITH_FLAGS */ #if defined(USE_ZERO_WITH_FLAGS) /* Returns a double +zero after raising the given flags, e.g. 
zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID); */ static inline double zero_with_flags(int flags) { raise_fpsw_flags(flags); return 0.0; } #endif /* USE_ZERO_WITH_FLAGS */ #if defined(USE_ZEROF_WITH_FLAGS) /* Returns a float +zero after raising the given flags, e.g. zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID); */ static inline float zerof_with_flags(int flags) { raise_fpsw_flags(flags); return 0.0F; } #endif /* USE_ZEROF_WITH_FLAGS */ #if defined(USE_NAN_WITH_FLAGS) /* Returns a double quiet +nan after raising the given flags, e.g. nan_with_flags(AMD_F_INVALID); */ static inline double nan_with_flags(int flags) { double z; raise_fpsw_flags(flags); PUT_BITS_DP64(0x7ff8000000000000, z); return z; } #endif /* USE_NAN_WITH_FLAGS */ #if defined(USE_NANF_WITH_FLAGS) /* Returns a float quiet +nan after raising the given flags, e.g. nanf_with_flags(AMD_F_INVALID); */ static inline float nanf_with_flags(int flags) { float z; raise_fpsw_flags(flags); PUT_BITS_SP32(0x7fc00000, z); return z; } #endif /* USE_NANF_WITH_FLAGS */ #if defined(USE_INDEFINITE_WITH_FLAGS) /* Returns a double indefinite after raising the given flags, e.g. indefinite_with_flags(AMD_F_INVALID); */ static inline double indefinite_with_flags(int flags) { double z; raise_fpsw_flags(flags); PUT_BITS_DP64(0xfff8000000000000, z); return z; } #endif /* USE_INDEFINITE_WITH_FLAGS */ #if defined(USE_INDEFINITEF_WITH_FLAGS) /* Returns a float quiet +indefinite after raising the given flags, e.g. indefinitef_with_flags(AMD_F_INVALID); */ static inline float indefinitef_with_flags(int flags) { float z; raise_fpsw_flags(flags); PUT_BITS_SP32(0xffc00000, z); return z; } #endif /* USE_INDEFINITEF_WITH_FLAGS */ #ifdef USE_INFINITY_WITH_FLAGS /* Returns a positive double infinity after raising the given flags, e.g. 
infinity_with_flags(AMD_F_OVERFLOW); */ static inline double infinity_with_flags(int flags) { double z; raise_fpsw_flags(flags); PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z); return z; } #endif /* USE_INFINITY_WITH_FLAGS */ #ifdef USE_INFINITYF_WITH_FLAGS /* Returns a positive float infinity after raising the given flags, e.g. infinityf_with_flags(AMD_F_OVERFLOW); */ static inline float infinityf_with_flags(int flags) { float z; raise_fpsw_flags(flags); PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z); return z; } #endif /* USE_INFINITYF_WITH_FLAGS */ #if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF) #include #endif /* define the Microsoft specific error handling routine */ double _handle_error( char *fname, int opcode, unsigned long long value, int type, int flags, int error, double arg1, double arg2, int nargs ); float _handle_errorf( char *fname, int opcode, unsigned long long value, int type, int flags, int error, float arg1, float arg2, int nargs ); #if defined(USE_SPLITEXP) /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments abs(x) > large/(ln(base)) (where large is the largest representable floating point number) should be handled separately instead of calling this function. This function is called by exp, exp2, exp10, cosh and sinh. */ static inline void splitexp(double x, double logbase, double thirtytwo_by_logbaseof2, double logbaseof2_by_32_lead, double logbaseof2_by_32_trail, int *m, double *z1, double *z2) { double q, r, r1, r2, f1, f2; int n, j; /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain leading and trailing parts respectively of precomputed values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. two_to_jby32_lead_table contains the first 25 bits of precision, and two_to_jby32_trail_table contains a further 53 bits precision. 
*/ static const double two_to_jby32_lead_table[32] = { 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ 1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */ 1.04427373409271240234e+00, /* 0x3ff0b55860000000 */ 1.06714040040969848633e+00, /* 0x3ff11301d0000000 */ 1.09050768613815307617e+00, /* 0x3ff172b830000000 */ 1.11438673734664916992e+00, /* 0x3ff1d48730000000 */ 1.13878858089447021484e+00, /* 0x3ff2387a60000000 */ 1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */ 1.18920707702636718750e+00, /* 0x3ff306fe00000000 */ 1.21524733304977416992e+00, /* 0x3ff371a730000000 */ 1.24185776710510253906e+00, /* 0x3ff3dea640000000 */ 1.26905095577239990234e+00, /* 0x3ff44e0860000000 */ 1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */ 1.32523661851882934570e+00, /* 0x3ff5342b50000000 */ 1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */ 1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */ 1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */ 1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */ 1.47682613134384155273e+00, /* 0x3ff7a11470000000 */ 1.50916439294815063477e+00, /* 0x3ff8258990000000 */ 1.54221081733703613281e+00, /* 0x3ff8ace540000000 */ 1.57598084211349487305e+00, /* 0x3ff93737b0000000 */ 1.61049032211303710938e+00, /* 0x3ff9c49180000000 */ 1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */ 1.68179279565811157227e+00, /* 0x3ffae89f90000000 */ 1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */ 1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */ 1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */ 1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */ 1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */ 1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */ 1.95714408159255981445e+00}; /* 0x3fff507650000000 */ static const double two_to_jby32_trail_table[32] = { 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */ 4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */ 2.67125131841396124714e-10, /* 
0x3df25b50a4ebbf1b */ 4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */ 5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */ 5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */ 1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */ 3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */ 2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */ 4.49683815095311756138e-08, /* 0x3e682468446b6824 */ 1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */ 1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */ 2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */ 4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */ 8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */ 2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */ 3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */ 1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */ 3.46452721050003920866e-08, /* 0x3e62999c25159f11 */ 8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */ 2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */ 9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */ 8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */ 3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */ 1.11084703472699692902e-08, /* 0x3e47daf237553d84 */ 5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */ 4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */ 4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */ 1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */ 9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */ 4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */ /* Step 1. Reduce the argument. To perform argument reduction, we find the integer n such that x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and remainder by x - n*logbaseof2/32. The calculation of n is straightforward whereas the computation of x - n*logbaseof2/32 must be carried out carefully. 
logbaseof2/32 is so represented in two pieces that (1) logbaseof2/32 is known to extra precision, (2) the product of n and the leading piece is a model number and is hence calculated without error, and (3) the subtraction of the value obtained in (2) from x is a model number and is hence again obtained without error. */ r = x * thirtytwo_by_logbaseof2; /* Set n = nearest integer to r */ /* This is faster on Hammer */ if (r > 0) n = (int)(r + 0.5); else n = (int)(r - 0.5); r1 = x - n * logbaseof2_by_32_lead; r2 = - n * logbaseof2_by_32_trail; /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ /* j = n % 32; if (j < 0) j += 32; */ j = n & 0x0000001f; f1 = two_to_jby32_lead_table[j]; f2 = two_to_jby32_trail_table[j]; *m = (n - j) / 32; /* Step 2. The following is the core approximation. We approximate exp(r1+r2)-1 by a polynomial. */ r1 *= logbase; r2 *= logbase; r = r1 + r2; q = r1 + (r2 + r*r*( 5.00000000000000008883e-01 + r*( 1.66666666665260878863e-01 + r*( 4.16666666662260795726e-02 + r*( 8.33336798434219616221e-03 + r*( 1.38889490863777199667e-03 )))))); /* Step 3. Function value reconstruction. We now reconstruct the exponential of the input argument so that exp(x) = 2**m * (z1 + z2). The order of the computation below must be strictly observed. */ *z1 = f1; *z2 = f2 + ((f1 + f2) * q); } #endif /* USE_SPLITEXP */ #if defined(USE_SPLITEXPF) /* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments abs(x) > large/(ln(base)) (where large is the largest representable floating point number) should be handled separately instead of calling this function. This function is called by exp, exp2, exp10, cosh and sinh. 
*/ static inline void splitexpf(float x, float logbase, float thirtytwo_by_logbaseof2, float logbaseof2_by_32_lead, float logbaseof2_by_32_trail, int *m, float *z1, float *z2) { float q, r, r1, r2, f1, f2; int n, j; /* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain leading and trailing parts respectively of precomputed values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. two_to_jby32_lead_table contains the first 10 bits of precision, and two_to_jby32_trail_table contains a further 24 bits precision. */ static const float two_to_jby32_lead_table[32] = { 1.0000000000E+00F, /* 0x3F800000 */ 1.0214843750E+00F, /* 0x3F82C000 */ 1.0429687500E+00F, /* 0x3F858000 */ 1.0664062500E+00F, /* 0x3F888000 */ 1.0898437500E+00F, /* 0x3F8B8000 */ 1.1132812500E+00F, /* 0x3F8E8000 */ 1.1386718750E+00F, /* 0x3F91C000 */ 1.1621093750E+00F, /* 0x3F94C000 */ 1.1875000000E+00F, /* 0x3F980000 */ 1.2148437500E+00F, /* 0x3F9B8000 */ 1.2402343750E+00F, /* 0x3F9EC000 */ 1.2675781250E+00F, /* 0x3FA24000 */ 1.2949218750E+00F, /* 0x3FA5C000 */ 1.3242187500E+00F, /* 0x3FA98000 */ 1.3535156250E+00F, /* 0x3FAD4000 */ 1.3828125000E+00F, /* 0x3FB10000 */ 1.4140625000E+00F, /* 0x3FB50000 */ 1.4433593750E+00F, /* 0x3FB8C000 */ 1.4765625000E+00F, /* 0x3FBD0000 */ 1.5078125000E+00F, /* 0x3FC10000 */ 1.5410156250E+00F, /* 0x3FC54000 */ 1.5742187500E+00F, /* 0x3FC98000 */ 1.6093750000E+00F, /* 0x3FCE0000 */ 1.6445312500E+00F, /* 0x3FD28000 */ 1.6816406250E+00F, /* 0x3FD74000 */ 1.7167968750E+00F, /* 0x3FDBC000 */ 1.7558593750E+00F, /* 0x3FE0C000 */ 1.7929687500E+00F, /* 0x3FE58000 */ 1.8339843750E+00F, /* 0x3FEAC000 */ 1.8730468750E+00F, /* 0x3FEFC000 */ 1.9140625000E+00F, /* 0x3FF50000 */ 1.9570312500E+00F}; /* 0x3FFA8000 */ static const float two_to_jby32_trail_table[32] = { 0.0000000000E+00F, /* 0x00000000 */ 4.1277357377E-04F, /* 0x39D86988 */ 1.3050324051E-03F, /* 0x3AAB0D9F */ 7.3415064253E-04F, /* 0x3A407404 */ 6.6398258787E-04F, /* 0x3A2E0F1E */ 1.1054925853E-03F, /* 0x3A90E62D */ 
1.1675967835E-04F, /* 0x38F4DCE0 */ 1.6154836630E-03F, /* 0x3AD3BEA3 */ 1.7071149778E-03F, /* 0x3ADFC146 */ 4.0360994171E-04F, /* 0x39D39B9C */ 1.6234370414E-03F, /* 0x3AD4C982 */ 1.4728321694E-03F, /* 0x3AC10C0C */ 1.9176795613E-03F, /* 0x3AFB5AA6 */ 1.0178930825E-03F, /* 0x3A856AD3 */ 7.3992193211E-04F, /* 0x3A41F752 */ 1.0973819299E-03F, /* 0x3A8FD607 */ 1.5106226783E-04F, /* 0x391E6678 */ 1.8214319134E-03F, /* 0x3AEEBD1D */ 2.6364589576E-04F, /* 0x398A39F4 */ 1.3519275235E-03F, /* 0x3AB13329 */ 1.1952003697E-03F, /* 0x3A9CA845 */ 1.7620950239E-03F, /* 0x3AE6F619 */ 1.1153318919E-03F, /* 0x3A923054 */ 1.2242280645E-03F, /* 0x3AA07647 */ 1.5220546629E-04F, /* 0x391F9958 */ 1.8224230735E-03F, /* 0x3AEEDE5F */ 3.9278529584E-04F, /* 0x39CDEEC0 */ 1.7403248930E-03F, /* 0x3AE41B9D */ 2.3711356334E-05F, /* 0x37C6E7C0 */ 1.1207590578E-03F, /* 0x3A92E66F */ 1.1440613307E-03F, /* 0x3A95F454 */ 1.1287408415E-04F}; /* 0x38ECB6D0 */ /* Step 1. Reduce the argument. To perform argument reduction, we find the integer n such that x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and remainder by x - n*logbaseof2/32. The calculation of n is straightforward whereas the computation of x - n*logbaseof2/32 must be carried out carefully. logbaseof2/32 is so represented in two pieces that (1) logbaseof2/32 is known to extra precision, (2) the product of n and the leading piece is a model number and is hence calculated without error, and (3) the subtraction of the value obtained in (2) from x is a model number and is hence again obtained without error. */ r = x * thirtytwo_by_logbaseof2; /* Set n = nearest integer to r */ /* This is faster on Hammer */ if (r > 0) n = (int)(r + 0.5F); else n = (int)(r - 0.5F); r1 = x - n * logbaseof2_by_32_lead; r2 = - n * logbaseof2_by_32_trail; /* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. 
*/ /* j = n % 32; if (j < 0) j += 32; */ j = n & 0x0000001f; f1 = two_to_jby32_lead_table[j]; f2 = two_to_jby32_trail_table[j]; *m = (n - j) / 32; /* Step 2. The following is the core approximation. We approximate exp(r1+r2)-1 by a polynomial. */ r1 *= logbase; r2 *= logbase; r = r1 + r2; q = r1 + (r2 + r*r*( 5.00000000000000008883e-01F + r*( 1.66666666665260878863e-01F ))); /* Step 3. Function value reconstruction. We now reconstruct the exponential of the input argument so that exp(x) = 2**m * (z1 + z2). The order of the computation below must be strictly observed. */ *z1 = f1; *z2 = f2 + ((f1 + f2) * q); } #endif /* SPLITEXPF */ #if defined(USE_SCALEUPDOUBLE1024) /* Scales up a double (normal or denormal) whose bit pattern is given as ux by 2**1024. There are no checks that the input number is scalable by that amount. */ static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur) { unsigned long long uy; double y; if ((ux & EXPBITS_DP64) == 0) { /* ux is denormalised */ PUT_BITS_DP64(ux | 0x4010000000000000, y); if (ux & SIGNBIT_DP64) y += 4.0; else y -= 4.0; GET_BITS_DP64(y, uy); } else /* ux is normal */ uy = ux + 0x4000000000000000; *ur = uy; return; } #endif /* SCALEUPDOUBLE1024 */ #if defined(USE_SCALEDOWNDOUBLE) /* Scales down a double whose bit pattern is given as ux by 2**k. There are no checks that the input number is scalable by that amount. */ static inline void scaleDownDouble(unsigned long long ux, int k, unsigned long long *ur) { unsigned long long uy, uk, ax, xsign; int n, shift; xsign = ux & SIGNBIT_DP64; ax = ux & ~SIGNBIT_DP64; n = (int)((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k; if (n > 0) { uk = (unsigned long long)n << EXPSHIFTBITS_DP64; uy = (ax & ~EXPBITS_DP64) | uk; } else { uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000; shift = (1 - n); if (shift > MANTLENGTH_DP64 + 1) /* Sigh. 
Shifting works mod 64 so be careful not to shift too much */ uy = 0; else { /* Make sure we round the result */ uy >>= shift - 1; uy = (uy >> 1) + (uy & 1); } } *ur = uy | xsign; } #endif /* SCALEDOWNDOUBLE */ #if defined(USE_SCALEUPFLOAT128) /* Scales up a float (normal or denormal) whose bit pattern is given as ux by 2**128. There are no checks that the input number is scalable by that amount. */ static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur) { unsigned int uy; float y; if ((ux & EXPBITS_SP32) == 0) { /* ux is denormalised */ PUT_BITS_SP32(ux | 0x40800000, y); /* Compensate for the implicit bit just added */ if (ux & SIGNBIT_SP32) y += 4.0F; else y -= 4.0F; GET_BITS_SP32(y, uy); } else /* ux is normal */ uy = ux + 0x40000000; *ur = uy; } #endif /* SCALEUPFLOAT128 */ #if defined(USE_SCALEDOWNFLOAT) /* Scales down a float whose bit pattern is given as ux by 2**k. There are no checks that the input number is scalable by that amount. */ static inline void scaleDownFloat(unsigned int ux, int k, unsigned int *ur) { unsigned int uy, uk, ax, xsign; int n, shift; xsign = ux & SIGNBIT_SP32; ax = ux & ~SIGNBIT_SP32; n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k; if (n > 0) { uk = (unsigned int)n << EXPSHIFTBITS_SP32; uy = (ax & ~EXPBITS_SP32) | uk; } else { uy = (ax & ~EXPBITS_SP32) | 0x00800000; shift = (1 - n); if (shift > MANTLENGTH_SP32 + 1) /* Sigh. Shifting works mod 32 so be careful not to shift too much */ uy = 0; else { /* Make sure we round the result */ uy >>= shift - 1; uy = (uy >> 1) + (uy & 1); } } *ur = uy | xsign; } #endif /* SCALEDOWNFLOAT */ #if defined(USE_SQRT_AMD_INLINE) static inline double sqrt_amd_inline(double x) { /* Computes the square root of x. The calculation is carried out in three steps. Step 1. Reduction. The input argument is scaled to the interval [1, 4) by computing x = 2^e * y, where y in [1,4). Furthermore y is decomposed as y = c + t where c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. Step 2. Approximation. 
An approximation q = sqrt(1 + (t/c)) - 1 is obtained from a basic series expansion using precomputed values stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl. Step 3. Reconstruction. The value of sqrt(x) is reconstructed via sqrt(x) = 2^(e/2) * sqrt(y) = 2^(e/2) * sqrt(c) * sqrt(y/c) = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] */ unsigned long long ux, ax, u; double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; int e, denorm = 0, index; /* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain leading and trailing parts respectively of precomputed values of sqrt(j/32), for j = 32, 33, ..., 128. rt_jby32_lead_table_dbl contains the first 21 bits of precision, and rt_jby32_trail_table_dbl contains a further 53 bits precision. */ static const double rt_jby32_lead_table_dbl[97] = { 1.00000000000000000000e+00, /* 0x3ff0000000000000 */ 1.01550388336181640625e+00, /* 0x3ff03f8100000000 */ 1.03077602386474609375e+00, /* 0x3ff07e0f00000000 */ 1.04582500457763671875e+00, /* 0x3ff0bbb300000000 */ 1.06065940856933593750e+00, /* 0x3ff0f87600000000 */ 1.07528972625732421875e+00, /* 0x3ff1346300000000 */ 1.08972454071044921875e+00, /* 0x3ff16f8300000000 */ 1.10396957397460937500e+00, /* 0x3ff1a9dc00000000 */ 1.11803340911865234375e+00, /* 0x3ff1e37700000000 */ 1.13192272186279296875e+00, /* 0x3ff21c5b00000000 */ 1.14564323425292968750e+00, /* 0x3ff2548e00000000 */ 1.15920162200927734375e+00, /* 0x3ff28c1700000000 */ 1.17260360717773437500e+00, /* 0x3ff2c2fc00000000 */ 1.18585395812988281250e+00, /* 0x3ff2f94200000000 */ 1.19895744323730468750e+00, /* 0x3ff32eee00000000 */ 1.21191978454589843750e+00, /* 0x3ff3640600000000 */ 1.22474479675292968750e+00, /* 0x3ff3988e00000000 */ 1.23743629455566406250e+00, /* 0x3ff3cc8a00000000 */ 1.25000000000000000000e+00, /* 0x3ff4000000000000 */ 1.26243782043457031250e+00, /* 0x3ff432f200000000 */ 1.27475452423095703125e+00, /* 0x3ff4656500000000 */ 1.28695297241210937500e+00, /* 
0x3ff4975c00000000 */ 1.29903793334960937500e+00, /* 0x3ff4c8dc00000000 */ 1.31101036071777343750e+00, /* 0x3ff4f9e600000000 */ 1.32287502288818359375e+00, /* 0x3ff52a7f00000000 */ 1.33463478088378906250e+00, /* 0x3ff55aaa00000000 */ 1.34629058837890625000e+00, /* 0x3ff58a6800000000 */ 1.35784721374511718750e+00, /* 0x3ff5b9be00000000 */ 1.36930561065673828125e+00, /* 0x3ff5e8ad00000000 */ 1.38066959381103515625e+00, /* 0x3ff6173900000000 */ 1.39194107055664062500e+00, /* 0x3ff6456400000000 */ 1.40312099456787109375e+00, /* 0x3ff6732f00000000 */ 1.41421318054199218750e+00, /* 0x3ff6a09e00000000 */ 1.42521858215332031250e+00, /* 0x3ff6cdb200000000 */ 1.43614006042480468750e+00, /* 0x3ff6fa6e00000000 */ 1.44697952270507812500e+00, /* 0x3ff726d400000000 */ 1.45773792266845703125e+00, /* 0x3ff752e500000000 */ 1.46841716766357421875e+00, /* 0x3ff77ea300000000 */ 1.47901916503906250000e+00, /* 0x3ff7aa1000000000 */ 1.48954677581787109375e+00, /* 0x3ff7d52f00000000 */ 1.50000000000000000000e+00, /* 0x3ff8000000000000 */ 1.51038074493408203125e+00, /* 0x3ff82a8500000000 */ 1.52068996429443359375e+00, /* 0x3ff854bf00000000 */ 1.53093051910400390625e+00, /* 0x3ff87eb100000000 */ 1.54110336303710937500e+00, /* 0x3ff8a85c00000000 */ 1.55120849609375000000e+00, /* 0x3ff8d1c000000000 */ 1.56124877929687500000e+00, /* 0x3ff8fae000000000 */ 1.57122516632080078125e+00, /* 0x3ff923bd00000000 */ 1.58113861083984375000e+00, /* 0x3ff94c5800000000 */ 1.59099006652832031250e+00, /* 0x3ff974b200000000 */ 1.60078048706054687500e+00, /* 0x3ff99ccc00000000 */ 1.61051177978515625000e+00, /* 0x3ff9c4a800000000 */ 1.62018489837646484375e+00, /* 0x3ff9ec4700000000 */ 1.62979984283447265625e+00, /* 0x3ffa13a900000000 */ 1.63935947418212890625e+00, /* 0x3ffa3ad100000000 */ 1.64886283874511718750e+00, /* 0x3ffa61be00000000 */ 1.65831184387207031250e+00, /* 0x3ffa887200000000 */ 1.66770744323730468750e+00, /* 0x3ffaaeee00000000 */ 1.67705059051513671875e+00, /* 0x3ffad53300000000 */ 
1.68634128570556640625e+00, /* 0x3ffafb4100000000 */ 1.69558238983154296875e+00, /* 0x3ffb211b00000000 */ 1.70477199554443359375e+00, /* 0x3ffb46bf00000000 */ 1.71391296386718750000e+00, /* 0x3ffb6c3000000000 */ 1.72300529479980468750e+00, /* 0x3ffb916e00000000 */ 1.73204994201660156250e+00, /* 0x3ffbb67a00000000 */ 1.74104785919189453125e+00, /* 0x3ffbdb5500000000 */ 1.75000000000000000000e+00, /* 0x3ffc000000000000 */ 1.75890541076660156250e+00, /* 0x3ffc247a00000000 */ 1.76776695251464843750e+00, /* 0x3ffc48c600000000 */ 1.77658367156982421875e+00, /* 0x3ffc6ce300000000 */ 1.78535652160644531250e+00, /* 0x3ffc90d200000000 */ 1.79408740997314453125e+00, /* 0x3ffcb49500000000 */ 1.80277538299560546875e+00, /* 0x3ffcd82b00000000 */ 1.81142139434814453125e+00, /* 0x3ffcfb9500000000 */ 1.82002735137939453125e+00, /* 0x3ffd1ed500000000 */ 1.82859230041503906250e+00, /* 0x3ffd41ea00000000 */ 1.83711719512939453125e+00, /* 0x3ffd64d500000000 */ 1.84560203552246093750e+00, /* 0x3ffd879600000000 */ 1.85404872894287109375e+00, /* 0x3ffdaa2f00000000 */ 1.86245727539062500000e+00, /* 0x3ffdcca000000000 */ 1.87082862854003906250e+00, /* 0x3ffdeeea00000000 */ 1.87916183471679687500e+00, /* 0x3ffe110c00000000 */ 1.88745784759521484375e+00, /* 0x3ffe330700000000 */ 1.89571857452392578125e+00, /* 0x3ffe54dd00000000 */ 1.90394306182861328125e+00, /* 0x3ffe768d00000000 */ 1.91213226318359375000e+00, /* 0x3ffe981800000000 */ 1.92028617858886718750e+00, /* 0x3ffeb97e00000000 */ 1.92840576171875000000e+00, /* 0x3ffedac000000000 */ 1.93649101257324218750e+00, /* 0x3ffefbde00000000 */ 1.94454288482666015625e+00, /* 0x3fff1cd900000000 */ 1.95256233215332031250e+00, /* 0x3fff3db200000000 */ 1.96054744720458984375e+00, /* 0x3fff5e6700000000 */ 1.96850109100341796875e+00, /* 0x3fff7efb00000000 */ 1.97642326354980468750e+00, /* 0x3fff9f6e00000000 */ 1.98431301116943359375e+00, /* 0x3fffbfbf00000000 */ 1.99217128753662109375e+00, /* 0x3fffdfef00000000 */ 2.00000000000000000000e+00}; /* 
0x4000000000000000 */ static const double rt_jby32_trail_table_dbl[97] = { 0.00000000000000000000e+00, /* 0x0000000000000000 */ 9.17217678638807524014e-07, /* 0x3eaec6d70177881c */ 3.82539669043705364790e-07, /* 0x3e99abfb41bd6b24 */ 2.85899577162227138140e-08, /* 0x3e5eb2bf6bab55a2 */ 7.63210485349101216659e-07, /* 0x3ea99bed9b2d8d0c */ 9.32123004127716212874e-07, /* 0x3eaf46e029c1b296 */ 1.95174719169309219157e-07, /* 0x3e8a3226fc42f30c */ 5.34316371481845492427e-07, /* 0x3ea1edbe20701d73 */ 5.79631242504454563052e-07, /* 0x3ea372fe94f82be7 */ 4.20404384109571705948e-07, /* 0x3e9c367e08e7bb06 */ 6.89486030314147010716e-07, /* 0x3ea722a3d0a66608 */ 6.89927685625314560328e-07, /* 0x3ea7266f067ca1d6 */ 3.32778123013641425828e-07, /* 0x3e965515a9b34850 */ 1.64433259436999584387e-07, /* 0x3e8611e23ef6c1bd */ 4.37590875197899335723e-07, /* 0x3e9d5dc1059ed8e7 */ 1.79808183816018617413e-07, /* 0x3e88222982d0e4f4 */ 7.46386593615986477624e-08, /* 0x3e7409212e7d0322 */ 5.72520794105201454728e-07, /* 0x3ea335ea8a5fcf39 */ 0.00000000000000000000e+00, /* 0x0000000000000000 */ 2.96860689431670420344e-07, /* 0x3e93ec071e938bfe */ 3.54167239176257065345e-07, /* 0x3e97c48bfd9862c6 */ 7.95211265664474710063e-07, /* 0x3eaaaed010f74671 */ 1.72327048595145565621e-07, /* 0x3e87211cbfeb62e0 */ 6.99494915996239297020e-07, /* 0x3ea7789d9660e72d */ 6.32644111701500844315e-07, /* 0x3ea53a5f1d36f1cf */ 6.20124838851440463844e-10, /* 0x3e054eacff2057dc */ 6.13404719757812629969e-07, /* 0x3ea4951b3e6a83cc */ 3.47654909777986407387e-07, /* 0x3e9754aa76884c66 */ 7.83106177002392475763e-07, /* 0x3eaa46d4b1de1074 */ 5.33337372440526357008e-07, /* 0x3ea1e55548f92635 */ 2.01508648555298681765e-08, /* 0x3e55a3070dd17788 */ 5.25472356925843939587e-07, /* 0x3ea1a1c5eedb0801 */ 3.81831102861301692797e-07, /* 0x3e999fcef32422cc */ 6.99220602161420018738e-07, /* 0x3ea776425d6b0199 */ 6.01209702477462624811e-07, /* 0x3ea42c5a1e0191a2 */ 9.01437000591944740554e-08, /* 0x3e7832a0bdff1327 */ 
5.10428680864685379950e-08, /* 0x3e6b674743636676 */ 3.47895267104621031421e-07, /* 0x3e9758cb90d2f714 */ 7.80735841510641848628e-07, /* 0x3eaa3278459cde25 */ 1.35158752025506517690e-07, /* 0x3e822404f4a103ee */ 0.00000000000000000000e+00, /* 0x0000000000000000 */ 1.76523947728535489812e-09, /* 0x3e1e539af6892ac5 */ 6.68280121328499932183e-07, /* 0x3ea66c7b872c9cd0 */ 5.70135482405123276616e-07, /* 0x3ea3216d2f43887d */ 1.37705134737562525897e-07, /* 0x3e827b832cbedc0e */ 7.09655107074516613672e-07, /* 0x3ea7cfe41579091d */ 7.20302724551461693011e-07, /* 0x3ea82b5a713c490a */ 4.69926266058212796694e-07, /* 0x3e9f8945932d872e */ 2.19244345915999437026e-07, /* 0x3e8d6d2da9490251 */ 1.91141411617401877927e-07, /* 0x3e89a791a3114e4a */ 5.72297665296622053774e-07, /* 0x3ea333ffe005988d */ 5.61055484436830560103e-07, /* 0x3ea2d36e0ed49ab1 */ 2.76225500213991506100e-07, /* 0x3e92898498f55f9e */ 7.58466189522395692908e-07, /* 0x3ea9732cca1032a3 */ 1.56893371256836029827e-07, /* 0x3e850ed0b02a22d2 */ 4.06038997708867066507e-07, /* 0x3e9b3fb265b1e40a */ 5.51305629612057435809e-07, /* 0x3ea27fade682d1de */ 5.64778487026561123207e-07, /* 0x3ea2f36906f707ba */ 3.92609705553556897517e-07, /* 0x3e9a58fbbee883b6 */ 9.09698438776943827802e-07, /* 0x3eae864005bca6d7 */ 1.05949774066016139743e-07, /* 0x3e7c70d02300f263 */ 7.16578798392844784244e-07, /* 0x3ea80b5d712d8e3e */ 6.86233073531233972561e-07, /* 0x3ea706b27cc7d390 */ 7.99211473033494452908e-07, /* 0x3eaad12c9d849a97 */ 8.65552275731027456121e-07, /* 0x3ead0b09954e764b */ 6.75456120386058448618e-07, /* 0x3ea6aa1fb7826cbd */ 0.00000000000000000000e+00, /* 0x0000000000000000 */ 4.99167184520462138743e-07, /* 0x3ea0bfd03f46763c */ 4.51720373502110930296e-10, /* 0x3dff0abfb4adfb9e */ 1.28874162718371367439e-07, /* 0x3e814c151f991b2e */ 5.85529267186999798656e-07, /* 0x3ea3a5a879b09292 */ 1.01827770937125531924e-07, /* 0x3e7b558d173f9796 */ 2.54736389177809626508e-07, /* 0x3e9118567cd83fb8 */ 6.98925535290464831294e-07, /* 
0x3ea773b981896751 */ 1.20940735036524314513e-07, /* 0x3e803b7df49f48a8 */ 5.43759351196479689657e-08, /* 0x3e6d315f22491900 */ 1.11957989042397958409e-07, /* 0x3e7e0db1c5bb84b2 */ 8.47006714134442661218e-07, /* 0x3eac6bbb7644ff76 */ 8.92831044643427836228e-07, /* 0x3eadf55c3afec01f */ 7.77828292464916501663e-07, /* 0x3eaa197e81034da3 */ 6.48469316302918797451e-08, /* 0x3e71683f4920555d */ 2.12579816658859849140e-07, /* 0x3e8c882fd78bb0b0 */ 7.61222472580559138435e-07, /* 0x3ea98ad9eb7b83ec */ 2.86488961857314189607e-07, /* 0x3e9339d7c7777273 */ 2.14637363790165363515e-07, /* 0x3e8ccee237cae6fe */ 5.44137005612605847831e-08, /* 0x3e6d368fe324a146 */ 2.58378284856442408413e-07, /* 0x3e9156e7b6d99b45 */ 3.15848939061134843091e-07, /* 0x3e95323e5310b5c1 */ 6.60530466255089632309e-07, /* 0x3ea629e9db362f5d */ 7.63436345535852301127e-07, /* 0x3ea99dde4728d7ec */ 8.68233432860324345268e-08, /* 0x3e774e746878544d */ 9.45465175398023087082e-07, /* 0x3eafb97be873a87d */ 8.77499534786171267246e-07, /* 0x3ead71a9e23c2f63 */ 2.74055432394999316135e-07, /* 0x3e92643c89cda173 */ 4.72129009349126213532e-07, /* 0x3e9faf1d57a4d56c */ 8.93777032327078947306e-07, /* 0x3eadfd7c7ab7b282 */ 0.00000000000000000000e+00}; /* 0x0000000000000000 */ /* Handle special arguments first */ GET_BITS_DP64(x, ux); ax = ux & (~SIGNBIT_DP64); if(ax >= 0x7ff0000000000000) { /* x is either NaN or infinity */ if (ux & MANTBITS_DP64) /* x is NaN */ return x + x; /* Raise invalid if it is a signalling NaN */ else if (ux & SIGNBIT_DP64) /* x is negative infinity */ return nan_with_flags(AMD_F_INVALID); else /* x is positive infinity */ return x; } else if (ux & SIGNBIT_DP64) { /* x is negative. 
*/ if (ux == SIGNBIT_DP64) /* Handle negative zero first */ return x; else return nan_with_flags(AMD_F_INVALID); } else if (ux <= 0x000fffffffffffff) { /* x is denormalised or zero */ if (ux == 0) /* x is zero */ return x; else { /* x is denormalised; scale it up */ /* Normalize x by increasing the exponent by 60 and subtracting a correction to account for the implicit bit. This replaces a slow denormalized multiplication by a fast normal subtraction. */ static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ denorm = 1; GET_BITS_DP64(x, ux); PUT_BITS_DP64(ux | 0x03d0000000000000, x); x -= corr; GET_BITS_DP64(x, ux); } } /* Main algorithm */ /* Find y and e such that x = 2^e * y, where y in [1,4). This is done using an in-lined variant of splitDouble, which also ensures that e is even. */ y = x; ux &= EXPBITS_DP64; ux >>= EXPSHIFTBITS_DP64; if (ux & 1) { GET_BITS_DP64(y, u); u &= (SIGNBIT_DP64 | MANTBITS_DP64); u |= ONEEXPBITS_DP64; PUT_BITS_DP64(u, y); e = ux - EXPBIAS_DP64; } else { GET_BITS_DP64(y, u); u &= (SIGNBIT_DP64 | MANTBITS_DP64); u |= TWOEXPBITS_DP64; PUT_BITS_DP64(u, y); e = ux - EXPBIAS_DP64 - 1; } /* Find the index of the sub-interval of [1,4) in which y lies. */ index = (int)(32.0*y+0.5); /* Look up the table values and compute c and r = c/t */ rtc_lead = rt_jby32_lead_table_dbl[index-32]; rtc_trail = rt_jby32_trail_table_dbl[index-32]; c = 0.03125*index; r = (y - c)/c; /* Find q = sqrt(1+r) - 1. 
From one step of Newton on (q+1)^2 = 1+r */ p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01)); twop = p + p; q = p - (p*p + (twop - r))/(twop + 2.0); /* Reconstruction */ rtc = rtc_lead + rtc_trail; e >>= 1; /* e = e/2 */ z = rtc_lead + (rtc*q+rtc_trail); if (denorm) { /* Scale by 2**(e-30) */ PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); z *= r; } else { /* Scale by 2**e */ PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); z *= r; } return z; } #endif /* SQRT_AMD_INLINE */ #if defined(USE_SQRTF_AMD_INLINE) static inline float sqrtf_amd_inline(float x) { /* Computes the square root of x. The calculation is carried out in three steps. Step 1. Reduction. The input argument is scaled to the interval [1, 4) by computing x = 2^e * y, where y in [1,4). Furthermore y is decomposed as y = c + t where c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. Step 2. Approximation. An approximation q = sqrt(1 + (t/c)) - 1 is obtained from a basic series expansion using precomputed values stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float. Step 3. Reconstruction. The value of sqrt(x) is reconstructed via sqrt(x) = 2^(e/2) * sqrt(y) = 2^(e/2) * sqrt(c) * sqrt(y/c) = 2^(e/2) * sqrt(c) * sqrt(1 + t/c) = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] */ unsigned int ux, ax, u; float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; int e, denorm = 0, index; /* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain leading and trailing parts respectively of precomputed values of sqrt(j/32), for j = 32, 33, ..., 128. rt_jby32_lead_table_float contains the first 13 bits of precision, and rt_jby32_trail_table_float contains a further 24 bits precision. 
*/ static const float rt_jby32_lead_table_float[97] = { 1.00000000000000000000e+00F, /* 0x3f800000 */ 1.01538085937500000000e+00F, /* 0x3f81f800 */ 1.03076171875000000000e+00F, /* 0x3f83f000 */ 1.04565429687500000000e+00F, /* 0x3f85d800 */ 1.06054687500000000000e+00F, /* 0x3f87c000 */ 1.07519531250000000000e+00F, /* 0x3f89a000 */ 1.08959960937500000000e+00F, /* 0x3f8b7800 */ 1.10375976562500000000e+00F, /* 0x3f8d4800 */ 1.11791992187500000000e+00F, /* 0x3f8f1800 */ 1.13183593750000000000e+00F, /* 0x3f90e000 */ 1.14550781250000000000e+00F, /* 0x3f92a000 */ 1.15917968750000000000e+00F, /* 0x3f946000 */ 1.17236328125000000000e+00F, /* 0x3f961000 */ 1.18579101562500000000e+00F, /* 0x3f97c800 */ 1.19873046875000000000e+00F, /* 0x3f997000 */ 1.21191406250000000000e+00F, /* 0x3f9b2000 */ 1.22460937500000000000e+00F, /* 0x3f9cc000 */ 1.23730468750000000000e+00F, /* 0x3f9e6000 */ 1.25000000000000000000e+00F, /* 0x3fa00000 */ 1.26220703125000000000e+00F, /* 0x3fa19000 */ 1.27465820312500000000e+00F, /* 0x3fa32800 */ 1.28686523437500000000e+00F, /* 0x3fa4b800 */ 1.29882812500000000000e+00F, /* 0x3fa64000 */ 1.31079101562500000000e+00F, /* 0x3fa7c800 */ 1.32275390625000000000e+00F, /* 0x3fa95000 */ 1.33447265625000000000e+00F, /* 0x3faad000 */ 1.34619140625000000000e+00F, /* 0x3fac5000 */ 1.35766601562500000000e+00F, /* 0x3fadc800 */ 1.36914062500000000000e+00F, /* 0x3faf4000 */ 1.38061523437500000000e+00F, /* 0x3fb0b800 */ 1.39184570312500000000e+00F, /* 0x3fb22800 */ 1.40307617187500000000e+00F, /* 0x3fb39800 */ 1.41406250000000000000e+00F, /* 0x3fb50000 */ 1.42504882812500000000e+00F, /* 0x3fb66800 */ 1.43603515625000000000e+00F, /* 0x3fb7d000 */ 1.44677734375000000000e+00F, /* 0x3fb93000 */ 1.45751953125000000000e+00F, /* 0x3fba9000 */ 1.46826171875000000000e+00F, /* 0x3fbbf000 */ 1.47900390625000000000e+00F, /* 0x3fbd5000 */ 1.48950195312500000000e+00F, /* 0x3fbea800 */ 1.50000000000000000000e+00F, /* 0x3fc00000 */ 1.51025390625000000000e+00F, /* 0x3fc15000 */ 
1.52050781250000000000e+00F, /* 0x3fc2a000 */ 1.53076171875000000000e+00F, /* 0x3fc3f000 */ 1.54101562500000000000e+00F, /* 0x3fc54000 */ 1.55102539062500000000e+00F, /* 0x3fc68800 */ 1.56103515625000000000e+00F, /* 0x3fc7d000 */ 1.57104492187500000000e+00F, /* 0x3fc91800 */ 1.58105468750000000000e+00F, /* 0x3fca6000 */ 1.59082031250000000000e+00F, /* 0x3fcba000 */ 1.60058593750000000000e+00F, /* 0x3fcce000 */ 1.61035156250000000000e+00F, /* 0x3fce2000 */ 1.62011718750000000000e+00F, /* 0x3fcf6000 */ 1.62963867187500000000e+00F, /* 0x3fd09800 */ 1.63916015625000000000e+00F, /* 0x3fd1d000 */ 1.64868164062500000000e+00F, /* 0x3fd30800 */ 1.65820312500000000000e+00F, /* 0x3fd44000 */ 1.66748046875000000000e+00F, /* 0x3fd57000 */ 1.67700195312500000000e+00F, /* 0x3fd6a800 */ 1.68627929687500000000e+00F, /* 0x3fd7d800 */ 1.69555664062500000000e+00F, /* 0x3fd90800 */ 1.70458984375000000000e+00F, /* 0x3fda3000 */ 1.71386718750000000000e+00F, /* 0x3fdb6000 */ 1.72290039062500000000e+00F, /* 0x3fdc8800 */ 1.73193359375000000000e+00F, /* 0x3fddb000 */ 1.74096679687500000000e+00F, /* 0x3fded800 */ 1.75000000000000000000e+00F, /* 0x3fe00000 */ 1.75878906250000000000e+00F, /* 0x3fe12000 */ 1.76757812500000000000e+00F, /* 0x3fe24000 */ 1.77636718750000000000e+00F, /* 0x3fe36000 */ 1.78515625000000000000e+00F, /* 0x3fe48000 */ 1.79394531250000000000e+00F, /* 0x3fe5a000 */ 1.80273437500000000000e+00F, /* 0x3fe6c000 */ 1.81127929687500000000e+00F, /* 0x3fe7d800 */ 1.81982421875000000000e+00F, /* 0x3fe8f000 */ 1.82836914062500000000e+00F, /* 0x3fea0800 */ 1.83691406250000000000e+00F, /* 0x3feb2000 */ 1.84545898437500000000e+00F, /* 0x3fec3800 */ 1.85400390625000000000e+00F, /* 0x3fed5000 */ 1.86230468750000000000e+00F, /* 0x3fee6000 */ 1.87060546875000000000e+00F, /* 0x3fef7000 */ 1.87915039062500000000e+00F, /* 0x3ff08800 */ 1.88745117187500000000e+00F, /* 0x3ff19800 */ 1.89550781250000000000e+00F, /* 0x3ff2a000 */ 1.90380859375000000000e+00F, /* 0x3ff3b000 */ 
1.91210937500000000000e+00F, /* 0x3ff4c000 */ 1.92016601562500000000e+00F, /* 0x3ff5c800 */ 1.92822265625000000000e+00F, /* 0x3ff6d000 */ 1.93627929687500000000e+00F, /* 0x3ff7d800 */ 1.94433593750000000000e+00F, /* 0x3ff8e000 */ 1.95239257812500000000e+00F, /* 0x3ff9e800 */ 1.96044921875000000000e+00F, /* 0x3ffaf000 */ 1.96826171875000000000e+00F, /* 0x3ffbf000 */ 1.97631835937500000000e+00F, /* 0x3ffcf800 */ 1.98413085937500000000e+00F, /* 0x3ffdf800 */ 1.99194335937500000000e+00F, /* 0x3ffef800 */ 2.00000000000000000000e+00F}; /* 0x40000000 */ static const float rt_jby32_trail_table_float[97] = { 0.00000000000000000000e+00F, /* 0x00000000 */ 1.23941208585165441036e-04F, /* 0x3901f637 */ 1.46876545841223560274e-05F, /* 0x37766aff */ 1.70736297150142490864e-04F, /* 0x393307ad */ 1.13296780909877270460e-04F, /* 0x38ed99bf */ 9.53458802541717886925e-05F, /* 0x38c7f46e */ 1.25126505736261606216e-04F, /* 0x39033464 */ 2.10342666832730174065e-04F, /* 0x395c8f6e */ 1.14066875539720058441e-04F, /* 0x38ef3730 */ 8.72047676239162683487e-05F, /* 0x38b6e1b4 */ 1.36111237225122749805e-04F, /* 0x390eb915 */ 2.26244374061934649944e-05F, /* 0x37bdc99c */ 2.40658700931817293167e-04F, /* 0x397c5954 */ 6.31069415248930454254e-05F, /* 0x38845848 */ 2.27412077947519719601e-04F, /* 0x396e7577 */ 5.90185391047270968556e-06F, /* 0x36c6088a */ 1.35496389702893793583e-04F, /* 0x390e1409 */ 1.32179571664892137051e-04F, /* 0x390a99af */ 0.00000000000000000000e+00F, /* 0x00000000 */ 2.31086043640971183777e-04F, /* 0x39724fb0 */ 9.66752704698592424393e-05F, /* 0x38cabe24 */ 8.85332483449019491673e-05F, /* 0x38b9aaed */ 2.09980673389509320259e-04F, /* 0x395c2e42 */ 2.20044588786549866199e-04F, /* 0x3966bbc5 */ 1.21749282698146998882e-04F, /* 0x38ff53a6 */ 1.62125259521417319775e-04F, /* 0x392a002b */ 9.97955357888713479042e-05F, /* 0x38d14952 */ 1.81545779923908412457e-04F, /* 0x393e5d53 */ 1.65768768056295812130e-04F, /* 0x392dd237 */ 5.48927710042335093021e-05F, /* 0x38663caa */ 
9.53875860432162880898e-05F, /* 0x38c80ad2 */ 4.53481625299900770187e-05F, /* 0x383e3438 */ 1.51062369695864617825e-04F, /* 0x391e667f */ 1.70453247847035527229e-04F, /* 0x3932bbb2 */ 1.05505387182347476482e-04F, /* 0x38dd42c6 */ 2.02269104192964732647e-04F, /* 0x39541833 */ 2.18442466575652360916e-04F, /* 0x39650db4 */ 1.55796806211583316326e-04F, /* 0x39235d63 */ 1.60395247803535312414e-05F, /* 0x37868c9e */ 4.49578510597348213196e-05F, /* 0x383c9120 */ 0.00000000000000000000e+00F, /* 0x00000000 */ 1.26840444863773882389e-04F, /* 0x39050079 */ 1.82820076588541269302e-04F, /* 0x393fb364 */ 1.69370483490638434887e-04F, /* 0x3931990b */ 8.78757418831810355186e-05F, /* 0x38b849ee */ 1.83815121999941766262e-04F, /* 0x3940be7f */ 2.14343352126888930798e-04F, /* 0x3960c15b */ 1.80714370799250900745e-04F, /* 0x393d7e25 */ 8.41425862745381891727e-05F, /* 0x38b075b5 */ 1.69945167726837098598e-04F, /* 0x3932334f */ 1.95121858268976211548e-04F, /* 0x394c99a0 */ 1.60778334247879683971e-04F, /* 0x3928969b */ 6.79871009197086095810e-05F, /* 0x388e944c */ 1.61929419846273958683e-04F, /* 0x3929cb99 */ 1.99474830878898501396e-04F, /* 0x39512a1e */ 1.81604162207804620266e-04F, /* 0x393e6cff */ 1.09270178654696792364e-04F, /* 0x38e527fb */ 2.27539261686615645885e-04F, /* 0x396e979b */ 4.90300008095800876617e-05F, /* 0x384da590 */ 6.28985289949923753738e-05F, /* 0x3883e864 */ 2.58551553997676819563e-05F, /* 0x37d8e386 */ 1.82868374395184218884e-04F, /* 0x393fc05b */ 4.64625991298817098141e-05F, /* 0x3842e0d6 */ 1.05703387816902250051e-04F, /* 0x38ddad13 */ 1.17213814519345760345e-04F, /* 0x38f5d0b0 */ 8.17377731436863541603e-05F, /* 0x38ab6aa2 */ 0.00000000000000000000e+00F, /* 0x00000000 */ 1.16847433673683553934e-04F, /* 0x38f50bfd */ 1.88827965757809579372e-04F, /* 0x3946001f */ 2.16612941585481166840e-04F, /* 0x39632298 */ 2.00857131858356297016e-04F, /* 0x39529d2d */ 1.42199307447299361229e-04F, /* 0x39151b56 */ 4.12627305195201188326e-05F, /* 0x382d1185 */ 
1.42796401632949709892e-04F, /* 0x3915bb9e */ 2.03253570361994206905e-04F, /* 0x39552077 */ 2.23214170546270906925e-04F, /* 0x396a0e99 */ 2.03244591830298304558e-04F, /* 0x39551e0e */ 1.43898156238719820976e-04F, /* 0x3916e35e */ 4.57155256299301981926e-05F, /* 0x383fbeac */ 1.53365719597786664963e-04F, /* 0x3920d0cc */ 2.23224633373320102692e-04F, /* 0x396a1168 */ 1.16566716314991936088e-05F, /* 0x37439106 */ 7.43694272387074306607e-06F, /* 0x36f98ada */ 2.11048507480882108212e-04F, /* 0x395d4ce7 */ 1.34682719362899661064e-04F, /* 0x390d399e */ 2.29425968427676707506e-05F, /* 0x37c074da */ 1.20421340398024767637e-04F, /* 0x38fc8ab7 */ 1.83421318070031702518e-04F, /* 0x394054c9 */ 2.12376224226318299770e-04F, /* 0x395eb14f */ 2.07710763788782060146e-04F, /* 0x3959ccef */ 1.69840845046564936638e-04F, /* 0x3932174e */ 9.91739216260612010956e-05F, /* 0x38cffb98 */ 2.40249748458154499531e-04F, /* 0x397beb8d */ 1.05178231024183332920e-04F, /* 0x38dc9322 */ 1.82623916771262884140e-04F, /* 0x393f7ebc */ 2.28821940254420042038e-04F, /* 0x396fefec */ 0.00000000000000000000e+00F}; /* 0x00000000 */ /* Handle special arguments first */ GET_BITS_SP32(x, ux); ax = ux & (~SIGNBIT_SP32); if(ax >= 0x7f800000) { /* x is either NaN or infinity */ if (ux & MANTBITS_SP32) /* x is NaN */ return x + x; /* Raise invalid if it is a signalling NaN */ else if (ux & SIGNBIT_SP32) return nanf_with_flags(AMD_F_INVALID); else /* x is positive infinity */ return x; } else if (ux & SIGNBIT_SP32) { /* x is negative. */ if (x == 0.0F) /* Handle negative zero first */ return x; else return nanf_with_flags(AMD_F_INVALID); } else if (ux <= 0x007fffff) { /* x is denormalised or zero */ if (ux == 0) /* x is zero */ return x; else { /* x is denormalised; scale it up */ /* Normalize x by increasing the exponent by 26 and subtracting a correction to account for the implicit bit. This replaces a slow denormalized multiplication by a fast normal subtraction. 
*/ static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */ denorm = 1; GET_BITS_SP32(x, ux); PUT_BITS_SP32(ux | 0x0d800000, x); x -= corr; GET_BITS_SP32(x, ux); } } /* Main algorithm */ /* Find y and e such that x = 2^e * y, where y in [1,4). This is done using an in-lined variant of splitFloat, which also ensures that e is even. */ y = x; ux &= EXPBITS_SP32; ux >>= EXPSHIFTBITS_SP32; if (ux & 1) { GET_BITS_SP32(y, u); u &= (SIGNBIT_SP32 | MANTBITS_SP32); u |= ONEEXPBITS_SP32; PUT_BITS_SP32(u, y); e = ux - EXPBIAS_SP32; } else { GET_BITS_SP32(y, u); u &= (SIGNBIT_SP32 | MANTBITS_SP32); u |= TWOEXPBITS_SP32; PUT_BITS_SP32(u, y); e = ux - EXPBIAS_SP32 - 1; } /* Find the index of the sub-interval of [1,4) in which y lies. */ index = (int)(32.0F*y+0.5); /* Look up the table values and compute c and r = c/t */ rtc_lead = rt_jby32_lead_table_float[index-32]; rtc_trail = rt_jby32_trail_table_float[index-32]; c = 0.03125F*index; r = (y - c)/c; /* Find q = sqrt(1+r) - 1. From one step of Newton on (q+1)^2 = 1+r */ p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F)); twop = p + p; q = p - (p*p + (twop - r))/(twop + 2.0); /* Reconstruction */ rtc = rtc_lead + rtc_trail; e >>= 1; /* e = e/2 */ z = rtc_lead + (rtc*q+rtc_trail); if (denorm) { /* Scale by 2**(e-13) */ PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); z *= r; } else { /* Scale by 2**e */ PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); z *= r; } return z; } #endif /* SQRTF_AMD_INLINE */ #ifdef USE_LOG_KERNEL_AMD static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2) { int expadjust; double r, z1, z2, correction, f, f1, f2, q, u, v, poly; int index; /* Computes natural log(x). 
Algorithm based on: Ping-Tak Peter Tang "Table-driven implementation of the logarithm function in IEEE floating-point arithmetic" ACM Transactions on Mathematical Software (TOMS) Volume 16, Issue 4 (December 1990) */

  /* Arrays ln_lead_table and ln_tail_table contain
     leading and trailing parts respectively of precomputed
     values of natural log(1+i/64), for i = 0, 1, ..., 64.
     ln_lead_table contains the first 24 bits of precision,
     and ln_tail_table contains a further 53 bits precision. */

  static const double ln_lead_table[65] = {
    0.00000000000000000000e+00, /* 0x0000000000000000 */
    1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */
    3.07716131210327148438e-02, /* 0x3f9f829800000000 */
    4.58095073699951171875e-02, /* 0x3fa7745800000000 */
    6.06245994567871093750e-02, /* 0x3faf0a3000000000 */
    7.52233862876892089844e-02, /* 0x3fb341d700000000 */
    8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */
    1.03796780109405517578e-01, /* 0x3fba926d00000000 */
    1.17783010005950927734e-01, /* 0x3fbe270700000000 */
    1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */
    1.45181953907012939453e-01, /* 0x3fc2955280000000 */
    1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */
    1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */
    1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */
    1.97825729846954345703e-01, /* 0x3fc9525a80000000 */
    2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */
    2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */
    2.35566020011901855469e-01, /* 0x3fce270700000000 */
    2.47836112976074218750e-01, /* 0x3fcfb91800000000 */
    2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */
    2.71933674812316894531e-01, /* 0x3fd1675c80000000 */
    2.83768117427825927734e-01, /* 0x3fd22941c0000000 */
    2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */
    3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */
    3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */
    3.29753279685974121094e-01, /* 0x3fd51aad80000000 */
    3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */
    3.51976394653320312500e-01, /* 0x3fd686c800000000 */
    3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */
    3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */
    3.84411692619323730469e-01, /* 0x3fd89a3380000000 */
    3.94993782043457031250e-01, /* 0x3fd9479400000000 */
    4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */
    4.15827870368957519531e-01, /* 0x3fda9cec80000000 */
    4.26084339618682861328e-01, /* 0x3fdb44f740000000 */
    4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */
    4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */
    4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */
    4.66089725494384765625e-01, /* 0x3fddd46a00000000 */
    4.75845873355865478516e-01, /* 0x3fde744240000000 */
    4.85507786273956298828e-01, /* 0x3fdf128f40000000 */
    4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */
    5.04556000232696533203e-01, /* 0x3fe02552a0000000 */
    5.13945698738098144531e-01, /* 0x3fe0723e40000000 */
    5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */
    5.32464742660522460938e-01, /* 0x3fe109f380000000 */
    5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */
    5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */
    5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */
    5.68504691123962402344e-01, /* 0x3fe23130c0000000 */
    5.77315330505371093750e-01, /* 0x3fe2795e00000000 */
    5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */
    5.94707071781158447266e-01, /* 0x3fe307d720000000 */
    6.03290796279907226562e-01, /* 0x3fe34e2880000000 */
    6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */
    6.20240390300750732422e-01, /* 0x3fe3d90260000000 */
    6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */
    6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */
    6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */
    6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */
    6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */
    6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */
    6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */
    6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */
    6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */

  static const double ln_tail_table[65] = {
    0.00000000000000000000e+00, /* 0x0000000000000000 */
    5.15092497094772879206e-09, /* 0x3e361f807c79f3db */
    4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */
    2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */
    2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */
    3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */
    3.23392843005887000414e-08, /* 0x3e615cad69737c93 */
    1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */
    2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */
    5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */
    5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */
    5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */
    5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */
    9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */
    1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */
    3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */
    3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */
    5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */
    5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */
    3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */
    4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */
    5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */
    5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */
    1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */
    1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */
    6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */
    5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */
    2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */
    5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */
    5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */
    6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */
    2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */
    4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */
    2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */
    5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */
    2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */
    7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */
    5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */
    4.43021445893361960146e-09, /* 0x3e33071282fb989b */
    3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */
    2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */
    1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */
    1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */
    5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */
    7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */
    5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */
    3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */
    3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */
    5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */
    4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */
    3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */
    2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */
    3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */
    5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */
    3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */
    1.94511067964296180547e-08, /* 0x3e54e2adf548084c */
    1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */
    3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */
    5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */
    3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */
    1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */
    4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */
    2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */
    1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */
    5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */

  /* Approximating polynomial coefficients for x near 1.0 */
  static const double
    ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */
    ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */
    ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */
    ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */

  /* Approximating polynomial coefficients for other x */
  static const double
    cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */
    cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */
    cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */

  /* Bit patterns bounding the "x close to 1.0" region; the test below
     compares them directly against the caller-supplied bit pattern ux. */
  static const unsigned long long
    log_thresh1 = 0x3fee0faa00000000,
    log_thresh2 = 0x3ff1082c00000000;

  /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
     log_thresh2 = 1.06449508666992187500    = 0x3ff1082c00000000 */
  if (ux >= log_thresh1 && ux <= log_thresh2)
    {
      /* Arguments close to 1.0 are handled separately to maintain
         accuracy.

         The approximation in this region exploits the identity
             log( 1 + r ) = log( (1 + u/2) / (1 - u/2) )
                          = log( 1 + u/2 ) - log( 1 - u/2 ),
         where u = 2r / (2+r).
         Note that the right hand side has an odd Taylor series expansion
         which converges much faster than the Taylor series expansion of
         log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by
             u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1).

         One subtlety is that since u cannot be calculated from
         r exactly, the rounding error in the first u should be
         avoided if possible. To accomplish this, we observe that
                       u = r - r*r/(2+r).
         Since x (=1+r) is the input argument, and thus presumed exact,
         the formula above approximates u accurately because
                       u = r - correction,
         and the magnitude of "correction" (of the order of r*r)
         is small.
         With these observations, we will approximate log( 1 + r ) by
            r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ).

         We approximate log(1+r) by an odd polynomial in u, where
                  u = 2r/(2+r) = r - r*r/(2+r).

         In this branch the result is returned as *r1 = r (leading part),
         *r2 = polynomial - correction (trailing part), and *xexp = 0.
      */
      r = x - 1.0;
      u = r / (2.0 + r);
      correction = r * u;
      u = u + u;
      v = u * u;
      z1 = r;
      z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction);
      *r1 = z1;
      *r2 = z2;
      *xexp = 0;
    }
  else
    {
      /*
        First, we decompose the argument x to the form
        x  =  2**M  *  (F1  +  F2),
        where  1 <= F1+F2 < 2, M has the value of an integer,
        F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.

        Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
        in U, where U  =  2 F2 / (2 F1 + F2).
        Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
        The core approximation calculates
        Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U   -   1.
        Note that  log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
        thus, Poly =  2 arctanh( U/2 ) / U  -  1.

        It is not hard to see that
          log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
        Hence, we return Z1 = log(F1), and  Z2 = log( 1 + F2/F1).
        The values of log(F1) are calculated beforehand and stored
        in the program.
      */

      f = x;
      if (ux < IMPBIT_DP64)
        {
          /* The input argument x is denormalized */
          /* Normalize f by increasing the exponent by 60
             and subtracting a correction to account for the implicit
             bit. This replaces a slow denormalized
             multiplication by a fast normal subtraction. */
          static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
          GET_BITS_DP64(f, ux);
          ux |= 0x03d0000000000000;
          PUT_BITS_DP64(ux, f);
          f -= corr;
          GET_BITS_DP64(f, ux);
          /* Remember the 2**60 scaling so it can be removed from xexp. */
          expadjust = 60;
        }
      else
        expadjust = 0;

      /* Store the exponent of x in xexp and put
         f into the range [0.5,1) */
      *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
      PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);

      /* Now x = 2**xexp * f,  1/2 <= f < 1.
         NOTE(review): xexp is the unbiased exponent with no +1 applied
         while f lies in [0.5,1), so strictly x = 2**xexp * (2*f); the
         table entries log(1+i/64) = log(2 * (64+i)/128) appear to absorb
         that factor of two - confirm against the callers. */

      /* Set index to be the nearest integer to 128*f */
      r = 128.0 * f;
      index = (int)(r + 0.5);

      /* index lies in 64..128, hence the offset of 64 into the tables */
      z1 = ln_lead_table[index-64];
      q = ln_tail_table[index-64];
      f1 = index * 0.0078125; /* 0.0078125 = 1/128 */
      f2 = f - f1;
      /* At this point, x = 2**xexp * ( f1  +  f2 ) where
         f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256.
         (The same factor-of-two remark as above applies.) */

      /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
      u = f2 / (f1 + 0.5 * f2);

      /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
The core approximation calculates poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */ v = u * u; poly = (v * (cb_1 + v * (cb_2 + v * cb_3))); z2 = q + (u + u * poly); *r1 = z1; *r2 = z2; } return; } #endif /* USE_LOG_KERNEL_AMD */ #if defined(USE_REMAINDER_PIBY2F_INLINE) /* Define this to get debugging print statements activated */ #define DEBUGGING_PRINT #undef DEBUGGING_PRINT #ifdef DEBUGGING_PRINT #include char *d2b(long long d, int bitsper, int point) { static char buff[200]; int i, j; j = bitsper; if (point >= 0 && point <= bitsper) j++; buff[j] = '\0'; for (i = bitsper - 1; i >= 0; i--) { j--; if (d % 2 == 1) buff[j] = '1'; else buff[j] = '0'; if (i == point) { j--; buff[j] = '.'; } d /= 2; } return buff; } #endif /* Given positive argument x, reduce it to the range [-pi/4,pi/4] using extra precision, and return the result in r. Return value "region" tells how many lots of pi/2 were subtracted from x to put it in the range [-pi/4,pi/4], mod 4. */ static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region) { /* This method simulates multi-precision floating-point arithmetic and is accurate for all 1 <= x < infinity */ #if 0 const int bitsper = 36; #else #define bitsper 36 #endif unsigned long long res[10]; unsigned long long u, carry, mask, mant, nextbits; int first, last, i, rexp, xexp, resexp, ltb, determ, bc; double dx; static const double piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ static unsigned long long pibits[] = { 0LL, 5215LL, 13000023176LL, 11362338026LL, 67174558139LL, 34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL, 19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL }; xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29; /* Now ux is the mantissa bit pattern of x as a long long integer */ mask = 1; mask = (mask << bitsper) - 1; /* Set first and last to the positions of the first and last chunks of 2/pi that we need */ 
first = xexp / bitsper; resexp = xexp - first * bitsper; /* 120 is the theoretical maximum number of bits (actually 115 for IEEE single precision) that we need to extract from the middle of 2/pi to compute the reduced argument accurately enough for our purposes */ last = first + 120 / bitsper; #ifdef DEBUGGING_PRINT printf("first = %d, last = %d\n", first, last); #endif /* Do a long multiplication of the bits of 2/pi by the integer mantissa */ #if 0 for (i = last; i >= first; i--) { u = pibits[i] * ux + carry; res[i - first] = u & mask; carry = u >> bitsper; } res[last - first + 1] = 0; #else /* Unroll the loop. This is only correct because we know that bitsper is fixed as 36. */ res[4] = 0; u = pibits[last] * ux; res[3] = u & mask; carry = u >> bitsper; u = pibits[last - 1] * ux + carry; res[2] = u & mask; carry = u >> bitsper; u = pibits[last - 2] * ux + carry; res[1] = u & mask; carry = u >> bitsper; u = pibits[first] * ux + carry; res[0] = u & mask; #endif #ifdef DEBUGGING_PRINT printf("resexp = %d\n", resexp); printf("Significant part of x * 2/pi with binary" " point in correct place:\n"); for (i = 0; i <= last - first; i++) { if (i > 0 && i % 5 == 0) printf("\n "); if (i == 1) printf("%s ", d2b(res[i], bitsper, resexp)); else printf("%s ", d2b(res[i], bitsper, -1)); } printf("\n"); #endif /* Reconstruct the result */ ltb = (int)((((res[0] << bitsper) | res[1]) >> (bitsper - 1 - resexp)) & 7); /* determ says whether the fractional part is >= 0.5 */ determ = ltb & 1; #ifdef DEBUGGING_PRINT printf("ltb = %d (last two bits before binary point" " and first bit after)\n", ltb); printf("determ = %d (1 means need to negate because the fractional\n" " part of x * 2/pi is greater than 0.5)\n", determ); #endif i = 1; if (determ) { /* The mantissa is >= 0.5. 
We want to subtract it from 1.0 by negating all the bits */ *region = ((ltb >> 1) + 1) & 3; mant = 1; mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1); while (mant < 0x0000000000010000) { i++; mant = (mant << bitsper) | (~(res[i]) & mask); } nextbits = (~(res[i+1]) & mask); } else { *region = (ltb >> 1); mant = 1; mant = res[1] & ((mant << (bitsper - resexp)) - 1); while (mant < 0x0000000000010000) { i++; mant = (mant << bitsper) | res[i]; } nextbits = res[i+1]; } #ifdef DEBUGGING_PRINT printf("First bits of mant = %s\n", d2b(mant, bitsper, -1)); #endif /* Normalize the mantissa. The shift value 6 here, determined by trial and error, seems to give optimal speed. */ bc = 0; while (mant < 0x0000400000000000) { bc += 6; mant <<= 6; } while (mant < 0x0010000000000000) { bc++; mant <<= 1; } mant |= nextbits >> (bitsper - bc); rexp = 52 + resexp - bc - i * bitsper; #ifdef DEBUGGING_PRINT printf("Normalised mantissa = 0x%016lx\n", mant); printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp); #endif /* Put the result exponent rexp onto the mantissa pattern */ u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64; ux = (mant & MANTBITS_DP64) | u; if (determ) /* If we negated the mantissa we negate x too */ ux |= SIGNBIT_DP64; PUT_BITS_DP64(ux, dx); #ifdef DEBUGGING_PRINT printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx)); #endif /* x is a double precision version of the fractional part of x * 2 / pi. Multiply x by pi/2 in double precision to get the reduced argument r. */ *r = dx * piby2; #ifdef DEBUGGING_PRINT printf(" r = frac(x*2/pi) * pi/2:\n"); printf(" r = %25.20e = %s\n", *r, double2hex(r)); printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n", *region); #endif } #endif /* USE_REMAINDER_PIBY2F_INLINE */ #endif /* LIBM_INLINES_AMD_H_INCLUDED */