reactos/sdk/lib/crt/math/libm_sse2/libm_inlines.h

/***********************************************************************************/
/** MIT License **/
/** ----------- **/
/** **/
/** Copyright (c) 2002-2019 Advanced Micro Devices, Inc. **/
/** **/
/** Permission is hereby granted, free of charge, to any person obtaining a copy **/
/** of this Software and associated documentation files (the "Software"), to deal **/
/** in the Software without restriction, including without limitation the rights **/
/** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell **/
/** copies of the Software, and to permit persons to whom the Software is **/
/** furnished to do so, subject to the following conditions: **/
/** **/
/** The above copyright notice and this permission notice shall be included in **/
/** all copies or substantial portions of the Software. **/
/** **/
/** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR **/
/** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, **/
/** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE **/
/** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER **/
/** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, **/
/** OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN **/
/** THE SOFTWARE. **/
/***********************************************************************************/

#ifndef LIBM_INLINES_AMD_H_INCLUDED
#define LIBM_INLINES_AMD_H_INCLUDED 1

#include "libm_util.h"

/* Set defines for inline functions calling other inlines */
#if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \
    defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \
    defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \
    defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \
    defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \
    defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \
    (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF))
#undef USE_RAISE_FPSW_FLAGS
#define USE_RAISE_FPSW_FLAGS 1
#endif

#if defined(USE_SPLITDOUBLE)
/* Decomposes the double x into an integer exponent *e and a fraction *m
   with 0.5 <= abs(m) < 1.0.  The fraction keeps the sign and mantissa
   bits of x; only the exponent field is replaced.
   No check is made for x being zero, denormal, infinity or NaN; the
   caller must exclude those inputs. */
static inline void splitDouble(double x, int *e, double *m)
{
  unsigned long long bits, frac_bits;
  double frac;

  GET_BITS_DP64(x, bits);

  /* Unbiased exponent, plus one because the fraction lies in [0.5, 1) */
  *e = (int)((bits & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;

  /* Sign and mantissa of x combined with the HALFEXPBITS exponent field */
  frac_bits = (bits & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64;
  PUT_BITS_DP64(frac_bits, frac);
  *m = frac;
}
#endif /* USE_SPLITDOUBLE */


#if defined(USE_SPLITDOUBLE_2)
/* Decomposes the double x into an exponent *e and a mantissa *m with
   1.0 <= abs(m) < 4.0.  The exponent field of the mantissa is chosen
   from the parity of the biased exponent of x, so that, provided
   EXPBIAS_DP64 is odd, *e is even on exit.
   No check is made for x being zero, denormal, infinity or NaN. */
static inline void splitDouble_2(double x, int *e, double *m)
{
  unsigned long long bits, biased, kept, mant_bits;
  double mant;

  GET_BITS_DP64(x, bits);
  biased = (bits & EXPBITS_DP64) >> EXPSHIFTBITS_DP64;
  /* Sign and mantissa bits are carried over unchanged in both cases */
  kept = bits & (SIGNBIT_DP64 | MANTBITS_DP64);

  if ((biased & 1) == 0)
    {
      /* Even biased exponent: use the TWOEXPBITS exponent field */
      mant_bits = kept | TWOEXPBITS_DP64;
      PUT_BITS_DP64(mant_bits, mant);
      *m = mant;
      *e = biased - EXPBIAS_DP64 - 1;
    }
  else
    {
      /* Odd biased exponent: use the ONEEXPBITS exponent field */
      mant_bits = kept | ONEEXPBITS_DP64;
      PUT_BITS_DP64(mant_bits, mant);
      *m = mant;
      *e = biased - EXPBIAS_DP64;
    }
}
#endif /* USE_SPLITDOUBLE_2 */


#if defined(USE_SPLITFLOAT)
/* Decomposes the float x into an integer exponent *e and a fraction *m
   with 0.5 <= abs(m) < 1.0.  The fraction keeps the sign and mantissa
   bits of x; only the exponent field is replaced.
   No check is made for x being zero, denormal, infinity or NaN; the
   caller must exclude those inputs. */
static inline void splitFloat(float x, int *e, float *m)
{
  unsigned int bits, frac_bits;
  float frac;

  GET_BITS_SP32(x, bits);

  /* Unbiased exponent, plus one because the fraction lies in [0.5, 1) */
  *e = (int)((bits & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;

  /* Sign and mantissa of x combined with the HALFEXPBITS exponent field */
  frac_bits = (bits & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32;
  PUT_BITS_SP32(frac_bits, frac);
  *m = frac;
}
#endif /* USE_SPLITFLOAT */


#if defined(USE_SCALEDOUBLE_1)
/* Returns x * 2.0**n using a single multiplication.
   The caller must ensure EMIN <= n <= EMAX; this is not checked. */
static inline double scaleDouble_1(double x, int n)
{
  double pow2;

  /* Place the biased exponent in the exponent field to form 2.0**n */
  PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow2);

  return x * pow2;
}
#endif /* USE_SCALEDOUBLE_1 */


#if defined(USE_SCALEDOUBLE_2)
/* Returns x * 2.0**n, applying the scaling as two successive
   multiplications by 2.0**(n/2) and 2.0**(n - n/2).
   The caller must ensure 2*EMIN <= n <= 2*EMAX; this is not checked. */
static inline double scaleDouble_2(double x, int n)
{
  const int first = n / 2;
  const int second = n - first;
  double pow_first, pow_second;

  /* Build 2.0**first and 2.0**second directly from their bit patterns */
  PUT_BITS_DP64(((long long)first + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_first);
  PUT_BITS_DP64(((long long)second + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_second);

  return (x * pow_first) * pow_second;
}
#endif /* USE_SCALEDOUBLE_2 */


#if defined(USE_SCALEDOUBLE_3)
/* Returns x * 2.0**n, applying the scaling as three successive
   multiplications by powers of two whose exponents sum to n.
   The caller must ensure 3*EMIN <= n <= 3*EMAX; this is not checked. */
static inline double scaleDouble_3(double x, int n)
{
  const int first = n / 3;
  const int second = (n - first) / 2;
  const int third = n - first - second;
  double pow_first, pow_second, pow_third;

  /* Build the three powers of two directly from their bit patterns */
  PUT_BITS_DP64(((long long)first + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_first);
  PUT_BITS_DP64(((long long)second + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_second);
  PUT_BITS_DP64(((long long)third + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_third);

  return ((x * pow_first) * pow_second) * pow_third;
}
#endif /* USE_SCALEDOUBLE_3 */


#if defined(USE_SCALEFLOAT_1)
/* Returns x * 2.0**n using a single multiplication.
   The caller must ensure EMIN <= n <= EMAX; this is not checked. */
static inline float scaleFloat_1(float x, int n)
{
  float pow2;

  /* Place the biased exponent in the exponent field to form 2.0**n */
  PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow2);

  return x * pow2;
}
#endif /* USE_SCALEFLOAT_1 */


#if defined(USE_SCALEFLOAT_2)
/* Returns x * 2.0**n, applying the scaling as two successive
   multiplications by 2.0**(n/2) and 2.0**(n - n/2).
   The caller must ensure 2*EMIN <= n <= 2*EMAX; this is not checked. */
static inline float scaleFloat_2(float x, int n)
{
  const int first = n / 2;
  const int second = n - first;
  float pow_first, pow_second;

  /* Build 2.0**first and 2.0**second directly from their bit patterns */
  PUT_BITS_SP32((first + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_first);
  PUT_BITS_SP32((second + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_second);

  return (x * pow_first) * pow_second;
}
#endif /* USE_SCALEFLOAT_2 */


#if defined(USE_SCALEFLOAT_3)
/* Returns x * 2.0**n, applying the scaling as three successive
   multiplications by powers of two whose exponents sum to n.
   The caller must ensure 3*EMIN <= n <= 3*EMAX; this is not checked. */
static inline float scaleFloat_3(float x, int n)
{
  const int first = n / 3;
  const int second = (n - first) / 2;
  const int third = n - first - second;
  float pow_first, pow_second, pow_third;

  /* Build the three powers of two directly from their bit patterns */
  PUT_BITS_SP32((first + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_first);
  PUT_BITS_SP32((second + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_second);
  PUT_BITS_SP32((third + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_third);

  return ((x * pow_first) * pow_second) * pow_third;
}
#endif /* USE_SCALEFLOAT_3 */

#if defined(USE_SETPRECISIONDOUBLE)
/* Placeholder for switching the FPU to double precision.  It changes
   nothing and always returns 0 as the "previous" control word.
   Declared static inline, like the other helpers in this header, so
   that including the header from several translation units does not
   produce duplicate external definitions. */
static inline unsigned int setPrecisionDouble(void)
{
  /* There is no precision control on Hammer */
  return 0;
}
#endif /* USE_SETPRECISIONDOUBLE */

#if defined(USE_RESTOREPRECISION)
/* Placeholder for restoring a previously saved precision control word.
   The argument is accepted for interface compatibility and ignored.
   Declared static inline, like the other helpers in this header, so
   that including the header from several translation units does not
   produce duplicate external definitions. */
static inline void restorePrecision(unsigned int cwold)
{
  /* There is no precision control on Hammer */
  (void)cwold;
}
#endif /* USE_RESTOREPRECISION */


#if defined(USE_RAISE_FPSW_FLAGS)
/* Raises floating-point status flags. The argument should be
   the bitwise or of the AMD_F_* flags to be raised, e.g.
     raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
 */

/* ISSUE - wat - 08182010
 * These AMD_ISW_* flags are duplicated from trans.h
 * this is not clean; Mark S. did it for targeted fix of 855457
 * Eliminate all redundant flags in the next overhaul
 */

/* Status-flag bit masks, one bit per floating-point exception.
   raise_fpsw_flags() below builds its argument to _set_statfp() from
   these values.  AMD_ISW_DENORMAL is defined here but is not set by
   raise_fpsw_flags(). */
#define AMD_ISW_INVALID     0x0001
#define AMD_ISW_DENORMAL    0x0002
#define AMD_ISW_ZERODIVIDE  0x0004
#define AMD_ISW_OVERFLOW    0x0008
#define AMD_ISW_UNDERFLOW   0x0010
#define AMD_ISW_INEXACT     0x0020

/* use this function from fpctrl.c */
/* Prototype only: the definition is not part of this header.  The
   argument is a mask of the AMD_ISW_* bits above. */
void _set_statfp(uintptr_t);

/* Converts a bitwise OR of AMD_F_* flags into the corresponding
   AMD_ISW_* status bits and passes the result to _set_statfp().
   Only overflow, underflow, divide-by-zero, invalid and inexact are
   translated; any other bits in flags are ignored. */
static inline void raise_fpsw_flags(int flags)
{
    unsigned int status = 0;

    status |= (flags & AMD_F_OVERFLOW)  ? AMD_ISW_OVERFLOW   : 0;
    status |= (flags & AMD_F_UNDERFLOW) ? AMD_ISW_UNDERFLOW  : 0;
    status |= (flags & AMD_F_DIVBYZERO) ? AMD_ISW_ZERODIVIDE : 0;
    status |= (flags & AMD_F_INVALID)   ? AMD_ISW_INVALID    : 0;
    status |= (flags & AMD_F_INEXACT)   ? AMD_ISW_INEXACT    : 0;

    _set_statfp(status);
}

#endif /* USE_RAISE_FPSW_FLAGS */


#if defined(USE_GET_FPSW_INLINE)
/* Returns the current floating-point status word, as reported by the
   _mm_getcsr() intrinsic. */
static inline unsigned int get_fpsw_inline(void)
{
  const unsigned int csr = _mm_getcsr();

  return csr;
}
#endif /* USE_GET_FPSW_INLINE */

#if defined(USE_SET_FPSW_INLINE)
/* Installs sw as the floating-point status word by passing it to the
   _mm_setcsr() intrinsic. */
static inline void set_fpsw_inline(unsigned int sw)
{
  const unsigned int csr = sw;

  _mm_setcsr(csr);
}
#endif /* USE_SET_FPSW_INLINE */


#if defined(USE_VAL_WITH_FLAGS)
/* Raises the given floating-point flags, then returns val unchanged,
   e.g.  val_with_flags(x, AMD_F_INEXACT);
 */
static inline double val_with_flags(double val, int flags)
{
  /* The flags are raised before the value is handed back */
  raise_fpsw_flags(flags);

  return val;
}
#endif /* USE_VAL_WITH_FLAGS */

#if defined(USE_VALF_WITH_FLAGS)
/* Raises the given floating-point flags, then returns val unchanged,
   e.g.  valf_with_flags(x, AMD_F_INEXACT);
 */
static inline float valf_with_flags(float val, int flags)
{
  /* The flags are raised before the value is handed back */
  raise_fpsw_flags(flags);

  return val;
}
#endif /* USE_VALF_WITH_FLAGS */


#if defined(USE_ZERO_WITH_FLAGS)
/* Raises the given floating-point flags and returns a double +0.0,
   e.g.  zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
 */
static inline double zero_with_flags(int flags)
{
  const double positive_zero = 0.0;

  raise_fpsw_flags(flags);
  return positive_zero;
}
#endif /* USE_ZERO_WITH_FLAGS */


#if defined(USE_ZEROF_WITH_FLAGS)
/* Raises the given floating-point flags and returns a float +0.0,
   e.g.  zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID);
 */
static inline float zerof_with_flags(int flags)
{
  const float positive_zero = 0.0F;

  raise_fpsw_flags(flags);
  return positive_zero;
}
#endif /* USE_ZEROF_WITH_FLAGS */


#if defined(USE_NAN_WITH_FLAGS)
/* Raises the given floating-point flags and returns the double with
   bit pattern 0x7ff8000000000000 (a positive quiet NaN),
   e.g.  nan_with_flags(AMD_F_INVALID);
*/
static inline double nan_with_flags(int flags)
{
  const unsigned long long qnan_bits = 0x7ff8000000000000;
  double qnan;

  raise_fpsw_flags(flags);
  PUT_BITS_DP64(qnan_bits, qnan);
  return qnan;
}
#endif /* USE_NAN_WITH_FLAGS */

#if defined(USE_NANF_WITH_FLAGS)
/* Raises the given floating-point flags and returns the float with
   bit pattern 0x7fc00000 (a positive quiet NaN),
   e.g.  nanf_with_flags(AMD_F_INVALID);
*/
static inline float nanf_with_flags(int flags)
{
  const unsigned int qnan_bits = 0x7fc00000;
  float qnan;

  raise_fpsw_flags(flags);
  PUT_BITS_SP32(qnan_bits, qnan);
  return qnan;
}
#endif /* USE_NANF_WITH_FLAGS */


#if defined(USE_INDEFINITE_WITH_FLAGS)
/* Raises the given floating-point flags and returns the double
   "indefinite" value, i.e. the quiet NaN with the sign bit set
   (bit pattern 0xfff8000000000000),
   e.g.  indefinite_with_flags(AMD_F_INVALID);
*/
static inline double indefinite_with_flags(int flags)
{
  const unsigned long long indef_bits = 0xfff8000000000000;
  double indef;

  raise_fpsw_flags(flags);
  PUT_BITS_DP64(indef_bits, indef);
  return indef;
}
#endif /* USE_INDEFINITE_WITH_FLAGS */

#if defined(USE_INDEFINITEF_WITH_FLAGS)
/* Raises the given floating-point flags and returns the float
   "indefinite" value, i.e. the quiet NaN with the sign bit set
   (bit pattern 0xffc00000),
   e.g.  indefinitef_with_flags(AMD_F_INVALID);
*/
static inline float indefinitef_with_flags(int flags)
{
  const unsigned int indef_bits = 0xffc00000;
  float indef;

  raise_fpsw_flags(flags);
  PUT_BITS_SP32(indef_bits, indef);
  return indef;
}
#endif /* USE_INDEFINITEF_WITH_FLAGS */


#ifdef USE_INFINITY_WITH_FLAGS
/* Raises the given floating-point flags and returns positive double
   infinity, built by placing BIASEDEMAX_DP64 + 1 in the exponent field
   with zero sign and mantissa bits,
   e.g.  infinity_with_flags(AMD_F_OVERFLOW);
*/
static inline double infinity_with_flags(int flags)
{
  const unsigned long long inf_bits =
      (unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64;
  double inf;

  raise_fpsw_flags(flags);
  PUT_BITS_DP64(inf_bits, inf);
  return inf;
}
#endif /* USE_INFINITY_WITH_FLAGS */

#ifdef USE_INFINITYF_WITH_FLAGS
/* Raises the given floating-point flags and returns positive float
   infinity, built by placing BIASEDEMAX_SP32 + 1 in the exponent field
   with zero sign and mantissa bits,
   e.g.  infinityf_with_flags(AMD_F_OVERFLOW);
*/
static inline float infinityf_with_flags(int flags)
{
  const unsigned int inf_bits = (BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32;
  float inf;

  raise_fpsw_flags(flags);
  PUT_BITS_SP32(inf_bits, inf);
  return inf;
}
#endif /* USE_INFINITYF_WITH_FLAGS */

#if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)
#include <errno.h>
#endif

/* define the Microsoft specific error handling routine */
/* Prototypes only: the definitions are not part of this header.
   Both are declared unconditionally, whereas the <errno.h> include
   above is guarded by USE_HANDLE_ERROR / USE_HANDLE_ERRORF.
   NOTE(review): the parameter meanings cannot be confirmed from this
   file.  Presumably fname is the name of the reporting function,
   value is the bit pattern of the result to return, flags is a mask
   of AMD_F_* flags, error is an errno value, and nargs is the number
   of meaningful argN values; verify against the definitions. */
double _handle_error(
        char *fname,
        int opcode,
        unsigned long long value,
        int type,
        int flags,
        int error,
        double arg1,
        double arg2,
        int nargs
        );
/* Single-precision counterpart: same parameter list with float
   arguments and a float result. */
float _handle_errorf(
        char *fname,
        int opcode,
        unsigned long long value,
        int type,
        int flags,
        int error,
        float arg1,
        float arg2,
        int nargs
        );

#if defined(USE_SPLITEXP)
/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
   Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
   abs(x) > large/(ln(base)) (where large is the largest representable
   floating point number) should be handled separately instead of calling
   this function. This function is called by exp, exp2, exp10,
   cosh and sinh.

   Parameters, as used by the code below:
     x                        the argument to be split
     logbase                  factor applied to the reduced argument
                              before the polynomial is evaluated
     thirtytwo_by_logbaseof2  factor applied to x to obtain the value
                              that is rounded to the integer n
     logbaseof2_by_32_lead,
     logbaseof2_by_32_trail   leading and trailing pieces of the step
                              that is multiplied by n and removed from x
     m, z1, z2                outputs; all three are always written

   No range checking is done: the conversion of r to int assumes the
   rounded value fits in an int. */
static inline void splitexp(double x, double logbase,
                            double thirtytwo_by_logbaseof2,
                            double logbaseof2_by_32_lead,
                            double logbaseof2_by_32_trail,
                            int *m, double *z1, double *z2)
{
  double q, r, r1, r2, f1, f2;
  int n, j;

/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
   leading and trailing parts respectively of precomputed
   values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
   two_to_jby32_lead_table contains the first 25 bits of precision,
   and two_to_jby32_trail_table contains a further 53 bits precision. */

  static const double two_to_jby32_lead_table[32] = {
    1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
    1.02189713716506958008e+00,   /* 0x3ff059b0d0000000 */
    1.04427373409271240234e+00,   /* 0x3ff0b55860000000 */
    1.06714040040969848633e+00,   /* 0x3ff11301d0000000 */
    1.09050768613815307617e+00,   /* 0x3ff172b830000000 */
    1.11438673734664916992e+00,   /* 0x3ff1d48730000000 */
    1.13878858089447021484e+00,   /* 0x3ff2387a60000000 */
    1.16372483968734741211e+00,   /* 0x3ff29e9df0000000 */
    1.18920707702636718750e+00,   /* 0x3ff306fe00000000 */
    1.21524733304977416992e+00,   /* 0x3ff371a730000000 */
    1.24185776710510253906e+00,   /* 0x3ff3dea640000000 */
    1.26905095577239990234e+00,   /* 0x3ff44e0860000000 */
    1.29683953523635864258e+00,   /* 0x3ff4bfdad0000000 */
    1.32523661851882934570e+00,   /* 0x3ff5342b50000000 */
    1.35425549745559692383e+00,   /* 0x3ff5ab07d0000000 */
    1.38390988111495971680e+00,   /* 0x3ff6247eb0000000 */
    1.41421353816986083984e+00,   /* 0x3ff6a09e60000000 */
    1.44518077373504638672e+00,   /* 0x3ff71f75e0000000 */
    1.47682613134384155273e+00,   /* 0x3ff7a11470000000 */
    1.50916439294815063477e+00,   /* 0x3ff8258990000000 */
    1.54221081733703613281e+00,   /* 0x3ff8ace540000000 */
    1.57598084211349487305e+00,   /* 0x3ff93737b0000000 */
    1.61049032211303710938e+00,   /* 0x3ff9c49180000000 */
    1.64575546979904174805e+00,   /* 0x3ffa5503b0000000 */
    1.68179279565811157227e+00,   /* 0x3ffae89f90000000 */
    1.71861928701400756836e+00,   /* 0x3ffb7f76f0000000 */
    1.75625211000442504883e+00,   /* 0x3ffc199bd0000000 */
    1.79470902681350708008e+00,   /* 0x3ffcb720d0000000 */
    1.83400803804397583008e+00,   /* 0x3ffd5818d0000000 */
    1.87416762113571166992e+00,   /* 0x3ffdfc9730000000 */
    1.91520655155181884766e+00,   /* 0x3ffea4afa0000000 */
    1.95714408159255981445e+00};  /* 0x3fff507650000000 */

  static const double two_to_jby32_trail_table[32] = {
    0.00000000000000000000e+00,   /* 0x0000000000000000 */
    1.14890470981563546737e-08,   /* 0x3e48ac2ba1d73e2a */
    4.83347014379782142328e-08,   /* 0x3e69f3121ec53172 */
    2.67125131841396124714e-10,   /* 0x3df25b50a4ebbf1b */
    4.65271045830351350190e-08,   /* 0x3e68faa2f5b9bef9 */
    5.24924336638693782574e-09,   /* 0x3e368b9aa7805b80 */
    5.38622214388600821910e-08,   /* 0x3e6ceac470cd83f6 */
    1.90902301017041969782e-08,   /* 0x3e547f7b84b09745 */
    3.79763538792174980894e-08,   /* 0x3e64636e2a5bd1ab */
    2.69306947081946450986e-08,   /* 0x3e5ceaa72a9c5154 */
    4.49683815095311756138e-08,   /* 0x3e682468446b6824 */
    1.41933332021066904914e-09,   /* 0x3e18624b40c4dbd0 */
    1.94146510233556266402e-08,   /* 0x3e54d8a89c750e5e */
    2.46409119489264118569e-08,   /* 0x3e5a753e077c2a0f */
    4.94812958044698886494e-08,   /* 0x3e6a90a852b19260 */
    8.48872238075784476136e-10,   /* 0x3e0d2ac258f87d03 */
    2.42032342089579394887e-08,   /* 0x3e59fcef32422cbf */
    3.32420002333182569170e-08,   /* 0x3e61d8bee7ba46e2 */
    1.45956577586525322754e-08,   /* 0x3e4f580c36bea881 */
    3.46452721050003920866e-08,   /* 0x3e62999c25159f11 */
    8.07090469079979051284e-09,   /* 0x3e415506dadd3e2a */
    2.99439161340839520436e-09,   /* 0x3e29b8bc9e8a0388 */
    9.83621719880452147153e-09,   /* 0x3e451f8480e3e236 */
    8.35492309647188080486e-09,   /* 0x3e41f12ae45a1224 */
    3.48493175137966283582e-08,   /* 0x3e62b5a75abd0e6a */
    1.11084703472699692902e-08,   /* 0x3e47daf237553d84 */
    5.03688744342840346564e-08,   /* 0x3e6b0aa538444196 */
    4.81896001063495806249e-08,   /* 0x3e69df20d22a0798 */
    4.83653666334089557746e-08,   /* 0x3e69f7490e4bb40b */
    1.29745882314081237628e-08,   /* 0x3e4bdcdaf5cb4656 */
    9.84532844621636118964e-09,   /* 0x3e452486cc2c7b9d */
    4.25828404545651943883e-08};  /* 0x3e66dc8a80ce9f09 */

    /*
      Step 1. Reduce the argument.

      To perform argument reduction, we find the integer n such that
      x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
      n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
      remainder by x - n*logbaseof2/32. The calculation of n is
      straightforward whereas the computation of x - n*logbaseof2/32
      must be carried out carefully.
      logbaseof2/32 is so represented in two pieces that
      (1) logbaseof2/32 is known to extra precision, (2) the product
      of n and the leading piece is a model number and is hence
      calculated without error, and (3) the subtraction of the value
      obtained in (2) from x is a model number and is hence again
      obtained without error.
    */

    r = x * thirtytwo_by_logbaseof2;
    /* Set n = nearest integer to r */
    /* This is faster on Hammer */
    /* The cast truncates toward zero, so adding or subtracting 0.5
       first rounds halfway cases away from zero. */
    if (r > 0)
      n = (int)(r + 0.5);
    else
      n = (int)(r - 0.5);

    /* The remainder is kept as the pair (r1, r2): r1 takes the leading
       piece and r2 the trailing piece of n * logbaseof2/32. */
    r1 = x - n * logbaseof2_by_32_lead;
    r2 =   - n * logbaseof2_by_32_trail;

    /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
    /* j = n % 32;
       if (j < 0) j += 32; */
    /* Masking the low five bits gives the non-negative residue for
       negative n as well, assuming a two's-complement int. */
    j = n & 0x0000001f;

    f1 = two_to_jby32_lead_table[j];
    f2 = two_to_jby32_trail_table[j];

    /* n - j has its low five bits clear, so this division is exact */
    *m = (n - j) / 32;

    /* Step 2. The following is the core approximation. We approximate
       exp(r1+r2)-1 by a polynomial. */

    r1 *= logbase; r2 *= logbase;

    /* The coefficients below are close to 1/2, 1/6, 1/24, 1/120 and
       1/720.  The linear term is added as r1 and r2 separately rather
       than as the rounded sum r. */
    r = r1 + r2;
    q = r1 + (r2 +
              r*r*( 5.00000000000000008883e-01 +
                      r*( 1.66666666665260878863e-01 +
                      r*( 4.16666666662260795726e-02 +
                      r*( 8.33336798434219616221e-03 +
                      r*( 1.38889490863777199667e-03 ))))));

    /* Step 3. Function value reconstruction.
       We now reconstruct the exponential of the input argument
       so that exp(x) = 2**m * (z1 + z2).
       The order of the computation below must be strictly observed. */

    /* z1 is the leading table entry unchanged; z2 collects the trailing
       table entry plus the polynomial correction (f1 + f2) * q. */
    *z1 = f1;
    *z2 = f2 + ((f1 + f2) * q);
}
#endif /* USE_SPLITEXP */


#if defined(USE_SPLITEXPF)
/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
   Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
   abs(x) > large/(ln(base)) (where large is the largest representable
   floating point number) should be handled separately instead of calling
   this function. This function is called by exp, exp2, exp10,
   cosh and sinh.

   This is the single-precision version: all arithmetic is carried out
   in float and a shorter polynomial is used.

   Parameters, as used by the code below:
     x                        the argument to be split
     logbase                  factor applied to the reduced argument
                              before the polynomial is evaluated
     thirtytwo_by_logbaseof2  factor applied to x to obtain the value
                              that is rounded to the integer n
     logbaseof2_by_32_lead,
     logbaseof2_by_32_trail   leading and trailing pieces of the step
                              that is multiplied by n and removed from x
     m, z1, z2                outputs; all three are always written

   No range checking is done: the conversion of r to int assumes the
   rounded value fits in an int. */
static inline void splitexpf(float x, float logbase,
                             float thirtytwo_by_logbaseof2,
                             float logbaseof2_by_32_lead,
                             float logbaseof2_by_32_trail,
                             int *m, float *z1, float *z2)
{
  float q, r, r1, r2, f1, f2;
  int n, j;

/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
   leading and trailing parts respectively of precomputed
   values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
   two_to_jby32_lead_table contains the first 10 bits of precision,
   and two_to_jby32_trail_table contains a further 24 bits precision. */

  static const float two_to_jby32_lead_table[32] = {
    1.0000000000E+00F,  /* 0x3F800000 */
    1.0214843750E+00F,  /* 0x3F82C000 */
    1.0429687500E+00F,  /* 0x3F858000 */
    1.0664062500E+00F,  /* 0x3F888000 */
    1.0898437500E+00F,  /* 0x3F8B8000 */
    1.1132812500E+00F,  /* 0x3F8E8000 */
    1.1386718750E+00F,  /* 0x3F91C000 */
    1.1621093750E+00F,  /* 0x3F94C000 */
    1.1875000000E+00F,  /* 0x3F980000 */
    1.2148437500E+00F,  /* 0x3F9B8000 */
    1.2402343750E+00F,  /* 0x3F9EC000 */
    1.2675781250E+00F,  /* 0x3FA24000 */
    1.2949218750E+00F,  /* 0x3FA5C000 */
    1.3242187500E+00F,  /* 0x3FA98000 */
    1.3535156250E+00F,  /* 0x3FAD4000 */
    1.3828125000E+00F,  /* 0x3FB10000 */
    1.4140625000E+00F,  /* 0x3FB50000 */
    1.4433593750E+00F,  /* 0x3FB8C000 */
    1.4765625000E+00F,  /* 0x3FBD0000 */
    1.5078125000E+00F,  /* 0x3FC10000 */
    1.5410156250E+00F,  /* 0x3FC54000 */
    1.5742187500E+00F,  /* 0x3FC98000 */
    1.6093750000E+00F,  /* 0x3FCE0000 */
    1.6445312500E+00F,  /* 0x3FD28000 */
    1.6816406250E+00F,  /* 0x3FD74000 */
    1.7167968750E+00F,  /* 0x3FDBC000 */
    1.7558593750E+00F,  /* 0x3FE0C000 */
    1.7929687500E+00F,  /* 0x3FE58000 */
    1.8339843750E+00F,  /* 0x3FEAC000 */
    1.8730468750E+00F,  /* 0x3FEFC000 */
    1.9140625000E+00F,  /* 0x3FF50000 */
    1.9570312500E+00F}; /* 0x3FFA8000 */

  static const float two_to_jby32_trail_table[32] = {
    0.0000000000E+00F,  /* 0x00000000 */
    4.1277357377E-04F,  /* 0x39D86988 */
    1.3050324051E-03F,  /* 0x3AAB0D9F */
    7.3415064253E-04F,  /* 0x3A407404 */
    6.6398258787E-04F,  /* 0x3A2E0F1E */
    1.1054925853E-03F,  /* 0x3A90E62D */
    1.1675967835E-04F,  /* 0x38F4DCE0 */
    1.6154836630E-03F,  /* 0x3AD3BEA3 */
    1.7071149778E-03F,  /* 0x3ADFC146 */
    4.0360994171E-04F,  /* 0x39D39B9C */
    1.6234370414E-03F,  /* 0x3AD4C982 */
    1.4728321694E-03F,  /* 0x3AC10C0C */
    1.9176795613E-03F,  /* 0x3AFB5AA6 */
    1.0178930825E-03F,  /* 0x3A856AD3 */
    7.3992193211E-04F,  /* 0x3A41F752 */
    1.0973819299E-03F,  /* 0x3A8FD607 */
    1.5106226783E-04F,  /* 0x391E6678 */
    1.8214319134E-03F,  /* 0x3AEEBD1D */
    2.6364589576E-04F,  /* 0x398A39F4 */
    1.3519275235E-03F,  /* 0x3AB13329 */
    1.1952003697E-03F,  /* 0x3A9CA845 */
    1.7620950239E-03F,  /* 0x3AE6F619 */
    1.1153318919E-03F,  /* 0x3A923054 */
    1.2242280645E-03F,  /* 0x3AA07647 */
    1.5220546629E-04F,  /* 0x391F9958 */
    1.8224230735E-03F,  /* 0x3AEEDE5F */
    3.9278529584E-04F,  /* 0x39CDEEC0 */
    1.7403248930E-03F,  /* 0x3AE41B9D */
    2.3711356334E-05F,  /* 0x37C6E7C0 */
    1.1207590578E-03F,  /* 0x3A92E66F */
    1.1440613307E-03F,  /* 0x3A95F454 */
    1.1287408415E-04F}; /* 0x38ECB6D0 */

    /*
      Step 1. Reduce the argument.

      To perform argument reduction, we find the integer n such that
      x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
      n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
      remainder by x - n*logbaseof2/32. The calculation of n is
      straightforward whereas the computation of x - n*logbaseof2/32
      must be carried out carefully.
      logbaseof2/32 is so represented in two pieces that
      (1) logbaseof2/32 is known to extra precision, (2) the product
      of n and the leading piece is a model number and is hence
      calculated without error, and (3) the subtraction of the value
      obtained in (2) from x is a model number and is hence again
      obtained without error.
    */

    r = x * thirtytwo_by_logbaseof2;
    /* Set n = nearest integer to r */
    /* This is faster on Hammer */
    /* The cast truncates toward zero, so adding or subtracting 0.5
       first rounds halfway cases away from zero. */
    if (r > 0)
      n = (int)(r + 0.5F);
    else
      n = (int)(r - 0.5F);

    /* The remainder is kept as the pair (r1, r2): r1 takes the leading
       piece and r2 the trailing piece of n * logbaseof2/32. */
    r1 = x - n * logbaseof2_by_32_lead;
    r2 =   - n * logbaseof2_by_32_trail;

    /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
    /* j = n % 32;
       if (j < 0) j += 32; */
    /* Masking the low five bits gives the non-negative residue for
       negative n as well, assuming a two's-complement int. */
    j = n & 0x0000001f;

    f1 = two_to_jby32_lead_table[j];
    f2 = two_to_jby32_trail_table[j];

    /* n - j has its low five bits clear, so this division is exact */
    *m = (n - j) / 32;

    /* Step 2. The following is the core approximation. We approximate
       exp(r1+r2)-1 by a polynomial. */

    r1 *= logbase; r2 *= logbase;

    /* The coefficients below are close to 1/2 and 1/6.  The linear term
       is added as r1 and r2 separately rather than as the rounded
       sum r. */
    r = r1 + r2;
    q = r1 + (r2 +
              r*r*( 5.00000000000000008883e-01F +
                      r*( 1.66666666665260878863e-01F )));

    /* Step 3. Function value reconstruction.
       We now reconstruct the exponential of the input argument
       so that exp(x) = 2**m * (z1 + z2).
       The order of the computation below must be strictly observed. */

    /* z1 is the leading table entry unchanged; z2 collects the trailing
       table entry plus the polynomial correction (f1 + f2) * q. */
    *z1 = f1;
    *z2 = f2 + ((f1 + f2) * q);
}
#endif /* SPLITEXPF */


#if defined(USE_SCALEUPDOUBLE1024)
/* Multiplies the double whose bit pattern is ux by 2**1024 and stores
   the bit pattern of the result in *ur.  Normal and denormal inputs
   are both handled.  No check is made that the input can actually be
   scaled by that amount. */
static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur)
{
  unsigned long long out_bits;
  double val;

  if ((ux & EXPBITS_DP64) != 0)
    {
      /* Normal input: adding 2**62 to the bit pattern adds 1024 to the
         biased exponent field */
      *ur = ux + 0x4000000000000000;
      return;
    }

  /* Denormal input: install the exponent field of 4.0, which also
     introduces an implicit leading bit worth 4.0 in magnitude ... */
  PUT_BITS_DP64(ux | 0x4010000000000000, val);

  /* ... then remove that extra 4.0, taking the sign of the input into
     account */
  if (ux & SIGNBIT_DP64)
    val += 4.0;
  else
    val -= 4.0;

  GET_BITS_DP64(val, out_bits);
  *ur = out_bits;
}

#endif /* SCALEUPDOUBLE1024 */


#if defined(USE_SCALEDOWNDOUBLE)
/* Divides the double whose bit pattern is ux by 2**k and stores the
   bit pattern of the result in *ur.  When the result falls below the
   normal range the mantissa is shifted right and rounded by adding the
   last bit shifted out.  No check is made that the input can actually
   be scaled by that amount. */
static inline void scaleDownDouble(unsigned long long ux, int k,
                                   unsigned long long *ur)
{
  const unsigned long long sign = ux & SIGNBIT_DP64;
  const unsigned long long magnitude = ux & ~SIGNBIT_DP64;
  const unsigned long long fraction = magnitude & ~EXPBITS_DP64;
  const int new_exp =
      (int)((magnitude & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k;
  unsigned long long result;

  if (new_exp > 0)
    {
      /* Result is still normal: just install the reduced exponent */
      result = fraction | ((unsigned long long)new_exp << EXPSHIFTBITS_DP64);
    }
  else
    {
      const int shift = 1 - new_exp;

      if (shift > MANTLENGTH_DP64 + 1)
        {
          /* Every significant bit would be shifted out.  This is handled
             explicitly because shift counts wrap modulo 64. */
          result = 0;
        }
      else
        {
          /* Make the implicit bit explicit, shift all but one place,
             then fold the last dropped bit back in as the rounding
             increment */
          result = (fraction | 0x0010000000000000) >> (shift - 1);
          result = (result >> 1) + (result & 1);
        }
    }

  *ur = result | sign;
}

#endif /* SCALEDOWNDOUBLE */


#if defined(USE_SCALEUPFLOAT128)
/* Multiplies the float whose bit pattern is ux by 2**128 and stores
   the bit pattern of the result in *ur.  Normal and denormal inputs
   are both handled.  No check is made that the input can actually be
   scaled by that amount. */
static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur)
{
  unsigned int out_bits;
  float val;

  if ((ux & EXPBITS_SP32) != 0)
    {
      /* Normal input: adding 2**30 to the bit pattern adds 128 to the
         biased exponent field */
      *ur = ux + 0x40000000;
      return;
    }

  /* Denormal input: install the exponent field of 4.0, which also
     introduces an implicit leading bit worth 4.0 in magnitude ... */
  PUT_BITS_SP32(ux | 0x40800000, val);

  /* ... then compensate for the implicit bit just added, taking the
     sign of the input into account */
  if (ux & SIGNBIT_SP32)
    val += 4.0F;
  else
    val -= 4.0F;

  GET_BITS_SP32(val, out_bits);
  *ur = out_bits;
}
#endif /* SCALEUPFLOAT128 */


#if defined(USE_SCALEDOWNFLOAT)
/* Divides the float whose bit pattern is ux by 2**k and stores the
   bit pattern of the result in *ur.  When the result falls below the
   normal range the mantissa is shifted right and rounded by adding the
   last bit shifted out.  No check is made that the input can actually
   be scaled by that amount. */
static inline void scaleDownFloat(unsigned int ux, int k,
                                  unsigned int *ur)
{
  const unsigned int sign = ux & SIGNBIT_SP32;
  const unsigned int magnitude = ux & ~SIGNBIT_SP32;
  const unsigned int fraction = magnitude & ~EXPBITS_SP32;
  const int new_exp = ((magnitude & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k;
  unsigned int result;

  if (new_exp > 0)
    {
      /* Result is still normal: just install the reduced exponent */
      result = fraction | ((unsigned int)new_exp << EXPSHIFTBITS_SP32);
    }
  else
    {
      const int shift = 1 - new_exp;

      if (shift > MANTLENGTH_SP32 + 1)
        {
          /* Every significant bit would be shifted out.  This is handled
             explicitly because shift counts wrap modulo 32. */
          result = 0;
        }
      else
        {
          /* Make the implicit bit explicit, shift all but one place,
             then fold the last dropped bit back in as the rounding
             increment */
          result = (fraction | 0x00800000) >> (shift - 1);
          result = (result >> 1) + (result & 1);
        }
    }

  *ur = result | sign;
}
#endif /* SCALEDOWNFLOAT */


#if defined(USE_SQRT_AMD_INLINE)
static inline double sqrt_amd_inline(double x)
{
  /*
     Computes the square root of x.

     Special cases, as handled below:
       NaN                  -> x + x (raises invalid for a signalling NaN)
       -infinity, x < 0     -> nan_with_flags(AMD_F_INVALID)
       +infinity, +0, -0    -> x

     The calculation is carried out in three steps.

     Step 1. Reduction.
     The input argument is scaled to the interval [1, 4) by
     computing
               x = 2^e * y, where y in [1,4) and e is even.
     Furthermore y is decomposed as y = c + t where
               c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.

     Step 2. Approximation.
     An approximation q = sqrt(1 + (t/c)) - 1  is obtained
     from a basic series expansion, refined by one Newton step.

     Step 3. Reconstruction.
     The value of sqrt(x) is reconstructed, using precomputed values of
     sqrt(c) stored in rt_jby32_lead_table_dbl and
     rt_jby32_trail_table_dbl, via
       sqrt(x) = 2^(e/2) * sqrt(y)
               = 2^(e/2) * sqrt(c) * sqrt(y/c)
               = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
               = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
    */

  unsigned long long ux, ax, u;
  double c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
  int e, denorm = 0, index;

/* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain
   leading and trailing parts respectively of precomputed
   values of sqrt(j/32), for j = 32, 33, ..., 128.
   rt_jby32_lead_table_dbl contains the first 21 bits of precision,
   and rt_jby32_trail_table_dbl contains a further 53 bits precision. */

  static const double rt_jby32_lead_table_dbl[97] = {
    1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
    1.01550388336181640625e+00,   /* 0x3ff03f8100000000 */
    1.03077602386474609375e+00,   /* 0x3ff07e0f00000000 */
    1.04582500457763671875e+00,   /* 0x3ff0bbb300000000 */
    1.06065940856933593750e+00,   /* 0x3ff0f87600000000 */
    1.07528972625732421875e+00,   /* 0x3ff1346300000000 */
    1.08972454071044921875e+00,   /* 0x3ff16f8300000000 */
    1.10396957397460937500e+00,   /* 0x3ff1a9dc00000000 */
    1.11803340911865234375e+00,   /* 0x3ff1e37700000000 */
    1.13192272186279296875e+00,   /* 0x3ff21c5b00000000 */
    1.14564323425292968750e+00,   /* 0x3ff2548e00000000 */
    1.15920162200927734375e+00,   /* 0x3ff28c1700000000 */
    1.17260360717773437500e+00,   /* 0x3ff2c2fc00000000 */
    1.18585395812988281250e+00,   /* 0x3ff2f94200000000 */
    1.19895744323730468750e+00,   /* 0x3ff32eee00000000 */
    1.21191978454589843750e+00,   /* 0x3ff3640600000000 */
    1.22474479675292968750e+00,   /* 0x3ff3988e00000000 */
    1.23743629455566406250e+00,   /* 0x3ff3cc8a00000000 */
    1.25000000000000000000e+00,   /* 0x3ff4000000000000 */
    1.26243782043457031250e+00,   /* 0x3ff432f200000000 */
    1.27475452423095703125e+00,   /* 0x3ff4656500000000 */
    1.28695297241210937500e+00,   /* 0x3ff4975c00000000 */
    1.29903793334960937500e+00,   /* 0x3ff4c8dc00000000 */
    1.31101036071777343750e+00,   /* 0x3ff4f9e600000000 */
    1.32287502288818359375e+00,   /* 0x3ff52a7f00000000 */
    1.33463478088378906250e+00,   /* 0x3ff55aaa00000000 */
    1.34629058837890625000e+00,   /* 0x3ff58a6800000000 */
    1.35784721374511718750e+00,   /* 0x3ff5b9be00000000 */
    1.36930561065673828125e+00,   /* 0x3ff5e8ad00000000 */
    1.38066959381103515625e+00,   /* 0x3ff6173900000000 */
    1.39194107055664062500e+00,   /* 0x3ff6456400000000 */
    1.40312099456787109375e+00,   /* 0x3ff6732f00000000 */
    1.41421318054199218750e+00,   /* 0x3ff6a09e00000000 */
    1.42521858215332031250e+00,   /* 0x3ff6cdb200000000 */
    1.43614006042480468750e+00,   /* 0x3ff6fa6e00000000 */
    1.44697952270507812500e+00,   /* 0x3ff726d400000000 */
    1.45773792266845703125e+00,   /* 0x3ff752e500000000 */
    1.46841716766357421875e+00,   /* 0x3ff77ea300000000 */
    1.47901916503906250000e+00,   /* 0x3ff7aa1000000000 */
    1.48954677581787109375e+00,   /* 0x3ff7d52f00000000 */
    1.50000000000000000000e+00,   /* 0x3ff8000000000000 */
    1.51038074493408203125e+00,   /* 0x3ff82a8500000000 */
    1.52068996429443359375e+00,   /* 0x3ff854bf00000000 */
    1.53093051910400390625e+00,   /* 0x3ff87eb100000000 */
    1.54110336303710937500e+00,   /* 0x3ff8a85c00000000 */
    1.55120849609375000000e+00,   /* 0x3ff8d1c000000000 */
    1.56124877929687500000e+00,   /* 0x3ff8fae000000000 */
    1.57122516632080078125e+00,   /* 0x3ff923bd00000000 */
    1.58113861083984375000e+00,   /* 0x3ff94c5800000000 */
    1.59099006652832031250e+00,   /* 0x3ff974b200000000 */
    1.60078048706054687500e+00,   /* 0x3ff99ccc00000000 */
    1.61051177978515625000e+00,   /* 0x3ff9c4a800000000 */
    1.62018489837646484375e+00,   /* 0x3ff9ec4700000000 */
    1.62979984283447265625e+00,   /* 0x3ffa13a900000000 */
    1.63935947418212890625e+00,   /* 0x3ffa3ad100000000 */
    1.64886283874511718750e+00,   /* 0x3ffa61be00000000 */
    1.65831184387207031250e+00,   /* 0x3ffa887200000000 */
    1.66770744323730468750e+00,   /* 0x3ffaaeee00000000 */
    1.67705059051513671875e+00,   /* 0x3ffad53300000000 */
    1.68634128570556640625e+00,   /* 0x3ffafb4100000000 */
    1.69558238983154296875e+00,   /* 0x3ffb211b00000000 */
    1.70477199554443359375e+00,   /* 0x3ffb46bf00000000 */
    1.71391296386718750000e+00,   /* 0x3ffb6c3000000000 */
    1.72300529479980468750e+00,   /* 0x3ffb916e00000000 */
    1.73204994201660156250e+00,   /* 0x3ffbb67a00000000 */
    1.74104785919189453125e+00,   /* 0x3ffbdb5500000000 */
    1.75000000000000000000e+00,   /* 0x3ffc000000000000 */
    1.75890541076660156250e+00,   /* 0x3ffc247a00000000 */
    1.76776695251464843750e+00,   /* 0x3ffc48c600000000 */
    1.77658367156982421875e+00,   /* 0x3ffc6ce300000000 */
    1.78535652160644531250e+00,   /* 0x3ffc90d200000000 */
    1.79408740997314453125e+00,   /* 0x3ffcb49500000000 */
    1.80277538299560546875e+00,   /* 0x3ffcd82b00000000 */
    1.81142139434814453125e+00,   /* 0x3ffcfb9500000000 */
    1.82002735137939453125e+00,   /* 0x3ffd1ed500000000 */
    1.82859230041503906250e+00,   /* 0x3ffd41ea00000000 */
    1.83711719512939453125e+00,   /* 0x3ffd64d500000000 */
    1.84560203552246093750e+00,   /* 0x3ffd879600000000 */
    1.85404872894287109375e+00,   /* 0x3ffdaa2f00000000 */
    1.86245727539062500000e+00,   /* 0x3ffdcca000000000 */
    1.87082862854003906250e+00,   /* 0x3ffdeeea00000000 */
    1.87916183471679687500e+00,   /* 0x3ffe110c00000000 */
    1.88745784759521484375e+00,   /* 0x3ffe330700000000 */
    1.89571857452392578125e+00,   /* 0x3ffe54dd00000000 */
    1.90394306182861328125e+00,   /* 0x3ffe768d00000000 */
    1.91213226318359375000e+00,   /* 0x3ffe981800000000 */
    1.92028617858886718750e+00,   /* 0x3ffeb97e00000000 */
    1.92840576171875000000e+00,   /* 0x3ffedac000000000 */
    1.93649101257324218750e+00,   /* 0x3ffefbde00000000 */
    1.94454288482666015625e+00,   /* 0x3fff1cd900000000 */
    1.95256233215332031250e+00,   /* 0x3fff3db200000000 */
    1.96054744720458984375e+00,   /* 0x3fff5e6700000000 */
    1.96850109100341796875e+00,   /* 0x3fff7efb00000000 */
    1.97642326354980468750e+00,   /* 0x3fff9f6e00000000 */
    1.98431301116943359375e+00,   /* 0x3fffbfbf00000000 */
    1.99217128753662109375e+00,   /* 0x3fffdfef00000000 */
    2.00000000000000000000e+00};  /* 0x4000000000000000 */

  static const double rt_jby32_trail_table_dbl[97] = {
    0.00000000000000000000e+00,   /* 0x0000000000000000 */
    9.17217678638807524014e-07,   /* 0x3eaec6d70177881c */
    3.82539669043705364790e-07,   /* 0x3e99abfb41bd6b24 */
    2.85899577162227138140e-08,   /* 0x3e5eb2bf6bab55a2 */
    7.63210485349101216659e-07,   /* 0x3ea99bed9b2d8d0c */
    9.32123004127716212874e-07,   /* 0x3eaf46e029c1b296 */
    1.95174719169309219157e-07,   /* 0x3e8a3226fc42f30c */
    5.34316371481845492427e-07,   /* 0x3ea1edbe20701d73 */
    5.79631242504454563052e-07,   /* 0x3ea372fe94f82be7 */
    4.20404384109571705948e-07,   /* 0x3e9c367e08e7bb06 */
    6.89486030314147010716e-07,   /* 0x3ea722a3d0a66608 */
    6.89927685625314560328e-07,   /* 0x3ea7266f067ca1d6 */
    3.32778123013641425828e-07,   /* 0x3e965515a9b34850 */
    1.64433259436999584387e-07,   /* 0x3e8611e23ef6c1bd */
    4.37590875197899335723e-07,   /* 0x3e9d5dc1059ed8e7 */
    1.79808183816018617413e-07,   /* 0x3e88222982d0e4f4 */
    7.46386593615986477624e-08,   /* 0x3e7409212e7d0322 */
    5.72520794105201454728e-07,   /* 0x3ea335ea8a5fcf39 */
    0.00000000000000000000e+00,   /* 0x0000000000000000 */
    2.96860689431670420344e-07,   /* 0x3e93ec071e938bfe */
    3.54167239176257065345e-07,   /* 0x3e97c48bfd9862c6 */
    7.95211265664474710063e-07,   /* 0x3eaaaed010f74671 */
    1.72327048595145565621e-07,   /* 0x3e87211cbfeb62e0 */
    6.99494915996239297020e-07,   /* 0x3ea7789d9660e72d */
    6.32644111701500844315e-07,   /* 0x3ea53a5f1d36f1cf */
    6.20124838851440463844e-10,   /* 0x3e054eacff2057dc */
    6.13404719757812629969e-07,   /* 0x3ea4951b3e6a83cc */
    3.47654909777986407387e-07,   /* 0x3e9754aa76884c66 */
    7.83106177002392475763e-07,   /* 0x3eaa46d4b1de1074 */
    5.33337372440526357008e-07,   /* 0x3ea1e55548f92635 */
    2.01508648555298681765e-08,   /* 0x3e55a3070dd17788 */
    5.25472356925843939587e-07,   /* 0x3ea1a1c5eedb0801 */
    3.81831102861301692797e-07,   /* 0x3e999fcef32422cc */
    6.99220602161420018738e-07,   /* 0x3ea776425d6b0199 */
    6.01209702477462624811e-07,   /* 0x3ea42c5a1e0191a2 */
    9.01437000591944740554e-08,   /* 0x3e7832a0bdff1327 */
    5.10428680864685379950e-08,   /* 0x3e6b674743636676 */
    3.47895267104621031421e-07,   /* 0x3e9758cb90d2f714 */
    7.80735841510641848628e-07,   /* 0x3eaa3278459cde25 */
    1.35158752025506517690e-07,   /* 0x3e822404f4a103ee */
    0.00000000000000000000e+00,   /* 0x0000000000000000 */
    1.76523947728535489812e-09,   /* 0x3e1e539af6892ac5 */
    6.68280121328499932183e-07,   /* 0x3ea66c7b872c9cd0 */
    5.70135482405123276616e-07,   /* 0x3ea3216d2f43887d */
    1.37705134737562525897e-07,   /* 0x3e827b832cbedc0e */
    7.09655107074516613672e-07,   /* 0x3ea7cfe41579091d */
    7.20302724551461693011e-07,   /* 0x3ea82b5a713c490a */
    4.69926266058212796694e-07,   /* 0x3e9f8945932d872e */
    2.19244345915999437026e-07,   /* 0x3e8d6d2da9490251 */
    1.91141411617401877927e-07,   /* 0x3e89a791a3114e4a */
    5.72297665296622053774e-07,   /* 0x3ea333ffe005988d */
    5.61055484436830560103e-07,   /* 0x3ea2d36e0ed49ab1 */
    2.76225500213991506100e-07,   /* 0x3e92898498f55f9e */
    7.58466189522395692908e-07,   /* 0x3ea9732cca1032a3 */
    1.56893371256836029827e-07,   /* 0x3e850ed0b02a22d2 */
    4.06038997708867066507e-07,   /* 0x3e9b3fb265b1e40a */
    5.51305629612057435809e-07,   /* 0x3ea27fade682d1de */
    5.64778487026561123207e-07,   /* 0x3ea2f36906f707ba */
    3.92609705553556897517e-07,   /* 0x3e9a58fbbee883b6 */
    9.09698438776943827802e-07,   /* 0x3eae864005bca6d7 */
    1.05949774066016139743e-07,   /* 0x3e7c70d02300f263 */
    7.16578798392844784244e-07,   /* 0x3ea80b5d712d8e3e */
    6.86233073531233972561e-07,   /* 0x3ea706b27cc7d390 */
    7.99211473033494452908e-07,   /* 0x3eaad12c9d849a97 */
    8.65552275731027456121e-07,   /* 0x3ead0b09954e764b */
    6.75456120386058448618e-07,   /* 0x3ea6aa1fb7826cbd */
    0.00000000000000000000e+00,   /* 0x0000000000000000 */
    4.99167184520462138743e-07,   /* 0x3ea0bfd03f46763c */
    4.51720373502110930296e-10,   /* 0x3dff0abfb4adfb9e */
    1.28874162718371367439e-07,   /* 0x3e814c151f991b2e */
    5.85529267186999798656e-07,   /* 0x3ea3a5a879b09292 */
    1.01827770937125531924e-07,   /* 0x3e7b558d173f9796 */
    2.54736389177809626508e-07,   /* 0x3e9118567cd83fb8 */
    6.98925535290464831294e-07,   /* 0x3ea773b981896751 */
    1.20940735036524314513e-07,   /* 0x3e803b7df49f48a8 */
    5.43759351196479689657e-08,   /* 0x3e6d315f22491900 */
    1.11957989042397958409e-07,   /* 0x3e7e0db1c5bb84b2 */
    8.47006714134442661218e-07,   /* 0x3eac6bbb7644ff76 */
    8.92831044643427836228e-07,   /* 0x3eadf55c3afec01f */
    7.77828292464916501663e-07,   /* 0x3eaa197e81034da3 */
    6.48469316302918797451e-08,   /* 0x3e71683f4920555d */
    2.12579816658859849140e-07,   /* 0x3e8c882fd78bb0b0 */
    7.61222472580559138435e-07,   /* 0x3ea98ad9eb7b83ec */
    2.86488961857314189607e-07,   /* 0x3e9339d7c7777273 */
    2.14637363790165363515e-07,   /* 0x3e8ccee237cae6fe */
    5.44137005612605847831e-08,   /* 0x3e6d368fe324a146 */
    2.58378284856442408413e-07,   /* 0x3e9156e7b6d99b45 */
    3.15848939061134843091e-07,   /* 0x3e95323e5310b5c1 */
    6.60530466255089632309e-07,   /* 0x3ea629e9db362f5d */
    7.63436345535852301127e-07,   /* 0x3ea99dde4728d7ec */
    8.68233432860324345268e-08,   /* 0x3e774e746878544d */
    9.45465175398023087082e-07,   /* 0x3eafb97be873a87d */
    8.77499534786171267246e-07,   /* 0x3ead71a9e23c2f63 */
    2.74055432394999316135e-07,   /* 0x3e92643c89cda173 */
    4.72129009349126213532e-07,   /* 0x3e9faf1d57a4d56c */
    8.93777032327078947306e-07,   /* 0x3eadfd7c7ab7b282 */
    0.00000000000000000000e+00};  /* 0x0000000000000000 */


  /* Handle special arguments first */

  GET_BITS_DP64(x, ux);
  ax = ux & (~SIGNBIT_DP64);

  if(ax >= 0x7ff0000000000000)
    {
      /* x is either NaN or infinity */
      if (ux & MANTBITS_DP64)
        /* x is NaN */
        return x + x; /* Raise invalid if it is a signalling NaN */
      else if (ux & SIGNBIT_DP64)
        /* x is negative infinity */
        return nan_with_flags(AMD_F_INVALID);
      else
        /* x is positive infinity */
        return x;
    }
  else if (ux & SIGNBIT_DP64)
    {
      /* x is negative. */
      if (ux == SIGNBIT_DP64)
        /* Handle negative zero first */
        return x;
      else
        return nan_with_flags(AMD_F_INVALID);
    }
  else if (ux <= 0x000fffffffffffff)
    {
      /* x is denormalised or zero */
      if (ux == 0)
        /* x is zero */
        return x;
      else
        {
          /* x is denormalised; scale it up */
          /* Normalize x by increasing the exponent by 60
             and subtracting a correction to account for the implicit
             bit. This replaces a slow denormalized
             multiplication by a fast normal subtraction. */
          static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
          denorm = 1;
          /* ux still holds the bit pattern of x from above */
          PUT_BITS_DP64(ux | 0x03d0000000000000, x);
          x -= corr;
          GET_BITS_DP64(x, ux);
        }
    }

  /* Main algorithm */

  /*
     Find y and e such that x = 2^e * y, where y in [1,4).
     This is done using an in-lined variant of splitDouble,
     which also ensures that e is even.
   */
  y = x;
  ux &= EXPBITS_DP64;
  ux >>= EXPSHIFTBITS_DP64;
  /* Keep the sign and mantissa of y and replace only its exponent field */
  GET_BITS_DP64(y, u);
  u &= (SIGNBIT_DP64 | MANTBITS_DP64);
  if (ux & 1)
    {
      /* Odd biased exponent: the unbiased exponent is already even,
         so y lands in [1,2) */
      u |= ONEEXPBITS_DP64;
      e = ux - EXPBIAS_DP64;
    }
  else
    {
      /* Even biased exponent: move one factor of two into y so that
         e stays even and y lands in [2,4) */
      u |= TWOEXPBITS_DP64;
      e = ux - EXPBIAS_DP64 - 1;
    }
  PUT_BITS_DP64(u, y);


  /* Find the index of the sub-interval of [1,4) in which y lies. */

  index = (int)(32.0*y+0.5);

  /* Look up the table values and compute c and r = t/c = (y - c)/c */

  rtc_lead = rt_jby32_lead_table_dbl[index-32];
  rtc_trail = rt_jby32_trail_table_dbl[index-32];
  c = 0.03125*index;
  r = (y - c)/c;

  /*
    Find q = sqrt(1+r) - 1.
    From one step of Newton on (q+1)^2 = 1+r
  */

  p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01));
  twop = p + p;
  q = p - (p*p + (twop - r))/(twop + 2.0);

  /* Reconstruction */

  rtc = rtc_lead + rtc_trail;
  /* e is even by construction, so this division is exact. A division is
     used rather than a right shift because e may be negative. */
  e /= 2;
  z = rtc_lead + (rtc*q+rtc_trail);

  /* A denormal input was scaled up by 2**60 above; taking the square root
     halves that, so the result is scaled by 2**(e-30) instead of 2**e. */
  if (denorm)
    e -= 30;

  /* Build the power of two directly from its exponent field and apply it */
  PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
  z *= r;

  return z;

}
#endif /* SQRT_AMD_INLINE */

#if defined(USE_SQRTF_AMD_INLINE)

static inline float sqrtf_amd_inline(float x)
{
  /*
     Computes the square root of x.

     Special cases, as handled below:
       NaN                  -> x + x (raises invalid for a signalling NaN)
       -infinity, x < 0     -> nanf_with_flags(AMD_F_INVALID)
       +infinity, +0, -0    -> x

     The calculation is carried out in three steps.

     Step 1. Reduction.
     The input argument is scaled to the interval [1, 4) by
     computing
               x = 2^e * y, where y in [1,4) and e is even.
     Furthermore y is decomposed as y = c + t where
               c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.

     Step 2. Approximation.
     An approximation q = sqrt(1 + (t/c)) - 1  is obtained
     from a basic series expansion, refined by one Newton step.

     Step 3. Reconstruction.
     The value of sqrt(x) is reconstructed, using precomputed values of
     sqrt(c) stored in rt_jby32_lead_table_float and
     rt_jby32_trail_table_float, via
       sqrt(x) = 2^(e/2) * sqrt(y)
               = 2^(e/2) * sqrt(c) * sqrt(y/c)
               = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
               = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
    */

  unsigned int ux, ax, u;
  float c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
  int e, denorm = 0, index;

/* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain
   leading and trailing parts respectively of precomputed
   values of sqrt(j/32), for j = 32, 33, ..., 128.
   rt_jby32_lead_table_float contains the first 13 bits of precision,
   and rt_jby32_trail_table_float contains a further 24 bits precision. */

  static const float rt_jby32_lead_table_float[97] = {
    1.00000000000000000000e+00F,   /* 0x3f800000 */
    1.01538085937500000000e+00F,   /* 0x3f81f800 */
    1.03076171875000000000e+00F,   /* 0x3f83f000 */
    1.04565429687500000000e+00F,   /* 0x3f85d800 */
    1.06054687500000000000e+00F,   /* 0x3f87c000 */
    1.07519531250000000000e+00F,   /* 0x3f89a000 */
    1.08959960937500000000e+00F,   /* 0x3f8b7800 */
    1.10375976562500000000e+00F,   /* 0x3f8d4800 */
    1.11791992187500000000e+00F,   /* 0x3f8f1800 */
    1.13183593750000000000e+00F,   /* 0x3f90e000 */
    1.14550781250000000000e+00F,   /* 0x3f92a000 */
    1.15917968750000000000e+00F,   /* 0x3f946000 */
    1.17236328125000000000e+00F,   /* 0x3f961000 */
    1.18579101562500000000e+00F,   /* 0x3f97c800 */
    1.19873046875000000000e+00F,   /* 0x3f997000 */
    1.21191406250000000000e+00F,   /* 0x3f9b2000 */
    1.22460937500000000000e+00F,   /* 0x3f9cc000 */
    1.23730468750000000000e+00F,   /* 0x3f9e6000 */
    1.25000000000000000000e+00F,   /* 0x3fa00000 */
    1.26220703125000000000e+00F,   /* 0x3fa19000 */
    1.27465820312500000000e+00F,   /* 0x3fa32800 */
    1.28686523437500000000e+00F,   /* 0x3fa4b800 */
    1.29882812500000000000e+00F,   /* 0x3fa64000 */
    1.31079101562500000000e+00F,   /* 0x3fa7c800 */
    1.32275390625000000000e+00F,   /* 0x3fa95000 */
    1.33447265625000000000e+00F,   /* 0x3faad000 */
    1.34619140625000000000e+00F,   /* 0x3fac5000 */
    1.35766601562500000000e+00F,   /* 0x3fadc800 */
    1.36914062500000000000e+00F,   /* 0x3faf4000 */
    1.38061523437500000000e+00F,   /* 0x3fb0b800 */
    1.39184570312500000000e+00F,   /* 0x3fb22800 */
    1.40307617187500000000e+00F,   /* 0x3fb39800 */
    1.41406250000000000000e+00F,   /* 0x3fb50000 */
    1.42504882812500000000e+00F,   /* 0x3fb66800 */
    1.43603515625000000000e+00F,   /* 0x3fb7d000 */
    1.44677734375000000000e+00F,   /* 0x3fb93000 */
    1.45751953125000000000e+00F,   /* 0x3fba9000 */
    1.46826171875000000000e+00F,   /* 0x3fbbf000 */
    1.47900390625000000000e+00F,   /* 0x3fbd5000 */
    1.48950195312500000000e+00F,   /* 0x3fbea800 */
    1.50000000000000000000e+00F,   /* 0x3fc00000 */
    1.51025390625000000000e+00F,   /* 0x3fc15000 */
    1.52050781250000000000e+00F,   /* 0x3fc2a000 */
    1.53076171875000000000e+00F,   /* 0x3fc3f000 */
    1.54101562500000000000e+00F,   /* 0x3fc54000 */
    1.55102539062500000000e+00F,   /* 0x3fc68800 */
    1.56103515625000000000e+00F,   /* 0x3fc7d000 */
    1.57104492187500000000e+00F,   /* 0x3fc91800 */
    1.58105468750000000000e+00F,   /* 0x3fca6000 */
    1.59082031250000000000e+00F,   /* 0x3fcba000 */
    1.60058593750000000000e+00F,   /* 0x3fcce000 */
    1.61035156250000000000e+00F,   /* 0x3fce2000 */
    1.62011718750000000000e+00F,   /* 0x3fcf6000 */
    1.62963867187500000000e+00F,   /* 0x3fd09800 */
    1.63916015625000000000e+00F,   /* 0x3fd1d000 */
    1.64868164062500000000e+00F,   /* 0x3fd30800 */
    1.65820312500000000000e+00F,   /* 0x3fd44000 */
    1.66748046875000000000e+00F,   /* 0x3fd57000 */
    1.67700195312500000000e+00F,   /* 0x3fd6a800 */
    1.68627929687500000000e+00F,   /* 0x3fd7d800 */
    1.69555664062500000000e+00F,   /* 0x3fd90800 */
    1.70458984375000000000e+00F,   /* 0x3fda3000 */
    1.71386718750000000000e+00F,   /* 0x3fdb6000 */
    1.72290039062500000000e+00F,   /* 0x3fdc8800 */
    1.73193359375000000000e+00F,   /* 0x3fddb000 */
    1.74096679687500000000e+00F,   /* 0x3fded800 */
    1.75000000000000000000e+00F,   /* 0x3fe00000 */
    1.75878906250000000000e+00F,   /* 0x3fe12000 */
    1.76757812500000000000e+00F,   /* 0x3fe24000 */
    1.77636718750000000000e+00F,   /* 0x3fe36000 */
    1.78515625000000000000e+00F,   /* 0x3fe48000 */
    1.79394531250000000000e+00F,   /* 0x3fe5a000 */
    1.80273437500000000000e+00F,   /* 0x3fe6c000 */
    1.81127929687500000000e+00F,   /* 0x3fe7d800 */
    1.81982421875000000000e+00F,   /* 0x3fe8f000 */
    1.82836914062500000000e+00F,   /* 0x3fea0800 */
    1.83691406250000000000e+00F,   /* 0x3feb2000 */
    1.84545898437500000000e+00F,   /* 0x3fec3800 */
    1.85400390625000000000e+00F,   /* 0x3fed5000 */
    1.86230468750000000000e+00F,   /* 0x3fee6000 */
    1.87060546875000000000e+00F,   /* 0x3fef7000 */
    1.87915039062500000000e+00F,   /* 0x3ff08800 */
    1.88745117187500000000e+00F,   /* 0x3ff19800 */
    1.89550781250000000000e+00F,   /* 0x3ff2a000 */
    1.90380859375000000000e+00F,   /* 0x3ff3b000 */
    1.91210937500000000000e+00F,   /* 0x3ff4c000 */
    1.92016601562500000000e+00F,   /* 0x3ff5c800 */
    1.92822265625000000000e+00F,   /* 0x3ff6d000 */
    1.93627929687500000000e+00F,   /* 0x3ff7d800 */
    1.94433593750000000000e+00F,   /* 0x3ff8e000 */
    1.95239257812500000000e+00F,   /* 0x3ff9e800 */
    1.96044921875000000000e+00F,   /* 0x3ffaf000 */
    1.96826171875000000000e+00F,   /* 0x3ffbf000 */
    1.97631835937500000000e+00F,   /* 0x3ffcf800 */
    1.98413085937500000000e+00F,   /* 0x3ffdf800 */
    1.99194335937500000000e+00F,   /* 0x3ffef800 */
    2.00000000000000000000e+00F};  /* 0x40000000 */

  static const float rt_jby32_trail_table_float[97] = {
    0.00000000000000000000e+00F,   /* 0x00000000 */
    1.23941208585165441036e-04F,   /* 0x3901f637 */
    1.46876545841223560274e-05F,   /* 0x37766aff */
    1.70736297150142490864e-04F,   /* 0x393307ad */
    1.13296780909877270460e-04F,   /* 0x38ed99bf */
    9.53458802541717886925e-05F,   /* 0x38c7f46e */
    1.25126505736261606216e-04F,   /* 0x39033464 */
    2.10342666832730174065e-04F,   /* 0x395c8f6e */
    1.14066875539720058441e-04F,   /* 0x38ef3730 */
    8.72047676239162683487e-05F,   /* 0x38b6e1b4 */
    1.36111237225122749805e-04F,   /* 0x390eb915 */
    2.26244374061934649944e-05F,   /* 0x37bdc99c */
    2.40658700931817293167e-04F,   /* 0x397c5954 */
    6.31069415248930454254e-05F,   /* 0x38845848 */
    2.27412077947519719601e-04F,   /* 0x396e7577 */
    5.90185391047270968556e-06F,   /* 0x36c6088a */
    1.35496389702893793583e-04F,   /* 0x390e1409 */
    1.32179571664892137051e-04F,   /* 0x390a99af */
    0.00000000000000000000e+00F,   /* 0x00000000 */
    2.31086043640971183777e-04F,   /* 0x39724fb0 */
    9.66752704698592424393e-05F,   /* 0x38cabe24 */
    8.85332483449019491673e-05F,   /* 0x38b9aaed */
    2.09980673389509320259e-04F,   /* 0x395c2e42 */
    2.20044588786549866199e-04F,   /* 0x3966bbc5 */
    1.21749282698146998882e-04F,   /* 0x38ff53a6 */
    1.62125259521417319775e-04F,   /* 0x392a002b */
    9.97955357888713479042e-05F,   /* 0x38d14952 */
    1.81545779923908412457e-04F,   /* 0x393e5d53 */
    1.65768768056295812130e-04F,   /* 0x392dd237 */
    5.48927710042335093021e-05F,   /* 0x38663caa */
    9.53875860432162880898e-05F,   /* 0x38c80ad2 */
    4.53481625299900770187e-05F,   /* 0x383e3438 */
    1.51062369695864617825e-04F,   /* 0x391e667f */
    1.70453247847035527229e-04F,   /* 0x3932bbb2 */
    1.05505387182347476482e-04F,   /* 0x38dd42c6 */
    2.02269104192964732647e-04F,   /* 0x39541833 */
    2.18442466575652360916e-04F,   /* 0x39650db4 */
    1.55796806211583316326e-04F,   /* 0x39235d63 */
    1.60395247803535312414e-05F,   /* 0x37868c9e */
    4.49578510597348213196e-05F,   /* 0x383c9120 */
    0.00000000000000000000e+00F,   /* 0x00000000 */
    1.26840444863773882389e-04F,   /* 0x39050079 */
    1.82820076588541269302e-04F,   /* 0x393fb364 */
    1.69370483490638434887e-04F,   /* 0x3931990b */
    8.78757418831810355186e-05F,   /* 0x38b849ee */
    1.83815121999941766262e-04F,   /* 0x3940be7f */
    2.14343352126888930798e-04F,   /* 0x3960c15b */
    1.80714370799250900745e-04F,   /* 0x393d7e25 */
    8.41425862745381891727e-05F,   /* 0x38b075b5 */
    1.69945167726837098598e-04F,   /* 0x3932334f */
    1.95121858268976211548e-04F,   /* 0x394c99a0 */
    1.60778334247879683971e-04F,   /* 0x3928969b */
    6.79871009197086095810e-05F,   /* 0x388e944c */
    1.61929419846273958683e-04F,   /* 0x3929cb99 */
    1.99474830878898501396e-04F,   /* 0x39512a1e */
    1.81604162207804620266e-04F,   /* 0x393e6cff */
    1.09270178654696792364e-04F,   /* 0x38e527fb */
    2.27539261686615645885e-04F,   /* 0x396e979b */
    4.90300008095800876617e-05F,   /* 0x384da590 */
    6.28985289949923753738e-05F,   /* 0x3883e864 */
    2.58551553997676819563e-05F,   /* 0x37d8e386 */
    1.82868374395184218884e-04F,   /* 0x393fc05b */
    4.64625991298817098141e-05F,   /* 0x3842e0d6 */
    1.05703387816902250051e-04F,   /* 0x38ddad13 */
    1.17213814519345760345e-04F,   /* 0x38f5d0b0 */
    8.17377731436863541603e-05F,   /* 0x38ab6aa2 */
    0.00000000000000000000e+00F,   /* 0x00000000 */
    1.16847433673683553934e-04F,   /* 0x38f50bfd */
    1.88827965757809579372e-04F,   /* 0x3946001f */
    2.16612941585481166840e-04F,   /* 0x39632298 */
    2.00857131858356297016e-04F,   /* 0x39529d2d */
    1.42199307447299361229e-04F,   /* 0x39151b56 */
    4.12627305195201188326e-05F,   /* 0x382d1185 */
    1.42796401632949709892e-04F,   /* 0x3915bb9e */
    2.03253570361994206905e-04F,   /* 0x39552077 */
    2.23214170546270906925e-04F,   /* 0x396a0e99 */
    2.03244591830298304558e-04F,   /* 0x39551e0e */
    1.43898156238719820976e-04F,   /* 0x3916e35e */
    4.57155256299301981926e-05F,   /* 0x383fbeac */
    1.53365719597786664963e-04F,   /* 0x3920d0cc */
    2.23224633373320102692e-04F,   /* 0x396a1168 */
    1.16566716314991936088e-05F,   /* 0x37439106 */
    7.43694272387074306607e-06F,   /* 0x36f98ada */
    2.11048507480882108212e-04F,   /* 0x395d4ce7 */
    1.34682719362899661064e-04F,   /* 0x390d399e */
    2.29425968427676707506e-05F,   /* 0x37c074da */
    1.20421340398024767637e-04F,   /* 0x38fc8ab7 */
    1.83421318070031702518e-04F,   /* 0x394054c9 */
    2.12376224226318299770e-04F,   /* 0x395eb14f */
    2.07710763788782060146e-04F,   /* 0x3959ccef */
    1.69840845046564936638e-04F,   /* 0x3932174e */
    9.91739216260612010956e-05F,   /* 0x38cffb98 */
    2.40249748458154499531e-04F,   /* 0x397beb8d */
    1.05178231024183332920e-04F,   /* 0x38dc9322 */
    1.82623916771262884140e-04F,   /* 0x393f7ebc */
    2.28821940254420042038e-04F,   /* 0x396fefec */
    0.00000000000000000000e+00F};  /* 0x00000000 */


  /* Handle special arguments first */

  GET_BITS_SP32(x, ux);
  ax = ux & (~SIGNBIT_SP32);

  if(ax >= 0x7f800000)
    {
      /* x is either NaN or infinity */
      if (ux & MANTBITS_SP32)
        /* x is NaN */
        return x + x; /* Raise invalid if it is a signalling NaN */
      else if (ux & SIGNBIT_SP32)
        /* x is negative infinity */
        return nanf_with_flags(AMD_F_INVALID);
      else
        /* x is positive infinity */
        return x;
    }
  else if (ux & SIGNBIT_SP32)
    {
      /* x is negative. */
      if (x == 0.0F)
        /* Handle negative zero first */
        return x;
      else
        return nanf_with_flags(AMD_F_INVALID);
    }
  else if (ux <= 0x007fffff)
    {
      /* x is denormalised or zero */
      if (ux == 0)
        /* x is zero */
        return x;
      else
        {
          /* x is denormalised; scale it up */
          /* Normalize x by increasing the exponent by 26
             and subtracting a correction to account for the implicit
             bit. This replaces a slow denormalized
             multiplication by a fast normal subtraction. */
          static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */
          denorm = 1;
          /* ux still holds the bit pattern of x from above */
          PUT_BITS_SP32(ux | 0x0d800000, x);
          x -= corr;
          GET_BITS_SP32(x, ux);
        }
    }

  /* Main algorithm */

  /*
     Find y and e such that x = 2^e * y, where y in [1,4).
     This is done using an in-lined variant of splitFloat,
     which also ensures that e is even.
   */
  y = x;
  ux &= EXPBITS_SP32;
  ux >>= EXPSHIFTBITS_SP32;
  /* Keep the sign and mantissa of y and replace only its exponent field */
  GET_BITS_SP32(y, u);
  u &= (SIGNBIT_SP32 | MANTBITS_SP32);
  if (ux & 1)
    {
      /* Odd biased exponent: the unbiased exponent is already even,
         so y lands in [1,2) */
      u |= ONEEXPBITS_SP32;
      e = ux - EXPBIAS_SP32;
    }
  else
    {
      /* Even biased exponent: move one factor of two into y so that
         e stays even and y lands in [2,4) */
      u |= TWOEXPBITS_SP32;
      e = ux - EXPBIAS_SP32 - 1;
    }
  PUT_BITS_SP32(u, y);

  /* Find the index of the sub-interval of [1,4) in which y lies. */

  /* NOTE: 0.5 is a double literal, so the addition is done in double */
  index = (int)(32.0F*y+0.5);

  /* Look up the table values and compute c and r = t/c = (y - c)/c */

  rtc_lead = rt_jby32_lead_table_float[index-32];
  rtc_trail = rt_jby32_trail_table_float[index-32];
  c = 0.03125F*index;
  r = (y - c)/c;

  /*
  Find q = sqrt(1+r) - 1.
  From one step of Newton on (q+1)^2 = 1+r
  */

  p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F));
  twop = p + p;
  /* NOTE: 2.0 is a double literal, so the division and the final
     subtraction are evaluated in double before being stored in q.
     Left as is so that the rounding of the result is unchanged. */
  q = p - (p*p + (twop - r))/(twop + 2.0);

  /* Reconstruction */

  rtc = rtc_lead + rtc_trail;
  /* e is even by construction, so this division is exact. A division is
     used rather than a right shift because e may be negative. */
  e /= 2;
  z = rtc_lead + (rtc*q+rtc_trail);

  /* A denormal input was scaled up by 2**26 above; taking the square root
     halves that, so the result is scaled by 2**(e-13) instead of 2**e. */
  if (denorm)
    e -= 13;

  /* Build the power of two directly from its exponent field and apply it */
  PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
  z *= r;

  return z;

}
#endif /* SQRTF_AMD_INLINE */

#ifdef USE_LOG_KERNEL_AMD
/* Kernel for the natural logarithm, after
     Ping-Tak Peter Tang,
     "Table-driven implementation of the logarithm function in IEEE
     floating-point arithmetic",
     ACM Transactions on Mathematical Software 16(4), December 1990.

   x     - the argument.
   ux    - used here as the IEEE double bit pattern of x: it is compared
           against bit-pattern thresholds and its exponent and mantissa
           fields are extracted with the libm_util.h masks.
   xexp  - out: power-of-two exponent part of the result.
   r1    - out: leading part of the result.
   r2    - out: trailing (correction) part of the result.

   The three outputs are meant to be combined by the caller as
       log(x) = (*xexp) * log(2) + *r1 + *r2.

   NOTE(review): zero, negative, infinite and NaN arguments get no special
   treatment in this routine; presumably the callers filter them out
   beforehand - confirm against the call sites. */
static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2)
{

  int denorm_shift, j;
  double delta, half_w, fix, w, w2;
  double frac, scaled, grid, resid, tab_lead, tab_tail, series;

  /* ln_lead_table[i] and ln_tail_table[i] are the leading and trailing
     parts of precomputed natural log(1+i/64), i = 0, 1, ..., 64.
     The lead part carries the first 24 bits of precision, the tail
     part a further 53 bits. */

  static const double ln_lead_table[65] = {
    0.00000000000000000000e+00,   /* 0x0000000000000000 */
    1.55041813850402832031e-02,   /* 0x3f8fc0a800000000 */
    3.07716131210327148438e-02,   /* 0x3f9f829800000000 */
    4.58095073699951171875e-02,   /* 0x3fa7745800000000 */
    6.06245994567871093750e-02,   /* 0x3faf0a3000000000 */
    7.52233862876892089844e-02,   /* 0x3fb341d700000000 */
    8.96121263504028320312e-02,   /* 0x3fb6f0d200000000 */
    1.03796780109405517578e-01,   /* 0x3fba926d00000000 */
    1.17783010005950927734e-01,   /* 0x3fbe270700000000 */
    1.31576299667358398438e-01,   /* 0x3fc0d77e00000000 */
    1.45181953907012939453e-01,   /* 0x3fc2955280000000 */
    1.58604979515075683594e-01,   /* 0x3fc44d2b00000000 */
    1.71850204467773437500e-01,   /* 0x3fc5ff3000000000 */
    1.84922337532043457031e-01,   /* 0x3fc7ab8900000000 */
    1.97825729846954345703e-01,   /* 0x3fc9525a80000000 */
    2.10564732551574707031e-01,   /* 0x3fcaf3c900000000 */
    2.23143517971038818359e-01,   /* 0x3fcc8ff780000000 */
    2.35566020011901855469e-01,   /* 0x3fce270700000000 */
    2.47836112976074218750e-01,   /* 0x3fcfb91800000000 */
    2.59957492351531982422e-01,   /* 0x3fd0a324c0000000 */
    2.71933674812316894531e-01,   /* 0x3fd1675c80000000 */
    2.83768117427825927734e-01,   /* 0x3fd22941c0000000 */
    2.95464158058166503906e-01,   /* 0x3fd2e8e280000000 */
    3.07025015354156494141e-01,   /* 0x3fd3a64c40000000 */
    3.18453729152679443359e-01,   /* 0x3fd4618bc0000000 */
    3.29753279685974121094e-01,   /* 0x3fd51aad80000000 */
    3.40926527976989746094e-01,   /* 0x3fd5d1bd80000000 */
    3.51976394653320312500e-01,   /* 0x3fd686c800000000 */
    3.62905442714691162109e-01,   /* 0x3fd739d7c0000000 */
    3.73716354370117187500e-01,   /* 0x3fd7eaf800000000 */
    3.84411692619323730469e-01,   /* 0x3fd89a3380000000 */
    3.94993782043457031250e-01,   /* 0x3fd9479400000000 */
    4.05465066432952880859e-01,   /* 0x3fd9f323c0000000 */
    4.15827870368957519531e-01,   /* 0x3fda9cec80000000 */
    4.26084339618682861328e-01,   /* 0x3fdb44f740000000 */
    4.36236739158630371094e-01,   /* 0x3fdbeb4d80000000 */
    4.46287095546722412109e-01,   /* 0x3fdc8ff7c0000000 */
    4.56237375736236572266e-01,   /* 0x3fdd32fe40000000 */
    4.66089725494384765625e-01,   /* 0x3fddd46a00000000 */
    4.75845873355865478516e-01,   /* 0x3fde744240000000 */
    4.85507786273956298828e-01,   /* 0x3fdf128f40000000 */
    4.95077252388000488281e-01,   /* 0x3fdfaf5880000000 */
    5.04556000232696533203e-01,   /* 0x3fe02552a0000000 */
    5.13945698738098144531e-01,   /* 0x3fe0723e40000000 */
    5.23248136043548583984e-01,   /* 0x3fe0be72e0000000 */
    5.32464742660522460938e-01,   /* 0x3fe109f380000000 */
    5.41597247123718261719e-01,   /* 0x3fe154c3c0000000 */
    5.50647079944610595703e-01,   /* 0x3fe19ee6a0000000 */
    5.59615731239318847656e-01,   /* 0x3fe1e85f40000000 */
    5.68504691123962402344e-01,   /* 0x3fe23130c0000000 */
    5.77315330505371093750e-01,   /* 0x3fe2795e00000000 */
    5.86049020290374755859e-01,   /* 0x3fe2c0e9e0000000 */
    5.94707071781158447266e-01,   /* 0x3fe307d720000000 */
    6.03290796279907226562e-01,   /* 0x3fe34e2880000000 */
    6.11801505088806152344e-01,   /* 0x3fe393e0c0000000 */
    6.20240390300750732422e-01,   /* 0x3fe3d90260000000 */
    6.28608644008636474609e-01,   /* 0x3fe41d8fe0000000 */
    6.36907458305358886719e-01,   /* 0x3fe4618bc0000000 */
    6.45137906074523925781e-01,   /* 0x3fe4a4f840000000 */
    6.53301239013671875000e-01,   /* 0x3fe4e7d800000000 */
    6.61398470401763916016e-01,   /* 0x3fe52a2d20000000 */
    6.69430613517761230469e-01,   /* 0x3fe56bf9c0000000 */
    6.77398800849914550781e-01,   /* 0x3fe5ad4040000000 */
    6.85303986072540283203e-01,   /* 0x3fe5ee02a0000000 */
    6.93147122859954833984e-01};  /* 0x3fe62e42e0000000 */

  static const double ln_tail_table[65] = {
    0.00000000000000000000e+00,   /* 0x0000000000000000 */
    5.15092497094772879206e-09,   /* 0x3e361f807c79f3db */
    4.55457209735272790188e-08,   /* 0x3e6873c1980267c8 */
    2.86612990859791781788e-08,   /* 0x3e5ec65b9f88c69e */
    2.23596477332056055352e-08,   /* 0x3e58022c54cc2f99 */
    3.49498983167142274770e-08,   /* 0x3e62c37a3a125330 */
    3.23392843005887000414e-08,   /* 0x3e615cad69737c93 */
    1.35722380472479366661e-08,   /* 0x3e4d256ab1b285e9 */
    2.56504325268044191098e-08,   /* 0x3e5b8abcb97a7aa2 */
    5.81213608741512136843e-08,   /* 0x3e6f34239659a5dc */
    5.59374849578288093334e-08,   /* 0x3e6e07fd48d30177 */
    5.06615629004996189970e-08,   /* 0x3e6b32df4799f4f6 */
    5.24588857848400955725e-08,   /* 0x3e6c29e4f4f21cf8 */
    9.61968535632653505972e-10,   /* 0x3e1086c848df1b59 */
    1.34829655346594463137e-08,   /* 0x3e4cf456b4764130 */
    3.65557749306383026498e-08,   /* 0x3e63a02ffcb63398 */
    3.33431709374069198903e-08,   /* 0x3e61e6a6886b0976 */
    5.13008650536088382197e-08,   /* 0x3e6b8abcb97a7aa2 */
    5.09285070380306053751e-08,   /* 0x3e6b578f8aa35552 */
    3.20853940845502057341e-08,   /* 0x3e6139c871afb9fc */
    4.06713248643004200446e-08,   /* 0x3e65d5d30701ce64 */
    5.57028186706125221168e-08,   /* 0x3e6de7bcb2d12142 */
    5.48356693724804282546e-08,   /* 0x3e6d708e984e1664 */
    1.99407553679345001938e-08,   /* 0x3e556945e9c72f36 */
    1.96585517245087232086e-09,   /* 0x3e20e2f613e85bda */
    6.68649386072067321503e-09,   /* 0x3e3cb7e0b42724f6 */
    5.89936034642113390002e-08,   /* 0x3e6fac04e52846c7 */
    2.85038578721554472484e-08,   /* 0x3e5e9b14aec442be */
    5.09746772910284482606e-08,   /* 0x3e6b5de8034e7126 */
    5.54234668933210171467e-08,   /* 0x3e6dc157e1b259d3 */
    6.29100830926604004874e-09,   /* 0x3e3b05096ad69c62 */
    2.61974119468563937716e-08,   /* 0x3e5c2116faba4cdd */
    4.16752115011186398935e-08,   /* 0x3e665fcc25f95b47 */
    2.47747534460820790327e-08,   /* 0x3e5a9a08498d4850 */
    5.56922172017964209793e-08,   /* 0x3e6de647b1465f77 */
    2.76162876992552906035e-08,   /* 0x3e5da71b7bf7861d */
    7.08169709942321478061e-09,   /* 0x3e3e6a6886b09760 */
    5.77453510221151779025e-08,   /* 0x3e6f0075eab0ef64 */
    4.43021445893361960146e-09,   /* 0x3e33071282fb989b */
    3.15140984357495864573e-08,   /* 0x3e60eb43c3f1bed2 */
    2.95077445089736670973e-08,   /* 0x3e5faf06ecb35c84 */
    1.44098510263167149349e-08,   /* 0x3e4ef1e63db35f68 */
    1.05196987538551827693e-08,   /* 0x3e469743fb1a71a5 */
    5.23641361722697546261e-08,   /* 0x3e6c1cdf404e5796 */
    7.72099925253243069458e-09,   /* 0x3e4094aa0ada625e */
    5.62089493829364197156e-08,   /* 0x3e6e2d4c96fde3ec */
    3.53090261098577946927e-08,   /* 0x3e62f4d5e9a98f34 */
    3.80080516835568242269e-08,   /* 0x3e6467c96ecc5cbe */
    5.66961038386146408282e-08,   /* 0x3e6e7040d03dec5a */
    4.42287063097349852717e-08,   /* 0x3e67bebf4282de36 */
    3.45294525105681104660e-08,   /* 0x3e6289b11aeb783f */
    2.47132034530447431509e-08,   /* 0x3e5a891d1772f538 */
    3.59655343422487209774e-08,   /* 0x3e634f10be1fb591 */
    5.51581770357780862071e-08,   /* 0x3e6d9ce1d316eb93 */
    3.60171867511861372793e-08,   /* 0x3e63562a19a9c442 */
    1.94511067964296180547e-08,   /* 0x3e54e2adf548084c */
    1.54137376631349347838e-08,   /* 0x3e508ce55cc8c97a */
    3.93171034490174464173e-09,   /* 0x3e30e2f613e85bda */
    5.52990607758839766440e-08,   /* 0x3e6db03ebb0227bf */
    3.29990737637586136511e-08,   /* 0x3e61b75bb09cb098 */
    1.18436010922446096216e-08,   /* 0x3e496f16abb9df22 */
    4.04248680368301346709e-08,   /* 0x3e65b3f399411c62 */
    2.27418915900284316293e-08,   /* 0x3e586b3e59f65355 */
    1.70263791333409206020e-08,   /* 0x3e52482ceae1ac12 */
    5.76999904754328540596e-08};  /* 0x3e6efa39ef35793c */

  /* Polynomial coefficients used when x is near 1.0 */
  static const double
    ca_1 = 8.33333333333317923934e-02,  /* 0x3fb55555555554e6 */
    ca_2 = 1.25000000037717509602e-02,  /* 0x3f89999999bac6d4 */
    ca_3 = 2.23213998791944806202e-03,  /* 0x3f62492307f1519f */
    ca_4 = 4.34887777707614552256e-04;  /* 0x3f3c8034c85dfff0 */

  /* Polynomial coefficients used for every other x */
  static const double
    cb_1 = 8.33333333333333593622e-02,  /* 0x3fb5555555555557 */
    cb_2 = 1.24999999978138668903e-02,  /* 0x3f89999999865ede */
    cb_3 = 2.23219810758559851206e-03;  /* 0x3f6249423bd94741 */

  /* Bit patterns delimiting the "near 1.0" interval:
       log_thresh1 = 0x3fee0faa00000000 = 9.39412117004394531250e-1
       log_thresh2 = 0x3ff1082c00000000 = 1.06449508666992187500 */
  static const unsigned long long
    log_thresh1 = 0x3fee0faa00000000,
    log_thresh2 = 0x3ff1082c00000000;

  if (ux >= log_thresh1 && ux <= log_thresh2)
    {
      /* x close to 1.0 is treated on its own to keep full accuracy.

         Write x = 1 + r and let w = 2r / (2+r). Then
             log( 1 + r ) = log( 1 + w/2 )  -  log( 1 - w/2 ),
         whose Taylor series in w has odd powers only and converges
         much faster than the series of log( 1 + r ) in r, so
             log( 1 + r )  ~  w + A1*w^3 + A2*w^5 + ... + An*w^(2n+1).

         w itself cannot be formed from r without rounding, but
             w  =  r  -  r*r/(2+r)  =  r  -  fix,
         and r (= x - 1, with x presumed exact) carries no error while
         "fix" is only of the order of r*r. The result is therefore
         delivered as
             r  +  ( (A1*w^3 + ... + An*w^(2n+1))  -  fix ),
         with r as the leading part and the bracket as the trailing
         part. */
      delta = x - 1.0;
      half_w = delta / (2.0 + delta);
      fix = delta * half_w;
      w = half_w + half_w;
      w2 = w * w;
      *r1 = delta;
      *r2 = (w * w2 * (ca_1 + w2 * (ca_2 + w2 * (ca_3 + w2 * ca_4))) - fix);
      *xexp = 0;
      return;
    }

  /* General case.

     x is split into a power of two and a fraction frac, and frac into
     a grid point plus a small residual:
         frac = grid + resid,  grid = j/128,  j = 64, 65, ..., 128,
         |resid| <= 1/256.
     Table entry j-64 holds log(1 + (j-64)/64) = log(2*grid); the
     remaining factor log( 1 + resid/grid ) is approximated through
         w = 2 resid / ( 2 grid + resid ),
         log( 1 + resid/grid ) = log( 1 + w/2 ) - log( 1 - w/2 )
                               = 2 arctanh( w/2 ),
     using an even polynomial for  2 arctanh( w/2 ) / w  -  1. */

  frac = x;
  denorm_shift = 0;
  if (ux < IMPBIT_DP64)
    {
      /* x is denormalized. Normalize it by OR-ing an exponent field
         into the bit pattern and then subtracting the value of the
         implicit bit that this created; a fast normal subtraction
         replaces a slow denormalized multiplication. The net effect
         is a scaling by 2**60, undone through denorm_shift. */
      static const double implicit_one = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
      GET_BITS_DP64(frac, ux);
      ux |= 0x03d0000000000000;
      PUT_BITS_DP64(ux, frac);
      frac -= implicit_one;
      GET_BITS_DP64(frac, ux);
      denorm_shift = 60;
    }

  /* Hand back the exponent of x, then overwrite the exponent field
     so that frac lands in [0.5,1) */
  *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - denorm_shift;
  PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, frac);

  /* j = nearest integer to 128*frac */
  scaled = 128.0 * frac;
  j = (int)(scaled + 0.5);

  tab_lead = ln_lead_table[j-64];
  tab_tail = ln_tail_table[j-64];
  grid = j * 0.0078125; /* 0.0078125 = 1/128 */
  resid = frac - grid;

  /* w = 2 resid / ( 2 grid + resid ) = resid / ( grid + 0.5*resid ) */
  w = resid / (grid + 0.5 * resid);

  /* series = [log(1 + w/2) - log(1 - w/2)]/w  -  1 */
  w2 = w * w;
  series = (w2 * (cb_1 + w2 * (cb_2 + w2 * cb_3)));

  *r1 = tab_lead;
  *r2 = tab_tail + (w + w * series);
}
#endif /* USE_LOG_KERNEL_AMD */

#if defined(USE_REMAINDER_PIBY2F_INLINE)
/* Define this to get debugging print statements activated */
#define DEBUGGING_PRINT
#undef DEBUGGING_PRINT


#ifdef DEBUGGING_PRINT
#include <stdio.h>
/* Debug helper: render the low `bitsper` bits of d as a string of '0' and
   '1' characters, most significant bit first.

   d        - value whose bits are printed; it is treated as a raw bit
              pattern, so a negative d shows its unsigned conversion.
   bitsper  - number of bits to print. Clamped to what fits in the static
              buffer (198 digits).
   point    - if 0 <= point <= bitsper, a '.' is inserted so that exactly
              `point` digits precede it (point == bitsper puts it after the
              last digit). Any other value prints no point.

   Returns a pointer to a static buffer, which every call overwrites. */
char *d2b(long long d, int bitsper, int point)
{
  static char buff[200];
  unsigned long long bits = (unsigned long long)d;
  int has_point, i, j;

  /* Leave room for the digits, one optional '.', and the terminator. */
  if (bitsper < 0)
    bitsper = 0;
  if (bitsper > (int)sizeof(buff) - 2)
    bitsper = (int)sizeof(buff) - 2;

  has_point = (point >= 0 && point <= bitsper);
  j = bitsper + has_point;
  buff[j] = '\0';

  /* A point behind the last digit is never reached by the loop below
     (i only runs up to bitsper - 1), so place it here. */
  if (point == bitsper)
    {
      j--;
      buff[j] = '.';
    }

  /* Fill the buffer from the right: least significant bit first. */
  for (i = bitsper - 1; i >= 0; i--)
    {
      j--;
      buff[j] = (bits & 1) ? '1' : '0';
      if (i == point)
        {
          j--;
          buff[j] = '.';
        }
      bits >>= 1;
    }
  return buff;
}
#endif

/* Reduce a positive argument x to the range [-pi/4,pi/4] using extra
   precision.

   ux      - the argument, passed as the bit pattern of an IEEE double;
             its exponent and mantissa fields are extracted below.
   r       - out: the reduced argument.
   region  - out: how many lots of pi/2 were subtracted from x to put it
             in the range [-pi/4,pi/4], mod 4.

   The method simulates multi-precision floating-point arithmetic and is
   stated to be accurate for all 1 <= x < infinity. Arguments below 1.0,
   infinities and NaNs are not handled here.
   NOTE(review): only the top 24 bits of the 53-bit significand are used
   (see the shift by 29 below); presumably x always originates from a
   float - confirm against the callers. */
static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region)
{

  /* Width in bits of one chunk of the multi-precision numbers. Kept as
     a macro so that the shift counts are compile-time constants; it is
     removed again after the function so that it does not leak into the
     files that include this header. */
#define bitsper 36

  unsigned long long res[10];
  unsigned long long u, carry, mask, mant, nextbits;
  int first, last, i, rexp, xexp, resexp, ltb, determ, bc;
  double dx;
  static const double
    piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
  /* The bits of 2/pi, split into chunks of bitsper bits. Read-only. */
  static const unsigned long long pibits[] =
  {
    0LL,
    5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
    34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
    19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
  };

  xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
  ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;


  /* Now ux is the mantissa bit pattern of x as a long long integer
     (implicit bit included, low 29 bits dropped) */
  mask = 1;
  mask = (mask << bitsper) - 1;

  /* Set first and last to the positions of the first
     and last chunks of 2/pi that we need */
  first = xexp / bitsper;
  resexp = xexp - first * bitsper;
  /* 120 is the theoretical maximum number of bits (actually
     115 for IEEE single precision) that we need to extract
     from the middle of 2/pi to compute the reduced argument
     accurately enough for our purposes */
  last = first + 120 / bitsper;

#ifdef DEBUGGING_PRINT
  printf("first = %d, last = %d\n", first, last);
#endif

  /* Do a long multiplication of the bits of 2/pi by the
     integer mantissa. This is the unrolled form of

         carry = 0;
         for (i = last; i >= first; i--)
           {
             u = pibits[i] * ux + carry;
             res[i - first] = u & mask;
             carry = u >> bitsper;
           }
         res[last - first + 1] = 0;

     and is only correct because bitsper is fixed as 36, which makes
     last - first equal to 3. Only res[0] .. res[4] are written. */
  res[4] = 0;
  u = pibits[last] * ux;
  res[3] = u & mask;
  carry = u >> bitsper;
  u = pibits[last - 1] * ux + carry;
  res[2] = u & mask;
  carry = u >> bitsper;
  u = pibits[last - 2] * ux + carry;
  res[1] = u & mask;
  carry = u >> bitsper;
  u = pibits[first] * ux + carry;
  res[0] = u & mask;

#ifdef DEBUGGING_PRINT
  printf("resexp = %d\n", resexp);
  printf("Significant part of x * 2/pi with binary"
         " point in correct place:\n");
  for (i = 0; i <= last - first; i++)
    {
      if (i > 0 && i % 5 == 0)
        printf("\n ");
      if (i == 1)
        printf("%s ", d2b(res[i], bitsper, resexp));
      else
        printf("%s ", d2b(res[i], bitsper, -1));
    }
  printf("\n");
#endif

  /* Reconstruct the result. ltb holds the last two bits before the
     binary point of x * 2/pi and the first bit after it. */
  ltb = (int)((((res[0] << bitsper) | res[1])
               >> (bitsper - 1 - resexp)) & 7);

  /* determ says whether the fractional part is >= 0.5 */
  determ = ltb & 1;

#ifdef DEBUGGING_PRINT
  printf("ltb = %d (last two bits before binary point"
         " and first bit after)\n", ltb);
  printf("determ = %d (1 means need to negate because the fractional\n"
         "            part of x * 2/pi is greater than 0.5)\n", determ);
#endif

  /* i indexes the chunk of res[] most recently merged into mant */
  i = 1;
  if (determ)
    {
      /* The mantissa is >= 0.5. We want to subtract it
         from 1.0 by negating all the bits */
      *region = ((ltb >> 1) + 1) & 3;
      mant = 1;
      mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
      /* Pull in further chunks until enough significant bits are held */
      while (mant < 0x0000000000010000)
        {
          i++;
          mant = (mant << bitsper) | (~(res[i]) & mask);
        }
      nextbits = (~(res[i+1]) & mask);
    }
  else
    {
      *region = (ltb >> 1);
      mant = 1;
      mant = res[1] & ((mant << (bitsper - resexp)) - 1);
      /* Pull in further chunks until enough significant bits are held */
      while (mant < 0x0000000000010000)
        {
          i++;
          mant = (mant << bitsper) | res[i];
        }
      nextbits = res[i+1];
    }

#ifdef DEBUGGING_PRINT
  printf("First bits of mant = %s\n", d2b(mant, bitsper, -1));
#endif

  /* Normalize the mantissa so that bit 52 is set, counting the shift
     in bc. The shift value 6 here, determined by trial and error,
     seems to give optimal speed. */
  bc = 0;
  while (mant < 0x0000400000000000)
    {
      bc += 6;
      mant <<= 6;
    }
  while (mant < 0x0010000000000000)
    {
      bc++;
      mant <<= 1;
    }
  /* Fill the bc vacated low bits from the top of the next chunk */
  mant |= nextbits >> (bitsper - bc);

  rexp = 52 + resexp - bc - i * bitsper;

#ifdef DEBUGGING_PRINT
  /* mant is an unsigned long long, hence the ll length modifier */
  printf("Normalised mantissa = 0x%016llx\n", mant);
  printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp);
#endif

  /* Put the result exponent rexp onto the mantissa pattern */
  u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
  ux = (mant & MANTBITS_DP64) | u;
  if (determ)
    /* If we negated the mantissa we negate x too */
    ux |= SIGNBIT_DP64;
  PUT_BITS_DP64(ux, dx);

#ifdef DEBUGGING_PRINT
  printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx));
#endif

  /* dx is a double precision version of the fractional part of
     x * 2 / pi. Multiply dx by pi/2 in double precision
     to get the reduced argument r. */
  *r = dx * piby2;

#ifdef DEBUGGING_PRINT
  printf(" r = frac(x*2/pi) * pi/2:\n");
  printf(" r = %25.20e = %s\n", *r, double2hex(r));
  printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n",
         *region);
#endif
}
#undef bitsper
#endif /* USE_REMAINDER_PIBY2F_INLINE */

#endif /* LIBM_INLINES_AMD_H_INCLUDED */