// Float inverse square root approximation
// http://bits.stephan-brumme.com

// references:
// Quake3 source code
// Chris Lomont, "Fast Inverse Square Root"

// code:
float invSquareRootFastest(float x)
{
  // access float with integer logic
  unsigned int *i = (unsigned int*) &x;
  // approximation with empirically found "magic number"
  *i = 0x5F375A86 - (*i>>1);
  return x;
}

float invSquareRoot(float x)
{
  // for Newton iteration
  float xHalf = 0.5f*x;

  // same as above
  unsigned int *i = (unsigned int*) &x;
  *i = 0x5F375A86 - (*i>>1);

  // one Newton iteration, repeating further improves precision
  return x * (1.5f - xHalf*x*x);
}

// restrictions:
// - designed for positive 32 bit floats
// - approximation, average deviation of approx. 5%
//   - for example: sqrt(144) = 12, but squareRoot(144) = 12.5, approx. 4% off

// explanation:
// The inverse square root can be rewritten with an exponent: 1/sqrt(x) => x^-1/2
// When x already is expressed with an exponent, say x => a^b, then 1/sqrt(x) => 1/sqrt a^b  => a^-b/2
//
// TODO

// validation:
#include <stdio.h>
#include <time.h>
#include <math.h>

int main(int argc, char* argv[])
{
  // analyze all normalized numbers from 0 to 10^38
  printf("verifying ");
  unsigned int scan   = 1 << 23;
  unsigned int totalX = 0x7FFFFFFF - 2*scan - 2; // exclude denormalized numbers
  float        error  = 0;
  float    maxPercent = 0;

  do
  {
    // "scan" will contain all possible bit combinations of floats
    float x = *(float*) &scan;
    float root = invSquareRoot(x);

    // get relative error
    float fpuSInvquareRoot = 1/sqrtf(x);
    float relative = fpuSInvquareRoot / root;
    // express error as percentage
    float percent  = 100*fabs(relative - 1);
    // more than 10% ?
    if (percent > 10)
      printf("Error of %.4f %% for x=%f.\n", percent, x);
    error += percent;
    if (percent > maxPercent)
      maxPercent = percent;

    // progress meter, the whole validation takes about a minute on my computer
    if (scan % 0x07FFFFFF == 0)
      printf(".");

    scan++;
  } while (scan != 0x7FFFFFFF-1);

  printf(" %f %% error, max. error = %f %%.\n", error / totalX, maxPercent);

  printf("performance test ... ");
  clock_t start = clock();

  scan = 1 << 23;
  do
  {
    // "volatile" is required to prevent optimizer from completely eliminating code
    volatile float result;
    // unroll loop to reduce overhead
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
    result = invSquareRoot(*(float*)&scan); scan++;
  }
  while (scan != 0x7FFFFFFF - 2*(1 << 23) - 1);

  float seconds = (clock() - start) / float(CLOCKS_PER_SEC);
  printf("%.3f seconds, approx. %.3f million numbers per second.\n", seconds, (totalX/seconds)/1000000);

  return 0;
}

// performance:
// - constant time, data independent
//
// - 2.67 GHz Intel Pentium D805, Microsoft Visual C++ 2005, highest level of optimization:
//   approx. 950 million numbers per second
//   approx. 3 cycles per number
//
// - 2.67 GHz Intel Pentium D805, GCC 3.4.5 (MinGW), highest level of optimization:
//   approx. ??? million numbers per second
//   approx. ? cycles per number
