// Float square root approximation
// http://bits.stephan-brumme.com

// references:
// - http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
// - Intel, "Intel Software Optimization Cookbook"

// code:
// Approximate the square root of a positive, normalized 32 bit float.
// Average relative error is about 5% (see explanation below); exact for
// numbers whose exponent is even and whose mantissa is all zero.
float squareRoot(float x)
{
  // Reinterpret the float's bits as an unsigned integer.
  // A union is used instead of *(unsigned int*)&x because the pointer
  // cast violates strict aliasing (undefined behavior in C); union
  // type punning is well-defined in C11 (6.5.2.3).
  union { float f; unsigned int i; } convert;
  convert.f = x;

  // adjust bias: add 127 * 2^23; this temporarily overflows into the
  // sign bit, which the following logical shift moves back into the exponent
  convert.i  += 127 << 23;
  // approximation of square root: halve the (biased) exponent; for odd
  // exponents the shifted-out bit lands in the mantissa's top position
  convert.i >>= 1;

  return convert.f;
}

// assembler:
#ifdef _MSC_VER
__forceinline void squareRootAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // out: eax - result
  __asm
  {
    ; i += 127 << 23
    add eax, 3F800000h
    ; i >>= 1
    shr eax, 1
  }
}
#endif

// restrictions:
// - designed for positive 32 bit floats
// - approximation, average deviation of approx. 5%
//   - for example: sqrt(144) = 12, but squareRoot(144) = 12.5, approx. 4% off

// explanation:
// The standard IEEE-754 defines the binary layout of floating point numbers.
// All major CPUs (better to say: FPUs) follow this standard.
//
// The data type "float" is a 32 bit wide floating point number consisting of a 23 bit mantissa,
// an 8 bit exponent and a sign bit. Because the exponent can be negative, too, it is implemented using
// a bias of 2^7-1 = 127. That means, 127 is added to the actual exponent.
// Binary exponents below -126 or above +127 cannot be represented by normalized "float" values (approx. +-10^38 decimal).
// To save one bit, the most significant bit of the mantissa is omitted because it is always set to one.
//
// The square root can be rewritten with an exponent: sqrt(x) => x^1/2
// When x already is expressed with an exponent, say x => a^b, then sqrt(x) => sqrt a^b => a^b/2
// It seems all we have to do is performing a right shift of the exponent of one bit while taking care of the bias, too.
//
// This works pretty well when the exponent is even but fails when it is odd.
// A surprisingly good approximation is to shift both mantissa and exponent.
// For odd exponents, the second most significant bit of the mantissa will be set because it is shifted out
// of the exponent and moves into the mantissa.
//
// In summary, we have to perform three steps:
// 1. remove bias from exponent
// 2. shift everything right by one bit
// 3. add bias to exponent
//
// Since the mantissa covers 23 bits, step 1 is equivalent to subtracting 127*2^23 .
// Vice versa, step 3 comes down to adding 127*2^23 .
// This approximation can be found in the "Intel Software Optimization Cookbook" for example.
//
// Step 1 and 3 can be combined into one operation by observing that the sign bit is always zero
// (square root of negative numbers is usually not defined) and just replacing step 1 by step 3
// and skipping the former step 3. We get a temporary overflow, thus setting the sign bit, but the logical shift
// afterwards (step 2) corrects for it by moving the bit into the exponent and clearing the sign bit.
//
// The improved, admittedly non-intuitive, algorithm looks like this:
// 1. add bias to exponent (line 6)
// 2. shift everything right by one bit (line 8)
//
// We have to switch from IEEE-754 to integer arithmetic (line 3), of course, and back (line 10).

// validation:
#include <stdio.h>
#include <math.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

// Validation driver: optionally benchmarks the asm version (MSVC only),
// then compares squareRoot() against sqrtf() for every positive normalized
// float and reports the average and maximum relative error.
// (int main(void) instead of int main(int, char**): unnamed parameters are
// a C++ construct and invalid in C.)
int main(void)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    squareRootAsm(); squareRootAsm();
    squareRootAsm(); squareRootAsm();
    squareRootAsm(); squareRootAsm();
    squareRootAsm(); squareRootAsm();
    squareRootAsm(); squareRootAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("fast: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*(float)unroll));
#endif

  // Analyze all positive normalized floats: bit patterns from 0x00800000
  // (smallest normalized number) up to, but excluding, 0x7F800000 (+infinity).
  // The previous version scanned up to 0x7FFFFFFE, which includes +infinity
  // and all quiet/signaling NaN patterns and contaminated the error sums;
  // its totalX also did not match the actual iteration count.
  printf("verifying ");
  unsigned int       scan          = 1u << 23;              // 0x00800000
  const unsigned int firstInfinity = 0xFFu << 23;           // 0x7F800000
  const unsigned int totalX        = firstInfinity - (1u << 23); // number of values tested
  float        error  = 0;
  float    maxPercent = 0;

  do
  {
    // "scan" walks through all normalized float bit patterns; a union
    // reinterprets the bits without the strict-aliasing UB of *(float*)&scan
    union { unsigned int i; float f; } convert;
    convert.i = scan;
    float x    = convert.f;
    float root = squareRoot(x);

    // get relative error against the FPU's (correctly rounded) square root
    float fpuSquareRoot = sqrtf(x);
    float relative = fpuSquareRoot / root;
    // express error as percentage (fabsf: keep the computation in float)
    float percent  = 100 * fabsf(relative - 1);
    // more than 10% ?
    if (percent > 10)
      printf("Error of %.4f %% for x=%f.\n", percent, x);
    error += percent;
    if (percent > maxPercent)
      maxPercent = percent;

    // progress meter, the whole validation takes about a minute
    if ((scan & 0x07FFFFF) == 0)
      printf(".");

    scan++;
  } while (scan != firstInfinity);

  printf(" %f %% error, max. error = %f %%.\n", error / totalX, maxPercent);
  return 0;
}

// performance:
// - constant time, data independent
//
// + Intel Pentium D:
// - squareRoot: approx. 2 cycles per number
//
// + Intel Core 2:
// - squareRoot: approx. 2 cycles per number
//
// + Intel Core i7:
// - squareRoot: approx. 2 cycles per number
