// Approximative inverse of a float
// http://bits.stephan-brumme.com

// references:
// unknown

// code:
float inverse(float x)
{
  // re-interpret as a 32 bit integer
  unsigned int *i = (unsigned int*)&x;

  // adjust exponent
  *i = 0x7F000000 - *i;
  //*i = 0x7EEEEEEE - *i;

  return x;
}

// assembler:
__forceinline void inverseAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VS 2008
  // in: edx - x
  // out: eax - result
  __asm
  {
    mov eax, 7F000000H
    sub eax, edx
  }
}

__forceinline void inverseFpuAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VS 2008
  // in: ST(0) - x
  // out: ST(0) - result
  __asm
  {
    fld1
    fdivrp ST(1), ST(0)
  }
}

// restrictions:
// - designed for 32 bit IEEE floats
// - approximation, average deviation of approx. 8%

// explanation:
// The standard IEEE-754 defines the binary layout of floating point numbers.
// All major CPUs (or better to say: FPUs) follow this standard.
//
// The data type "float" is a 32 bit wide floating point number consisting of a 23 bit mantissa,
// an 8 bit exponent and a sign bit. Because the exponent can be negative, too, it is implemented using
// a bias of 2^8-1 = 127. That means, 127 is added to the actual exponent to keep it always positive.
// Written in a formula: x = sign * mantissa * 2^exponent .
//
// Note: To save one bit, the most significant bit of the mantissa is omitted because it is by definition always set to one.
//
// Note, too: Binary exponents below -127 or above +127 cannot be represented by "float" (approx. +-10^38 decimal).
//
// The inverse can be written as: inverse(a) => a^-1 or more specific: inverse(a^b) => a^-b .
// If speed is more important than precision, we can approximate x^-1 by computing sign * mantissa * 2^-exponent .
// The only difference to the original formula is the minus in front of exponent.
// Because the mantissa remains unadjusted, the result will slightly deviate from the true inverse value.
//
// In the end, a simple subtraction of the exponent bits, often performed in merely one CPU cycle,
// does the trick (line 7): exponent => 127 - exponent
// The constant 0x7F000000 represents 127 shifted left by 23 bits.
//
// A brute force search revealed that a few constants expose a better overall accuracy and cut down the
// error to about 4%. So far, 0x7EEEEEEE (line 8) seems to be the best candidate.
// My brute force search didn't analyze all possible numbers, that means, there might be even better constants.
// But you should keep in mind that only for 0x7F000000 we get inverse(1) = 1, which might be important for
// many algorithms. No other constant can produce the same result for inverse(1).
//
// We have to switch from IEEE-754 to integer arithmetic (line 4), of course, and back (line 10).

// validation:
#include <stdio.h>
#include <math.h>
#include <intrin.h>

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    inverseAsm(); inverseAsm();
    inverseAsm(); inverseAsm();
    inverseAsm(); inverseAsm();
    inverseAsm(); inverseAsm();
    inverseAsm(); inverseAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("CPU: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  float dummy = 5;
  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { fld dummy } // load dummy number
    inverseFpuAsm(); inverseFpuAsm();
    inverseFpuAsm(); inverseFpuAsm();
    inverseFpuAsm(); inverseFpuAsm();
    inverseFpuAsm(); inverseFpuAsm();
    inverseFpuAsm(); inverseFpuAsm();
    __asm { fstp dummy } // clean up FPU stack
  }
  elapsed = __rdtsc() - start;
  printf("FPU %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all normalized numbers from 0 to 10^38
  printf("verifying ");
  unsigned int scan   = 1 << 23;
  unsigned int totalX = 0x7FFFFFFF - 2*scan - 2; // exclude denormalized numbers
  float        error  = 0;
  float    maxPercent = 0;

  do
  {
    // "scan" will contain all possible bit combinations of floats
    float x = *(float*) &scan;
    float inv = inverse(x);

    // get relative error
    float fpuInverse = 1.f/x;
    float relative = fpuInverse / inv;
    // express error as percentage
    float percent  = 100*fabs(relative - 1);
    // more than 10% ?
    if (percent > 15)
      printf("Error of %.4f %% for x=%f.\n", percent, x);
    error += percent;
    if (percent > maxPercent)
      maxPercent = percent;

    // progress meter, the whole validation takes about a minute on my computer
    if ((scan & 0x07FFFFFF) == 0)
      printf(".");

    scan++;
  }
  while (scan != 0xFFFFFF - 2*(1 << 23) - 1);

  printf(" %f %% error, max. error = %f %%.\n", error / totalX, maxPercent);
  return 0;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - inverse/CPU: approx. 1 cycle per number
// - inverse/FPU: approx. 42 cycles per number
//
// + Intel Core 2:
// - inverse/CPU: approx. 0.75 cycle per number
// - inverse/FPU: approx. 34 cycles per number
//
// + Intel Core i7:
// - inverse/CPU: approx. 0.75 cycle per number
// - inverse/FPU: approx. 22 cycles per number
