// Absolute value of a float
// http://bits.stephan-brumme.com

// references:
// none

// code:
// Returns the absolute value of x by clearing the sign bit of its bit pattern.
//
// x      - any float; assumed to be a 32 bit IEEE 754 value with the same size as unsigned int
// return - x with its highest bit set to 0 (-0.0f becomes +0.0f, a NaN keeps its payload)
//
// The bits are accessed through a union instead of casting &x to int*:
// dereferencing a float object via an int pointer violates the strict aliasing rule
// and is undefined behavior, whereas reading a different union member than the one
// last written is well-defined type punning in C (C99 TC3 and later).
float myAbs(float x)
{
  // both members share the same storage
  union
  {
    float        value;
    unsigned int bits;
  } pun;

  // copy and re-interpret as 32 bit integer
  pun.value = x;
  // clear highest bit (unsigned mask avoids any signed-integer subtleties)
  pun.bits &= 0x7FFFFFFFu;

  // re-interpret as float
  return pun.value;
}

// assembler:
#ifdef _MSC_VER
// Benchmark stub for the integer-based absolute value: one AND instruction on EAX.
// The function has no C-level parameters and no return value; it operates on whatever
// EAX contains at the place where the call is expanded, so it is only meaningful for
// measuring the cost of the instruction itself, not for computing a result in C code.
// NOTE(review): only compiled when _MSC_VER is defined; the __asm block syntax is presumably
// limited to 32 bit x86 builds - confirm before compiling for other targets.
__forceinline void myAbsAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // out: eax - result
  // 7FFFFFFFh has every bit set except bit 31, so the AND clears only the highest bit
  __asm
  {
    and eax, 7FFFFFFFh
  }
}

// Benchmark stub for the FPU-based absolute value: one FABS instruction.
// The function has no C-level parameters and no return value; it works in place on the
// top of the FPU register stack, so the caller must have loaded a value there beforehand.
// NOTE(review): only compiled when _MSC_VER is defined; the __asm block syntax is presumably
// limited to 32 bit x86 builds - confirm before compiling for other targets.
__forceinline void fabsAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ST(0) - x
  // out: ST(0) - result
  // operand and result both live in ST(0); nothing is pushed or popped here
  __asm
  {
    fabs
  }
}
#endif

// restrictions:
// - designed for 32 bit IEEE floats

// explanation:
// IEEE 754 floats' highest bit is the so-called sign bit: set to 1 for negative and 0 for positive numbers.
// Zero exists with either sign (+0.0 and -0.0); clearing the bit maps both to +0.0.
// We just always set it to 0 - and we are done!
//
// The data type "float" requires 32 bits. Unfortunately, C does not allow any bit operations on floats.
// To work around this issue, these 32 bits are re-interpreted as a 32 bit integer (at the top of myAbs).
// Then clearing the sign bit becomes simple and easy (the AND with 0x7FFFFFFF); however, the required code looks a bit nasty.
//
// The built-in C function fabs() is translated into its FABS intrinsic when the data value is already on the FPU stack.
// When data is stored in main memory and will remain there after fabs, the above trick outperforms the FPU by far
// because one FPU load and one store operation can be saved.
// These FPU load/store can vastly skew the performance chart, so please be careful with interpreting the results.

// validation:
#include <stdio.h>
#include <math.h>
#include <float.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

// Benchmark driver.
//
// When built with Microsoft Visual C++ it times 100000 iterations of ten back-to-back
// calls of myAbsAsm() and of fabsAsm() with the CPU's time stamp counter and prints
// the total ticks and the average ticks per call. With any other compiler it does nothing.
//
// argc, argv - unused
// return     - always 0
int main(int argc, char** argv)
{
  // command line is not evaluated
  (void)argc;
  (void)argv;

  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;
  // number of timed calls per benchmark; kept as double because printf's %f expects a double
  // and a float cannot represent large tick counts exactly
  const double calls = (double)maxLoop * (double)unroll;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  // elapsed is unsigned, hence %I64u instead of %I64d
  printf("CPU: %I64u ticks => about %.3f ticks per call\n", elapsed, (double)elapsed / calls);

  // push a dummy operand for fabs before the timer starts, so the setup is not measured
  __asm { fld1 }
  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    fabsAsm(); fabsAsm();
    fabsAsm(); fabsAsm();
    fabsAsm(); fabsAsm();
    fabsAsm(); fabsAsm();
    fabsAsm(); fabsAsm();
  }
  elapsed = __rdtsc() - start;
  // pop the dummy operand again: the FPU register stack must be balanced before
  // any further floating point code runs and before main returns
  __asm { fstp st(0) }
  printf("FPU: %I64u ticks => about %.3f ticks per call\n", elapsed, (double)elapsed / calls);
#endif

  return 0;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - myAbs: approx. 1 cycle per number
// - fabs: approx. 3 cycles per number (if on FPU stack)
//
// + Intel Core 2:
// - myAbs: approx. 1 cycle per number
// - fabs: approx. 1 cycle per number (if on FPU stack)
//
// + Intel Core i7:
// - myAbs: approx. 1 cycle per number
// - fabs: approx. 1 cycle per number (if on FPU stack)

