// Absolute value of an integer
// http://bits.stephan-brumme.com

// references:
// multiple sources, author unknown

// code:
int myAbs(int x)
{
  const int bit31 = x >> 31;
  return (x ^ bit31) - bit31;
}

// assembler:
#ifdef _MSC_VER
__forceinline void myAbsAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // out: edx - result
  // temp: ecx
  __asm
  {
    ; bit31 = x >> 31
    mov ecx, eax
    sar ecx, 31
    ; return (x ^ bit31) - bit31
    mov edx, ecx
    xor edx, eax
    sub edx, ecx
  }
}

__forceinline void absAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  // temp: edx
  __asm
  {
    mov eax, ecx
    cdq
    xor eax, edx
    sub eax, edx
  }
}
#endif

// restrictions:
// - designed for 32 bits (simple modification for 64 bits possible)

// explanation:
// All major processors represent negative numbers using the two's-complement which is defined as:
// for x >= 0 => x
// for x < 0 => NOT(x) + 1
//
// On the lowest level, computers provide logical bit shifts and arithmetic bit shifts.
// Both shifts differ in handling how to fill the empty bits on the left side.
// Logical shifts insert zeros while arithmetic shifts replicate the formerly highest bit.
//
// Whereas signed integers are arithmetically shifted in C, unsigned integers are logically shifted.
//
// In our case x is shifted arithmetically 31 times to the right which basically erases its value
// and spreads the highest bit. That means, line 3 evaluates either to 0x00000000 (=> 0) or
// 0xFFFFFFFF (=> -1).
// Note: 32 bit systems require a shift by 31, 64 bit systems require a shift by 63 accordingly.
//
// Consequently, line 4 turns out to be (x XOR 0) - 0 for positive values of x (including x=0).
// x XOR 0 is still x and x - 0 remains x, too. So for positive x we get x >= 0 => x.
//
// We saw that for negative values of x, bit31 is set to 0xFFFFFFFF.
// Line 4 is then (x XOR 0xFFFFFFFF) - 0xFFFFFFFF. The bracketed XOR is equivalent to NOT(x) and
// the constant -0xFFFFFFFF turns out to be -(-1) = +1.
// In the end, the whole term is NOT(x) + 1, exactly what we wanted: for x < 0 => NOT(x) + 1
//
// Note: Current C++ compilers (Microsoft, GCC and Intel) implemented a special assembler code sequence
//         for abs which runs faster than the shown algorithm on x86 CPUs (see below for its source code).

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
    myAbsAsm(); myAbsAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("myAbs: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
  
  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    absAsm(); absAsm();
    absAsm(); absAsm();
    absAsm(); absAsm();
    absAsm(); absAsm();
    absAsm(); absAsm();
  }
  elapsed = __rdtsc() - start;
  volatile int a = abs(int(elapsed));
  printf("abs: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int totalX = 0xFFFFFFFF;
           int x      = 0;
  unsigned int errors = 0;
  do 
  {
    if (((x <  0) && (abs(x) != -x)) ||
        ((x >= 0) && (abs(x) != +x)))
    {
      printf("failed ! x=%u has wrong absolute value, expected %u but got %u.\n", x, x, abs(x));
      errors++;
    }

    // progress meter, the whole validation takes about a minute on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x != 0);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*
// - constant execution time because branch free
//
// + Intel Pentium D:
// - myAbs: approx. 2.25 cycles per number
// - abs: approx. 1.75 cycles per number
//
// + Intel Core 2:
// - myAbs: approx. 2 cycles per number
// - abs: approx. 1.5 cycles per number
//
// + Intel Core i7:
// - myAbs: approx. 1.75 cycles per number
// - abs: approx. 2 cycles per number
