// Is power of two
// http://bits.stephan-brumme.com

// references:
// http://graphics.stanford.edu/~seander/bithacks.html

// code:
bool isPowerOfTwo(unsigned int x)
{
  return ((x & (x - 1)) == 0);
}

// assembler:
#ifdef _MSC_VER
__forceinline void isPowerOfTwoVcAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  __asm
  {
    lea eax, [ecx-1]
    and eax, ecx
    neg eax
    sbb eax, eax
    inc eax
  }
}

__forceinline void isPowerOfTwoGccAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: GCC 4.4
  // in: eax - x
  // out: al - result
  // temp: edx
  __asm
  {
    lea edx, [eax-1]
    test eax, edx
    sete al
  }
}
#endif

// restrictions:
// - the algorithm returns true for x = 0 and x = 1, i.e. they are considered to be a power of two

// explanation:
// The binary representation of a power-of-two 2^y is a 1 followed only by 0s.
// In such a case, x - 1 generates a binary number where the 1s turn into 0s and the former 0s turn into 1s.
// For example, 8 = 1000b and 8 - 1 = 7 = 0111b.
//
// If x and x - 1 are binary ANDed then the result is only 0 if x is a power of two (line 3).

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    isPowerOfTwoVcAsm(); isPowerOfTwoVcAsm();
    isPowerOfTwoVcAsm(); isPowerOfTwoVcAsm();
    isPowerOfTwoVcAsm(); isPowerOfTwoVcAsm();
    isPowerOfTwoVcAsm(); isPowerOfTwoVcAsm();
    isPowerOfTwoVcAsm(); isPowerOfTwoVcAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("VC: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    isPowerOfTwoGccAsm(); isPowerOfTwoGccAsm();
    isPowerOfTwoGccAsm(); isPowerOfTwoGccAsm();
    isPowerOfTwoGccAsm(); isPowerOfTwoGccAsm();
    isPowerOfTwoGccAsm(); isPowerOfTwoGccAsm();
    isPowerOfTwoGccAsm(); isPowerOfTwoGccAsm();
  }
  elapsed = __rdtsc() - start;
  printf("GCC: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int maxX   = 0xFFFFFFFF;
  unsigned int x      = 0;
  unsigned int errors = 0;
  unsigned int expectedPowerOfTwo = 0;
  do
  {
    // round
    bool isPower = isPowerOfTwo(x);

    // verify result
    if (isPower == true  && x != expectedPowerOfTwo)
    {
      printf("failed ! x=%u is not a power of two.\n", x);
      errors++;
    }
    if (isPower == false && x == expectedPowerOfTwo)
    {
      printf("failed ! x=%u is a power of two.\n", x);
      errors++;
    }

    if (x == expectedPowerOfTwo)
    {
      expectedPowerOfTwo <<= 1;
      // shift doesn't work for changing from zero to one
      if (expectedPowerOfTwo == 0)
        expectedPowerOfTwo = 1;
    }

    // progress meter, the whole validation takes a few seconds on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x < maxX);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*4
// - constant time, data independent
//
// + Intel Pentium D:
// - isPowerOfTwo/VC: approx. 5.5 cycles per number
// - isPowerOfTwo/GCC: approx. 11 cycles per number
//
// + Intel Core 2:
// - isPowerOfTwo/VC: approx. 2.25 cycles per number
// - isPowerOfTwo/GCC: approx. 4 cycles per number
//
// + Intel Core i7:
// - isPowerOfTwo/VC: approx. 2.25 cycles per number
// - isPowerOfTwo/GCC: approx. 4 cycles per number
