// Round up to the next power of two
// http://bits.stephan-brumme.com

// references:
// http://graphics.stanford.edu/~seander/bithacks.html

// code:
unsigned int roundUpToNextPowerOfTwo(unsigned int x)
{
  x--;
  x |= x >> 1;  // handle  2 bit numbers
  x |= x >> 2;  // handle  4 bit numbers
  x |= x >> 4;  // handle  8 bit numbers
  x |= x >> 8;  // handle 16 bit numbers
  x |= x >> 16; // handle 32 bit numbers
  x++;

  return x;
}

// assembler:
__forceinline void roundUpToNextPowerOfTwoAsm()
{
  // compiler: VC 2008
  // in: ecx - x
  // out: eax - result
  // temp: edx
  __asm
  {
    ; x--
    dec ecx
    ; x |= x >> 1
    mov eax, ecx
    shr eax, 1
    or  ecx, eax
    ; x |= x >> 2
    mov edx, ecx
    shr edx, 2
    or  ecx, edx
    ; x |= x >> 4
    mov eax, ecx
    shr eax, 4
    or  ecx, eax
    ; x |= x >> 8
    mov edx, ecx
    shr edx, 8
    or  ecx, edx
    ; x |= x >> 16
    mov eax, ecx
    shr eax, 16
    or  eax, ecx
    ; x++
    inc eax
  }
}

// restrictions:
// - designed for 32 bits
// - x must be positive and <= 2^31, else results are undefined

// explanation:
// Each number - except for zero - has at least one bit set to 1. Let's consider only the highest bit which is set to 1:
// If it is spread to all bit position on its right side, then we get a number which is just one below the next power of two.
//
// 00...001... => 00...001111 => increment by 1: 00...010000
//
// A simple way to distribute this set bit is to create a sequence of right-shifted ORs.
// Even better, a divide-and-conquer strategy can be implemented branchless.
//
// Care must be taken when processing a power of 2 as it must map to itself.
// By default, the algorithm would give the next power of 2, however, a simple decrement by 1 (line 3) neatly solves this problem.

// validation:
#include <stdio.h>
#include <intrin.h>

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    roundUpToNextPowerOfTwoAsm(); roundUpToNextPowerOfTwoAsm();
    roundUpToNextPowerOfTwoAsm(); roundUpToNextPowerOfTwoAsm();
    roundUpToNextPowerOfTwoAsm(); roundUpToNextPowerOfTwoAsm();
    roundUpToNextPowerOfTwoAsm(); roundUpToNextPowerOfTwoAsm();
    roundUpToNextPowerOfTwoAsm(); roundUpToNextPowerOfTwoAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("%I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^31
  printf("verifying ... ");
  unsigned int maxX   = 0x80000000;
  unsigned int x      = 0;
  unsigned int errors = 0;
  unsigned int expected = 0;
  do
  {
    if (roundUpToNextPowerOfTwo(x) != expected)
    {
      printf("failed ! x=%u, expected %u but got %u.\n", x, expected, roundUpToNextPowerOfTwo(x));
      errors++;
    }

    // next power of two reached, step forward
    if (x == expected)
    {
      expected <<= 1;
      if (expected == 0)
        expected = 1;
    }

    // progress meter, the whole validation takes a few seconds on my computer
    if (x & 0x07FFFFFF == 0)
      printf(".");

    x++;
  } while (x < maxX);

  printf("%u errors.\n", errors);
  return errors;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - roundUpToNextPowerOfTwo: approx. 13.5 cycles per number
//
// + Intel Core 2:
// - roundUpToNextPowerOfTwo: approx. 15 cycles per number
//
// + Intel Core i7:
// - roundUpToNextPowerOfTwo: approx. 14 cycles per number
