// Bit mask of lowest bit not set
// http://bits.stephan-brumme.com

// references:
// http://www.catonmat.net/blog/low-level-bit-hacks-you-absolutely-must-know/

// code:
unsigned int lowestBitNotSet(unsigned int x)
{
  return ~x & (x + 1);
}

unsigned int lowestBitNotSetSimple(unsigned int x)
{
  // to be consistent with lowestBitNotSet
  if (x == ~0)
    return 0;

  // shift right until finding a zero
  unsigned int result = 1;
  while ((x & result) != 0)
    result <<= 1;

  return result;
}


// assembler:
#ifdef _MSC_VER
__forceinline void lowestBitNotSetAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  __asm
  {
    ; x + 1
    lea eax, DWORD PTR [ecx+1]
    ; ~x
    not ecx
    ; ~x & (x + 1)
    and eax, ecx
  }
}

__forceinline void lowestBitNotSetSimpleAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  __asm
  {
    ; x == ~0
    cmp ecx, -1
    jne $prepare
    xor eax, eax
    jmp $finish
  $prepare:
    ; int result = 1;
    mov eax, 1
    ; while ((x & result) != 0)
    test cl, al
    je $finish
  $loop:
    ; result <<= 1;
    add eax, eax
    test eax, ecx
    jne $loop
  $finish:
  }
}
#endif

// restrictions:
// - if all bits are set, the algorithm returns 0

// explanation:
// On this website I mention an algorithm to detect the position of the lowest bit set (see http://bits.stephan-brumme.com/lowestBitSet.html).
// Here I present an idea I saw on the internet (see references) to return a mask of the lowest bit NOT set.
// Of course, both can be easily concated to get the position of the lowest bit not set.
//
// The code works as follows:
// ~x turns all 0's into 1's (and vice versa). This gives us a nice AND-mask of all potential candidates.
// Now the second part of the formula is responsible for turning only the rightmost 0 into a 1.
//
// If the rightmost bit is 0, then x+1 turns this bit into 1 and leaves all other bits untouched.
// If there are a few 1's right of the rightmost 0, then x+1 flips all these 1's into 0's and overflows a 1 to the rightmost 0.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    lowestBitNotSetAsm(); lowestBitNotSetAsm();
    lowestBitNotSetAsm(); lowestBitNotSetAsm();
    lowestBitNotSetAsm(); lowestBitNotSetAsm();
    lowestBitNotSetAsm(); lowestBitNotSetAsm();
    lowestBitNotSetAsm(); lowestBitNotSetAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("%I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx, 0 }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 0 }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 0 }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 0 }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 0 }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 1 }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 1 }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 1 }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 1 }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 1 }; lowestBitNotSetSimpleAsm();
  }
  elapsed = __rdtsc() - start;
  printf("simple(1,2): %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx, 7FFFFFFFEh }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 7FFFFFFFEh }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 7FFFFFFFEh }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 7FFFFFFFEh }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 7FFFFFFFDh }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 7FFFFFFFDh }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 7FFFFFF1Fh }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 7FFFFF1FFh }; lowestBitNotSetSimpleAsm();
    __asm { mov ecx, 7FFFEFFFFh }; lowestBitNotSetSimpleAsm(); __asm { mov ecx, 7FFFFFFFFh }; lowestBitNotSetSimpleAsm();
  }
  elapsed = __rdtsc() - start;
  printf("simple(mix): %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int maxX   = 0xFFFFFFFF;
  unsigned int x      = 0;
  unsigned int errors = 0;
  do 
  {
    if (lowestBitNotSet(x) != lowestBitNotSetSimple(x))
    {
      printf("failed ! x=%u, expected %u but got %u.\n", x, lowestBitNotSetSimple(x), lowestBitNotSet(x));
      errors++;
    }

    // progress meter, the whole validation takes about a minute on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x < maxX);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*5
// + Intel Pentium D:
// - lowestBitNotSet: approx. 1.5 cycles per number
// - simple(1,2): approx. 6 cycles per number
// - simple(mix): approx. 13 cycles per number
//
// + Intel Core 2:
// - lowestBitNotSet: approx. 1.5 cycles per number
// - simple(1,2): approx. 4 cycles per number
// - simple(mix): approx. 10.5 cycles per number
//
// + Intel Core i7:
// - lowestBitNotSet: approx. 1.5 cycles per number
// - simple(1,2): approx. 4 cycles per number
// - simple(mix): approx. 10.5 cycles per number
