// Position of lowest bit set
// http://bits.stephan-brumme.com

// references:
// http://graphics.stanford.edu/~seander/bithacks.html

// code:
const unsigned int MultiplyDeBruijnBitPosition[32] =
{
  // precomputed lookup table
   0,  1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
  31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
};

unsigned int lowestBitSet(unsigned int x)
{
  // only lowest bit will remain 1, all others become 0
  x  &= -int(x);
  // DeBruijn constant
  x  *= 0x077CB531;
  // the upper 5 bits are unique, skip the rest (32 - 5 = 27)
  x >>= 27;
  // convert to actual position
  return MultiplyDeBruijnBitPosition[x];
}

unsigned int lowestBitSetSimple(unsigned int x)
{
  // x=0 is not properly handled by while-loop
  if (x == 0)
    return 0;

  unsigned int result = 0;
  while ((x & 1) == 0)
  {
    x >>= 1;
    result++;
  }

  return result;
}

// assembler:
#ifdef _MSC_VER
__forceinline void lowestBitSetAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  __asm
  {
    ; x &= -x
    mov ecx, eax
    neg ecx
    and ecx, eax
    ; x *= 0x077CB531
    imul ecx, 077CB531h
    ; x >>= 27
    shr ecx, 27
    ; return MultiplyDeBruijnBitPosition[x];
    mov eax, MultiplyDeBruijnBitPosition[ecx*4]
  }
}

__forceinline void lowestBitSetSimpleAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  __asm
  {
    ; result = 0
    xor eax, eax
    ; if (x == 0) return 0
    test ecx, ecx
    je $finish
    ; while ((x & 1) == 0)
    test cl, 1
    jne $finish
  $scanLoop:
    ; x >>= 1
    shr ecx, 1
    ; result++
    inc eax
    test cl, 1
    je $scanLoop
  $finish:
  }
}
#endif

// restrictions:
// - designed for 32 bits

// explanation:
// A negative number is usually represented by the two-complement: -x = not(x)+1
//
// The formula x & -x = x & (not(x) + 1) deletes all but the lowest set bit (line 11).
//
// That means after line 11 we have a power of 2 and there are obviously only 32 possibilities left.
// When multiplied by the DeBruijn constant 0x077CB531 (line 13), the upper 5 bits are unique for each power of 2.
// A small lookup table resolves the actual position of the lowest bit set (line 1 to 6, line 17).
// The overall performance of this algorithm is mainly determined by the lookup table and
// the numbers given below are based on full cache hits.
//
// Note: DeBruijn sequences are extensively explained by Wikipedia, too.
//
// The simple version performs surprisingly well for randomly distributed input:
// 50% of all integers are odd and their result will be 0. Another 25% result will be 1, therefore the loop terminates pretty early.
// When numbers with long spans of zeros are processed, timings become significantly worse.
// The observed break-even was lowestBitSet(32) on a Pentium D, your mileage may vary.
//
// The fastest way on almost all modern x86/x64 CPU bases on the built-in BSF instruction.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    lowestBitSetAsm(); lowestBitSetAsm();
    lowestBitSetAsm(); lowestBitSetAsm();
    lowestBitSetAsm(); lowestBitSetAsm();
    lowestBitSetAsm(); lowestBitSetAsm();
    lowestBitSetAsm(); lowestBitSetAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("%I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx, 1 }; lowestBitSetSimpleAsm(); __asm { mov ecx, 1 }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 1 }; lowestBitSetSimpleAsm(); __asm { mov ecx, 1 }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 1 }; lowestBitSetSimpleAsm(); __asm { mov ecx, 2 }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 2 }; lowestBitSetSimpleAsm(); __asm { mov ecx, 2 }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 2 }; lowestBitSetSimpleAsm(); __asm { mov ecx, 2 }; lowestBitSetSimpleAsm();
  }
  elapsed = __rdtsc() - start;
  printf("simple(1 & 2): %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx, 000000004h }; lowestBitSetSimpleAsm(); __asm { mov ecx, 000000004h }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 000000010h }; lowestBitSetSimpleAsm(); __asm { mov ecx, 000000010h }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 000000100h }; lowestBitSetSimpleAsm(); __asm { mov ecx, 000000100h }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 000010000h }; lowestBitSetSimpleAsm(); __asm { mov ecx, 000010000h }; lowestBitSetSimpleAsm();
    __asm { mov ecx, 001000000h }; lowestBitSetSimpleAsm(); __asm { mov ecx, 001000000h }; lowestBitSetSimpleAsm();
  }
  elapsed = __rdtsc() - start;
  printf("simple(2^2,2^4,2^8,2^16,2^24): %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    unsigned long result;
    _BitScanForward(&result, 0x000000004); _BitScanForward(&result, 0x000000010);
    _BitScanForward(&result, 0x000000010); _BitScanForward(&result, 0x000000010);
    _BitScanForward(&result, 0x000000100); _BitScanForward(&result, 0x000000100);
    _BitScanForward(&result, 0x000010000); _BitScanForward(&result, 0x000010000);
    _BitScanForward(&result, 0x001000000); _BitScanForward(&result, 0x001000000);
  }
  elapsed = __rdtsc() - start;
  printf("intrinsic: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int maxX   = 0xFFFFFFFF;
  unsigned int x      = 0;
  unsigned int errors = 0;
  do
  {
    if (lowestBitSet(x) != lowestBitSetSimple(x))
    {
      printf("failed ! x=%u, expected %u but got %u.\n", x, lowestBitSetSimple(x), lowestBitSet(x));
      errors++;
    }

    // progress meter, the whole validation takes about a minute on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x < maxX);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*5
// + Intel Pentium D:
// - lowestBitSet: approx. 18 cycles per number
// - simple(1,2): approx. 4 cycles per number
// - simple(mix): approx. 39 cycles per number
// - BSF: approx. 3 cycles per number
//
// + Intel Core 2:
// - lowestBitSet: approx. 10.5 cycles per number
// - simple(1,2): approx. 2.5 cycles per number
// - simple(mix): approx. 21 cycles per number
// - BSF: approx. 1 cycle per number
//
// + Intel Core i7:
// - lowestBitSet: approx. 10.5 cycles per number
// - simple(1,2): approx. 2.5 cycles per number
// - simple(mix): approx. 21 cycles per number
// - BSF: approx. 1 cycle per number