// Count bits set in parallel a.k.a. Population Count
// http://bits.stephan-brumme.com

// references:
// AMD, "AMD Software Optimization Guide for AMD64 Processors"
// Peter Wegner: "A Technique for Counting Ones in a Binary Computer", Communications of the ACM, Volume 3 (1960) Number 5, page 322

// code:
unsigned int countBits(unsigned int x)
{
  // count bits of each 2-bit chunk
  x  = x - ((x >> 1) & 0x55555555);
  // count bits of each 4-bit chunk
  x  = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  // count bits of each 8-bit chunk
  x  = x + (x >> 4);
  // mask out junk
  x &= 0xF0F0F0F;
  // add all four 8-bit chunks
  return (x * 0x01010101) >> 24;
}

unsigned int countBitsLoop(unsigned int x)
{
  unsigned int result = 0;
  // strip one set bit per iteration
  while (x != 0)
  {
    x &= x-1;
    result++;
  }
  return result;
}

// assembler:
#ifdef _MSC_VER
__forceinline void countBitsAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VC 2008
  // in: eax - x
  // out: eax - result
  // temp: ecx
  __asm
  {
    ; x  = x - ((x >> 1) & 0x55555555)
    mov ecx, eax
    shr ecx, 1
    and ecx, 55555555h
    sub eax, ecx
    ; x  = (x & 0x33333333) + ((x >> 2) & 0x33333333)
    mov ecx, eax
    and eax, 33333333h
    shr ecx, 2
    and ecx, 33333333h
    add ecx, eax
    ; x  = x + (x >> 4)
    mov eax, ecx
    shr eax, 4
    add eax, ecx
    ; x &= 0xF0F0F0F
    and eax, 0F0F0F0Fh
    ; (x * 0x01010101) >> 24
    imul eax, 01010101h
    shr eax, 24
  }
}

__forceinline void countBitsLoopAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VC 2008
  // in: ecx - x
  // out: eax - result
  // temp: edx
  __asm
  {
    ; result = 0
    xor eax, eax
    ; while (x != 0)
    test ecx, ecx
    je $finish
  $while:
    ; x &= x-1
    lea edx, [ecx-1] ; clever x86 trick: edx = ecx-1
    and ecx, edx
    ; result++
    inc eax
    test ecx, ecx
    jne $while
  $finish:
  }
}
#endif

// alternative:
unsigned int countBitsSimple(unsigned int x)
{
  // true, if odd number of 1's
  unsigned int result = 0;

  // look at each bits and set "result" accordingly

  // bits 0-7
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  // bits 8-15
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  // bits 16-23
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  // bits 24-31
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1; x >>= 1;
  result += x & 1;

  return result;
}

// restrictions:
// - designed for 32 bits
// - except for countBitsLoop, which is fine for all integer bit widths

// explanation:
// countBits (line 1 - line 13):
// Counting all set bits of an integer was part of many mainframe CPU's assembler language but somehow
// x86 CPUs ignored it for decades. Apparently Intel introduced the POPCNT opcode in its Core i7 design.
//
// Meanwhile, the population count has to be implemented by other means.
// The main observations lies in the fact that you can subdivide any bitblock into smaller chunks,
// compute their population count and add all intermediate results.
//
// First, the code counts the bits of two adjacent bits:
//
// 0b and 0b => 00b
// 0b and 1b => 01b
// 1b and 0b => 01b
// 1b and 1b => 10b
//
// The whole algorithm modifies the input in order to generate the output, that means it works in-place.
// Line 4 performs the 2-bit count at once based on the observation:
//
// 00b => unchanged, still 00b
// 01b => unchanged, still 01b
// 10b => must be converted to 01b
// 11b => must be converted to 10b
//
// Whenever the higher bit of each 2-bit group is set, subtracting 01b gives the desired outcome.
// Looks like branching ... but as it turns out, the subtraction can be done always: just subtract the higher bit !
// If it is 0, the result remains unchanged, if it is 1, then we get the right numbers, too.
// The shift x >> 1 and the following mask of all odd bits (0x55 is 01010101b):
//
// 00b => shifted: ?0b => masked: 00b => subtraction: 00b - 00b => 00b
// 01b => shifted: ?0b => masked: 00b => subtraction: 01b - 00b => 01b
// 10b => shifted: ?1b => masked: 01b => subtraction: 10b - 01b => 01b
// 11b => shifted: ?1b => masked: 01b => subtraction: 11b - 01b => 10b
//
// Now the 2-bit count is done. As you can see, there are just three possible decimal results: 0, 1 or 2.
//
// Then, two adjacent 2-bit groups are joined to 4-bit groups (line 6):
//
// 00b and 00b => 0000b
// 00b and 01b => 0001b
// 00b and 10b => 0010b
// 01b and 00b => 0001b
// 01b and 01b => 0010b
// 01b and 10b => 0011b
// 10b and 00b => 0010b
// 10b and 01b => 0011b
// 10b and 10b => 0100b
//
// This time, the 2-bit groups are masked and shifted to match and then simply added. No overflow is possible.
//
// 00b + 00b => 0000b
// 00b + 01b => 0001b
// 00b + 10b => 0010b
// 01b + 00b => 0001b
// 01b + 01b => 0010b
// 01b + 10b => 0011b
// 10b + 00b => 0010b
// 10b + 01b => 0011b
// 10b + 10b => 0100b
//
// The same procedure is done for all 4-bit groups yielding the bit counts for each of the four bytes (line 8)
// in their lower four bits. That means, each byte contains its bit count, however, the upper four bits may
// contain junk and are masked out (line 10).
//
// Multiplying by 0x01010101 has an interesting property if we name the four bytes A, B, C, D:
//
// A, B, C, D => A+B+C+D, B+C+D, C+D, D
//
// Obviously the highest byte is what we are looking for. The right shift (line 12) returns just it.
//
//
// countBitsLoop (line 15 - line 25):
//
// A very elegant and sometimes faster algorithm was presented in Brian Kernighan's and Dennis Ritchie's famous book
// "The C Programming Language". On the internet I found out that Peter Wegener actually deserves the credit for first publishing it.
//
// Unlike the other algorithms, its performance heavily depends on its input. When less than about four bits are set, it outperforms all
// other algorithms but suffers heavily when almost all bits are set.
//
// The main trick, stripping a single bit with x &= x - 1 (line 21), deserves some attention:
// - if x = 0, then the while-loop is not entered at all, so we do not need to consider this case
// - if the rightmost bit is 1, then the rightmost bit of x - 1 is 0. All other bits are identical and x &= x - 1 => x = x - 1.
//   Because all other bits are identical we stripped one set bit, the rightmost bit.
// - if the rightmost bits are 0, then x looks like this: ...1000. And x - 1 looks like this: ...0111. Result of x &= x-1: ...0000.
//   Hence, x &= x - 1 clears all bits except for the ones represented as dots, they remain the same. Again, exactly one bit was cleared.
// In general, x &= x - 1 always sets the rightmost bit which was 1 to 0.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    countBitsAsm(); countBitsAsm();
    countBitsAsm(); countBitsAsm();
    countBitsAsm(); countBitsAsm();
    countBitsAsm(); countBitsAsm();
    countBitsAsm(); countBitsAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("Constant time: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx,  00000000h } countBitsLoopAsm();
    __asm { mov ecx,  0000000Fh } countBitsLoopAsm();
    __asm { mov ecx,  000000FFh } countBitsLoopAsm();
    __asm { mov ecx,  00000FFFh } countBitsLoopAsm();
    __asm { mov ecx,  00003FFFh } countBitsLoopAsm();
    __asm { mov ecx,  0003FFFFh } countBitsLoopAsm();
    __asm { mov ecx,  000FFFFFh } countBitsLoopAsm();
    __asm { mov ecx,  00FFFFFFh } countBitsLoopAsm();
    __asm { mov ecx,  0FFFFFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0FFFFFFFFh } countBitsLoopAsm();
  }
  elapsed = __rdtsc() - start;
  printf("Alternative: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
    __asm { mov ecx, 0 } countBitsLoopAsm();
  }
  elapsed = __rdtsc() - start;
  printf("Loop(0): %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
    __asm { mov ecx, 0000FFFFh } countBitsLoopAsm();
  }
  elapsed = __rdtsc() - start;
  printf("Loop(16 bits set): %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
    __asm { mov ecx, -1 } countBitsLoopAsm();
  }
  elapsed = __rdtsc() - start;
  printf("Loop(all bits set): %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    volatile unsigned dummy;
    dummy = countBitsSimple(dummy); dummy = countBitsSimple(dummy);
    dummy = countBitsSimple(dummy); dummy = countBitsSimple(dummy);
    dummy = countBitsSimple(dummy); dummy = countBitsSimple(dummy);
    dummy = countBitsSimple(dummy); dummy = countBitsSimple(dummy);
    dummy = countBitsSimple(dummy); dummy = countBitsSimple(dummy);
    // I have to admit that memory transfers forced by "volatile" do have a negative impact here
  }
  elapsed = __rdtsc() - start;
  printf("Simple: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int maxX   = 0xFFFFFFFF;
  unsigned int x      = 0;
  unsigned int errors = 0;
  do
  {
    if (countBits(x) != countBitsSimple(x))
    {
      printf("failed ! x=%u has wrong bit count, expected %u but got %u.\n", x, countBits(x), countBitsSimple(x));
      errors++;
    }

    // progress meter, the whole validation takes about a minute on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x < maxX);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*7
// + Intel Pentium D:
// - countBits: approx. 23 cycles per number, constant time, data independent
// - countBitsLoop: approx. 68 cycles per number (average)
// - countBitsLoop(0): approx. 4 cycles per number when 0 bits are set (0% are set)
// - countBitsLoop(16): approx. 54 cycles per number when 16 bits are set (50% are set)
// - countBitsLoop(32): approx. 145 cycles per number when 32 bits are set (100% are set)
// - simple (unrolled for-loop): approx. 60 cycles per number, constant time, data independent
//
// + Intel Core 2:
// - countBits: approx. 16.5 cycles per number, constant time, data independent
// - countBitsLoop: approx. 37.5 cycles per number (average)
// - countBitsLoop(0): approx. 2.5 cycles per number when 0 bits are set (0% are set)
// - countBitsLoop(16): approx. 37 cycles per number when 16 bits are set (50% are set)
// - countBitsLoop(32): approx. 70 cycles per number when 32 bits are set (100% are set)
// - simple (unrolled for-loop): approx. 77 cycles per number, constant time, data independent
//
// + Intel Core i7:
// - countBits: approx. 16 cycles per number, constant time, data independent
// - countBitsLoop: approx. 38 cycles per number (average)
// - countBitsLoop(0): approx. 2.5 cycles per number when 0 bits are set (0% are set)
// - countBitsLoop(16): approx. 37 cycles per number when 16 bits are set (50% are set)
// - countBitsLoop(32): approx. 70 cycles per number when 32 bits are set (100% are set)
// - simple (unrolled for-loop): approx. 67 cycles per number, constant time, data independent