// Parity
// http://bits.stephan-brumme.com

// references:
// http://graphics.stanford.edu/~seander/bithacks.html

// code:
bool parity(unsigned int x)
{
  x ^= x >> 16;
  x ^= x >>  8;
  x ^= x >>  4;
  x &= 0x0F;
  return ((0x6996 >> x) & 1) != 0;
}

bool parity2(unsigned int x)
{
  x ^= x >> 1;
  x ^= x >> 2;
  x &= 0x011111111;
  x *= 0x011111111;
  return ((x >> 28) & 1) != 0;
}

// alternative:
bool parityValidate(unsigned int x)
{
  // true, if odd number of 1's
  int result;

  // look at each bit and set result accordingly

  // bits 0-7
  result  = x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  // bits 8-15
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  // bits 16-23
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  // bits 24-31
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x; x >>= 1;
  result ^= x;

  return (result & 1) != 0;
}

// assembler:
#ifdef _MSC_VER
__forceinline void parityAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // out: eax - result
  // temp: ecx
  // temp: edx
  __asm
  {
    ; x ^= x >> 16
    mov ecx, eax
    shr ecx, 16
    xor eax, ecx
    ; x ^= x >>  8
    mov edx, eax
    shr edx, 8
    xor eax, edx
    ; x ^= x >>  4
    mov ecx, eax
    shr ecx, 4
    xor ecx, eax
    ; x &= 0x0F
    and ecx, 0000000Fh
    ; return ((0x6996 >> x) & 1) != 0
    mov eax, 00006996h
    sar eax, cl
    and eax, 1
  }
}

__forceinline void parity2Asm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  __asm
  {
    ; x ^= x >> 1
    mov eax, ecx
    shr eax, 1
    xor ecx, eax
    ; x ^= x >> 2
    mov eax, ecx
    shr eax, 2
    xor eax, ecx
    ; x &= 0x11111111
    and eax, 11111111h
    ; x *= 0x11111111
    imul eax, 11111111h
    ; return (x >> 28) & 1
    shr eax, 28
    and eax, 1
  }
}
#endif

// restrictions:
// - designed for 32 bits
// - parity: insert "x ^= x >> 32;" before line 3 for 64 bits
// - parity: remove line 3 for 16 bits
// - parity: remove line 3 and line 4 for 8 bits
// - parity2: for 64 bits, replace 0x011111111 by 0x01111111111111111 and shift right by 60 instead of 28

// explanation:
// Bit parity tells whether a given input contains an odd number of 1s.
// When the input is subdivided into several chunks, the total parity can be determined by computing the
// parity of the sub parities. Might sound weird, so here is a formula to make it clear:
//
// parity(A and B) = parity(parity(A) and parity(B))
//
// If we break down the 32 bits into chunks of 4 bits then the problem is greatly simplified. XOR is the way to go:
//
// (0 XOR 0) => 0
// (0 XOR 1) => 1
// (1 XOR 0) => 1
// (1 XOR 1) => 0
//
// That basically means that the parity of two bits can be computed with a XOR operation.
// Even better, XOR can be applied to the full value to compute the parities of multiple bit pairs in parallel.
//
// first algorithm (parity, line 1 to line 8):
//
// The code first "merges" bits 0 - 15 with bits 16 - 31 using a right shift and XOR (line 3).
// Including line 4 and line 5 reduce the input width is reduced from 32 bits to 8 bits by repeatedly applying shifted XORs.
// We are only interested in the lower bits. That means, after step 1 the lower 16 bits are of importance,
// then the lower 8, then the lowest 4. After having applied XOR three times, we delete all higher bits (line 6).
//
// We are left with sixteen possible values for x. Their parity is:
// parity(15) => parity(1111b) => 4 bits set => even
// parity(14) => parity(1110b) => 3 bits set => odd
// parity(13) => parity(1101b) => 3 bits set => odd
// parity(12) => parity(1100b) => 2 bits set => even
// parity(11) => parity(1011b) => 3 bits set => odd
// parity(10) => parity(1010b) => 2 bits set => even
// parity( 9) => parity(1001b) => 2 bits set => even
// parity( 8) => parity(1000b) => 1 bits set => odd
// parity( 7) => parity(0111b) => 3 bits set => odd
// parity( 6) => parity(0110b) => 2 bits set => even
// parity( 5) => parity(0101b) => 2 bits set => even
// parity( 4) => parity(0100b) => 1 bits set => odd
// parity( 3) => parity(0011b) => 2 bits set => even
// parity( 2) => parity(0010b) => 1 bits set => odd
// parity( 1) => parity(0001b) => 1 bits set => odd
// parity( 0) => parity(0000b) => 0 bits set => even
//
// If we encode even by 0 and odd by 1 beginning with parity(15) then we get 0110 1001 0110 1001 = 0x6996,
// which is the magic number found in line 7. The shift moves the relevant bit to bit 0. Then everything except
// for bit 0 is masked out. In the end, we get 0 for even and 1 for odd, exactly as desird.
//
// second algorithm (parity2, line 10 to line 17):
//
// This time, parity is computed bottom-up instead of top-down.
// Using a similar XOR technique, line 12 and line 13 compute the parity of each four-bit-block and put it
// into to lowest bit of each block. All other bits become garbage and need to be cleared (line 14).
//
// Multiplying by 0x011111111 has an interesting property if we label the four-bit-block A, B, C, D, E, F, G, H:
// A, B, C, D, E, F, G, H => A+B+C+D+E+F+G+H, B+C+D+E+F+G+H, C+D+E+F+G+H, D+E+F+G+H, E+F+G+H, F+G+H, G+H, H
// Obviously the highest four-bit-block is what we are looking for. If the sum of multiple parities is odd,
// then the overall parity is odd as well (vice versa for even).
// The right shift followed by AND (line 16) returns the lowest bit which gives us the parity.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int argc, char* argv[])
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    parityAsm(); parityAsm();
    parityAsm(); parityAsm();
    parityAsm(); parityAsm();
    parityAsm(); parityAsm();
    parityAsm(); parityAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("parity: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    parity2Asm(); parity2Asm();
    parity2Asm(); parity2Asm();
    parity2Asm(); parity2Asm();
    parity2Asm(); parity2Asm();
    parity2Asm(); parity2Asm();
  }
  elapsed = __rdtsc() - start;
  printf("parity2: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    volatile int input = i; // dummy to prevent compiler from optimizing too much (constant data)
    volatile bool result;   // dummy to prevent compiler from optimizing too much (result is thrown away)
    result = parityValidate(input); result = parityValidate(input);
    result = parityValidate(input); result = parityValidate(input);
    result = parityValidate(input); result = parityValidate(input);
    result = parityValidate(input); result = parityValidate(input);
    result = parityValidate(input); result = parityValidate(input);
  }
  elapsed = __rdtsc() - start;
  printf("validate: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int maxX   = 0xFFFFFFFF;
  unsigned int x      = 0;
  unsigned int errors = 0;
  do
  {
    bool testParity   = parity(x);
    bool testParity2  = parity2(x);
    bool testValidate = parityValidate(x);
    if ((testValidate != testParity ) ||
        (testValidate != testParity2))
    {
      printf("failed ! x=%u has wrong parity, expected %u but got %u and %u.\n", x, testValidate, testParity, testParity2);
      errors++;
    }

    // progress meter, the whole validation takes about a minute on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x != 0);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - parity: approx. 13 cycles per number
// - parity2: approx. 6 cycles per number
//
// + Intel Core 2:
// - parity: approx. 12 cycles per number
// - parity2: approx. 5 cycles per number
//
// + Intel Core i7:
// - parity: approx. 11.5 cycles per number
// - parity2: approx. 5.25 cycles per number