// Bit manipulation basics
// http://bits.stephan-brumme.com

// references:
// none

// code:
int setBit(int x, unsigned char position)
{
  int mask = 1 << position;
  return x | mask;
}

int clearBit(int x, unsigned char position)
{
  int mask = 1 << position;
  return x & ~mask;
}

int modifyBit(int x, unsigned char position, bool newState)
{
  int mask = 1 << position;
  int state = int(newState); // relies on true = 1 and false = 0
  return (x & ~mask) | (-state & mask);
}

int flipBit(int x, unsigned char position)
{
  int mask = 1 << position;
  return x ^ mask;
}

bool isBitSet(int x, unsigned char position)
{
  x >>= position;
  return (x & 1) != 0;
}

// assembler:
__forceinline void setBitAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VC 2008
  // in: edx - x
  // in: cl - position
  // out: eax - result
  __asm
  {
    mov eax, 1
    shl eax, cl
    or  eax, edx
  }
}

__forceinline void clearBitAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VC 2008
  // in: edx - x
  // in: cl - position
  // out: eax - result
  __asm
  {
    mov eax, 1
    shl eax, cl
    not eax
    and eax, edx
  }
}

__forceinline void modifyBitAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VC 2008
  // in: esi - x
  // in: cl - position
  // in: dl - newState
  // out: eax - result
  // temp: esi
  __asm
  {
    ; mask = 1 << position
    mov eax, 1
    shl eax, cl
    ; state = int(newState)
    movzx ecx, dl
    neg ecx
    ; return (x & ~mask) | (-state & mask)
    and ecx, eax
    not eax
    and eax, esi
    or eax, ecx
  }
}

__forceinline void flipBitAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VC 2008
  // in: edx - x
  // in: cl - position
  // out: eax - result
  __asm
  {
    mov eax, 1
    shl eax, cl
    xor eax, edx
  }
}

__forceinline void isBitSetAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: VC 2008
  // in: eax - x
  // in: cl - position
  // out: eax - result
  __asm
  {
    shr eax, cl
    and eax, 1
  }
}

// restrictions:
// - integer based
// - no range checking for position

// explanation:
// The shown operation are the 101 of bit manipulation. If you have at least an introductory knowledge
// about bit twiddling, you should skip this page.
//
// These basic algorithms rely on a mask representing the position of the bit to be changed.
// This mask is created by left shifting 1 (line 3, line 9 and line 15). Depending on the compiler/system,
// position is limited to 15, 31 or 63, although - to simplify things - this restriction is not checked by the code.
//
// Here are the basics of boolean algebra:
//
// 0 OR 0 => 0
// 0 OR 1 => 1
// 1 OR 0 => 1
// 1 OR 1 => 1
//
// 0 AND 0 => 0
// 0 AND 1 => 0
// 1 AND 0 => 0
// 1 AND 1 => 1
//
// 0 XOR 0 => 0
// 0 XOR 1 => 1
// 1 XOR 0 => 1
// 1 XOR 1 => 0
//
// Whenever a bit needs to be set independent of its former state (setBit), OR comes in handy:
//
// input OR 1 => 1
// input OR 0 => input
//
// That means, a mask which is 1 for all bits to be set and 0 for all other bits fulfills our purposes (line 3, line 4).
//
// For clearing a bit (clearBit), AND has to be chosen because:
//
// input AND 0 => 0
// input AND 1 => input
//
// This time, the mask must be 0 for all cleared bits and 1 for bits keeping their former state.
// Line 10 uses the bit-wise negation to flip all bits of the mask accordingly.
//
// Flipping a bit (flipBit) is the domain of XOR:
//
// input XOR 1 => opposite of input
// input XOR 0 => input
//
// That code looks almost identical to OR.
//
// Last but not least, querying a single bit (isBitSet) can be implemented very similiar to clearing a single bit:
// create a mask, negate it, check for 0. When looking at the assembler code, I changed to a faster way
// by shifting right the value to move the interesting bit to position 0 (line 28).
// Then, AND 1 clears all but this bit. Checking for 0 returns the desired result (line 29).
// In C++, bool is typically an integer where false => 0 and true => 1.
// That means, just two assembler instructions are required for isBitSet.
//
// Note: Visual C++ provides the _bittest, _bittestandset, _bittestandreset and _bittestandcomplement intrinsics
//          which might generate a single assembler instruction (if available) for these operations and are never slower.
// Note, too: Even though these intrinsics are not supported by all compilers, the shown assembler code will probably
//                be heavily optimized, too: if the shift is constant, the mask is precomputed during compile time
//                and just one instruction is left.

// validation:
#include <stdio.h>
#include <intrin.h>

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    setBitAsm(); setBitAsm(); setBitAsm(); setBitAsm(); setBitAsm();
    setBitAsm(); setBitAsm(); setBitAsm(); setBitAsm(); setBitAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("setBit: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    clearBitAsm(); clearBitAsm(); clearBitAsm(); clearBitAsm(); clearBitAsm();
    clearBitAsm(); clearBitAsm(); clearBitAsm(); clearBitAsm(); clearBitAsm();
  }
  elapsed = __rdtsc() - start;
  printf("clearBit: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    volatile int x = 50; volatile char c = 10; volatile bool state = true;
    volatile int a = modifyBit(x,c,state); modifyBitAsm(); modifyBitAsm(); modifyBitAsm(); modifyBitAsm();
    modifyBitAsm(); modifyBitAsm(); modifyBitAsm(); modifyBitAsm(); modifyBitAsm();
  }
  elapsed = __rdtsc() - start;
  printf("modifyBit: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    flipBitAsm(); flipBitAsm(); flipBitAsm(); flipBitAsm(); flipBitAsm();
    flipBitAsm(); flipBitAsm(); flipBitAsm(); flipBitAsm(); flipBitAsm();
  }
  elapsed = __rdtsc() - start;
  printf("flipBit: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    isBitSetAsm(); isBitSetAsm(); isBitSetAsm(); isBitSetAsm(); isBitSetAsm();
    isBitSetAsm(); isBitSetAsm(); isBitSetAsm(); isBitSetAsm(); isBitSetAsm();
  }
  elapsed = __rdtsc() - start;
  printf("isBitSet: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  return 0;
}

// performance:*5
// - constant time, data independent
//
// + Intel Pentium D:
// - setBit: approx. 2.5 cycles per number
// - clearBit: approx. 3.5 cycles per number
// - modifyBit: approx. 6.5 cycles per number
// - flipBit: approx. 2.5 cycles per number
// - isBitSet: approx. 3 cycles per number
//
// + Intel Core 2:
// - setBit: approx. 1 cycle per number
// - clearBit: approx. 1.5 cycles per number
// - modifyBit: approx. 3.5 cycles per number
// - flipBit: approx. 1 cycles per number
// - isBitSet: approx. 2 cycles per number
//
// + Intel Core i7:
// - setBit: approx. 1 cycle per number
// - clearBit: approx. 1.33 cycles per number
// - modifyBit: approx. 3 cycles per number
// - flipBit: approx. 1 cycles per number
// - isBitSet: approx. 2 cycles per number
