// Swap two values
// http://bits.stephan-brumme.com

// references:
// none

// code:
void swapXor(int& a, int& b)
{
  a ^= b;
  b ^= a;
  a ^= b;
}

void swapTemp(int& a, int& b)
{
  int temp = a;
  a = b;
  b = temp;
}

// assembler:
#ifdef _MSC_VER
__forceinline void swapXorAsm()
{
  // compiler: VC 2008
  // in: eax - a
  // in: edx - b
  // out: eax - b
  // out: edx - a
  __asm
  {
    xor eax, edx
    xor edx, eax
    xor eax, edx
  }
}

__forceinline void swapTempAsm()
{
  // compiler: VC 2008
  // in: eax - a
  // in: edx - b
  // out: eax - b
  // out: edx - a
  // temp: ebx
  __asm
  {
    mov ebx, eax
    mov eax, edx
    mov edx, ebx
  }
}
#endif

// restrictions:
// - integers only
// - with proper aliasing to an integer values, float can be swapped, too
// - must not be used with same variable twice, e.g., swapXor(a,a)

// explanation:
// For several years, the XOR swap trick served as a common example of bit twiddling.
// Nowadays, its performance advantage is completely gone, nevertheless, the clever idea behind it remains worth looking at.
//
// Let's review XOR:
//
// 0 XOR 0 => 0
// 0 XOR 1 => 1
// 1 XOR 0 => 1
// 1 XOR 1 => 0
//
// That means XORing any bit with itself always gives zero: y XOR y => 0.
// Moreover, XORing any bit with zero always keeps the bit in its current state: y XOR 0 => 0 XOR y => y.
// Now it follows: a XOR b XOR a => b.
//
// Line 3, line 4 and line 5 produce the following equations:
//
// line 3: a_new = a XOR b
// line 4: b_swapped = b XOR a_new = b XOR (a XOR b) = a
// line 5: a_swapped = a_new XOR b_swapped = (a XOR b) XOR a = b
//
// In the end, a_swapped = b and b_swapped = a.
//
// The XOR swap algorithm became very popular because it does not need any additional registers.
// Modern CPUs perform the more obvious code of swapTemp much slower than the XOR trick, on my computer twice as fast.
// If there is no free register available then XOR might be the winner again.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    swapXorAsm(); swapXorAsm();
    swapXorAsm(); swapXorAsm();
    swapXorAsm(); swapXorAsm();
    swapXorAsm(); swapXorAsm();
    swapXorAsm(); swapXorAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("XOR: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
  
  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    swapTempAsm(); swapTempAsm();
    swapTempAsm(); swapTempAsm();
    swapTempAsm(); swapTempAsm();
    swapTempAsm(); swapTempAsm();
    swapTempAsm(); swapTempAsm();
  }
  elapsed = __rdtsc() - start;
  printf("MOV w/ temporary: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  return 0;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - swapXor: approx. 3.0 cycles per number
// - swapTemp: approx. 1.5 cycles per number
//
// + Intel Core 2:
// - swapXor: approx. 3.0 cycles per number
// - swapTemp: approx. 2.0 cycles per number
//
// + Intel Core i7:
// - swapXor: approx. 3.0 cycles per number
// - swapTemp: approx. 2.25 cycles per number
