// Sign comparison
// http://bits.stephan-brumme.com

// references:
// multiple sources, author unknown

// code:
bool sameSign(int x, int y)
{
  return (x^y) >= 0;
}

bool sameSignFast32(int x, int y)
{
  // XOR both numbers
  int temp = x^y;
  // shift sign bit to the lowest bit position and clear the rest
  temp >>= 31; // 64 bit: use 63 instead of 31
  temp  &=  1;
  // negate
  return !temp;
}

bool sameSignFloat(float a, float b)
{
  int* x = (int*)&a;
  int* y = (int*)&b;
  return sameSignFast32(*x,*y);
}

template <typename TypeX, typename TypeY>
bool sameSignSimple(TypeX x, TypeY y)
{
  return (x >= 0 && y >= 0) || (x < 0 && y < 0);
}

// assembler:
#ifdef _MSC_VER
__forceinline void sameSignAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // in: ecx - y
  // out: eax - result
  __asm
  {
    ; set sign flag
    xor eax, ecx
    ; clear eax
    mov eax, 0
    ; al = sign flag
    setge al
  }
}

__forceinline void sameSignFast32Asm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // in: ecx - y
  // out: eax - result
  __asm
  {
    ; x = !x
    not eax
    ; x ^= y
    xor eax, ecx
    ; x >>= 31
    sar eax, 31
    ; x &= 1
    and eax, 1
  }
}

__forceinline void sameSignSimpleAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // in: eax - y
  // out: eax - result
  __asm
  {
    ; x >= 0
    cmp ecx, 0
    jl $negative
    ; y >= 0
    test eax, eax
    jge $same
  $different:
    ; return false
    xor eax, eax
    jmp $finish
  $negative:
    ; y < 0 ? (x is < 0)
    test eax, eax
    jge $different
  $same:
    ; return true
    mov eax, 1
  $finish:
  }
}

#endif

// restrictions:
// - sameSignFast32 designed for 32 bits, can be easily modified for 64 bits
// - sameSignFloat may not work with infinity and/or degenerated numbers

// explanation:
// The highest bit of an integer is called the sign bit.
// It is set to 1 for all negative values or 0 for zero and all positive values.
//
// For two values x and y, the following holds true: signX XOR signY is only 1 if their sign differs.
// If XOR is applied to all bits of these values (not just the sign bits), we can check the sign of the result:
// It is only positive if both have the same sign.
//
// The obvious approach of sameSign suffers from the high latency of teh setge assembler instruction.
// sameSignFast32 shifts the sign bit to the lowest bits and clears all other bits.
// The resulting value (0 for same sign, 1 for different sign) must be negated before converting to a bool.
//
// The performance figures for float values are identical to integers' if these floats are stored in main memory.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    sameSignSimpleAsm(); sameSignSimpleAsm();
    sameSignSimpleAsm(); sameSignSimpleAsm();
    sameSignSimpleAsm(); sameSignSimpleAsm();
    sameSignSimpleAsm(); sameSignSimpleAsm();
    sameSignSimpleAsm(); sameSignSimpleAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("sameSignSimple: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    sameSignAsm(); sameSignAsm();
    sameSignAsm(); sameSignAsm();
    sameSignAsm(); sameSignAsm();
    sameSignAsm(); sameSignAsm();
    sameSignAsm(); sameSignAsm();
  }
  elapsed = __rdtsc() - start;
  printf("sameSign: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    sameSignFast32Asm(); sameSignFast32Asm();
    sameSignFast32Asm(); sameSignFast32Asm();
    sameSignFast32Asm(); sameSignFast32Asm();
    sameSignFast32Asm(); sameSignFast32Asm();
    sameSignFast32Asm(); sameSignFast32Asm();
  }
  elapsed = __rdtsc() - start;
  printf("sameSignFast32: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // force compiler to generate assember code
  volatile int x = 1;
  volatile int y = -1;
  volatile int result;
  result = sameSign      (x,y);
  result = sameSignFast32(x,y);
  result = sameSignSimple(x,y);
  result = sameSignFloat (x,y);

  // simple tests
  printf("\nintegers (simple):\n");
  printf("sameSignSimple(%+2d,%+2d) = %s\n", +1,+1,sameSignSimple(+1,+1) == true  ? "ok" : "FAILED");
  printf("sameSignSimple(%+2d,%+2d) = %s\n", +1,-1,sameSignSimple(+1,-1) == false ? "ok" : "FAILED");
  printf("sameSignSimple(%+2d,%+2d) = %s\n", -1,-1,sameSignSimple(-1,-1) == true  ? "ok" : "FAILED");
  printf("sameSignSimple(%+2d,%+2d) = %s\n",  0,+1,sameSignSimple( 0,+1) == true  ? "ok" : "FAILED");
  printf("sameSignSimple(%+2d,%+2d) = %s\n",  0,-1,sameSignSimple( 0,-1) == false ? "ok" : "FAILED");
  printf("sameSignSimple(%+2d,%+2d) = %s\n",  0, 0,sameSignSimple( 0, 0) == true  ? "ok" : "FAILED");

  printf("\nintegers (XOR):\n");
  printf("sameSign(%+2d,%+2d) = %s\n", +1,+1,sameSign(+1,+1) == true  ? "ok" : "FAILED");
  printf("sameSign(%+2d,%+2d) = %s\n", +1,-1,sameSign(+1,-1) == false ? "ok" : "FAILED");
  printf("sameSign(%+2d,%+2d) = %s\n", -1,-1,sameSign(-1,-1) == true  ? "ok" : "FAILED");
  printf("sameSign(%+2d,%+2d) = %s\n",  0,+1,sameSign( 0,+1) == true  ? "ok" : "FAILED");
  printf("sameSign(%+2d,%+2d) = %s\n",  0,-1,sameSign( 0,-1) == false ? "ok" : "FAILED");
  printf("sameSign(%+2d,%+2d) = %s\n",  0, 0,sameSign( 0, 0) == true  ? "ok" : "FAILED");

  printf("\nintegers (32 bit, fast):\n");
  printf("sameSignFast32(%+2d,%+2d) = %s\n", +1,+1,sameSignFast32(+1,+1) == true  ? "ok" : "FAILED");
  printf("sameSignFast32(%+2d,%+2d) = %s\n", +1,-1,sameSignFast32(+1,-1) == false ? "ok" : "FAILED");
  printf("sameSignFast32(%+2d,%+2d) = %s\n", -1,-1,sameSignFast32(-1,-1) == true  ? "ok" : "FAILED");
  printf("sameSignFast32(%+2d,%+2d) = %s\n",  0,+1,sameSignFast32( 0,+1) == true  ? "ok" : "FAILED");
  printf("sameSignFast32(%+2d,%+2d) = %s\n",  0,-1,sameSignFast32( 0,-1) == false ? "ok" : "FAILED");
  printf("sameSignFast32(%+2d,%+2d) = %s\n",  0, 0,sameSignFast32( 0, 0) == true  ? "ok" : "FAILED");

  printf("\nfloats:\n");
  printf("sameSignFloat(%+2d,%+2d) = %s\n", +1,+1,sameSignFloat(+1,+1) == true  ? "ok" : "FAILED");
  printf("sameSignFloat(%+2d,%+2d) = %s\n", +1,-1,sameSignFloat(+1,-1) == false ? "ok" : "FAILED");
  printf("sameSignFloat(%+2d,%+2d) = %s\n", -1,-1,sameSignFloat(-1,-1) == true  ? "ok" : "FAILED");
  printf("sameSignFloat(%+2d,%+2d) = %s\n",  0,+1,sameSignFloat( 0,+1) == true  ? "ok" : "FAILED");
  printf("sameSignFloat(%+2d,%+2d) = %s\n",  0,-1,sameSignFloat( 0,-1) == false ? "ok" : "FAILED");
  printf("sameSignFloat(%+2d,%+2d) = %s\n",  0, 0,sameSignFloat( 0, 0) == true  ? "ok" : "FAILED");
  return 0;
}

// performance:*5
// - constant execution time because branch free
//
// + Intel Pentium D:
// - sameSignSimple: approx. 5 cycles per comparison
// - sameSign: approx. 10 cycles per comparison
// - sameSignFast32: approx. 4 cycles per comparison
//
// + Intel Core 2:
// - sameSignSimple: approx. 10 cycles per comparison
// - sameSign: approx. 3.75 cycles per comparison
// - sameSignFast32: approx. 4 cycles per comparison
//
// + Intel Core i7:
// - sameSignSimple: approx. 7 cycles per comparison
// - sameSign: approx. 4 cycles per comparison
// - sameSignFast32: approx. 4 cycles per comparison

