// Sign of a 32 bit integer
// http://bits.stephan-brumme.com

// references:
// "PowerPC Compiler Writer's Guide" by IBM
// http://the.wall.riscom.net/books/proc/ppc/cwg/cwg_ix.html

// code:
int sign(int x)
{
  // minusOne will be 0xFFFFFFFF (=> -1) for negative numbers, else 0x00000000
  int minusOne = x >> 31;

  // plusOne will be 0x00000001 for positive numbers, else 0x00000000
  unsigned int negateX = (unsigned int) -x;
  int plusOne = (int)(negateX >> 31);

  // at least minusOne or plusOne is zero, the other one gives our desired result
  int result = minusOne | plusOne;
  return result;
}

int sign2(int x)
{
  return (x > 0) - (x < 0);
}

// assembler:
#ifdef _MSC_VER
__forceinline void signAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: ecx - x
  // out: eax - result
  __asm
  {
    mov eax, ecx
    neg eax
    shr eax, 31
    sar ecx, 31
    or  eax, ecx
  }
}

__forceinline void sign2VcAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // out: eax - result
  // temp: ecx
  // temp: edx
  __asm
  {
    xor ecx, ecx
    test eax, eax
    setg cl
    xor edx, edx
    test eax, eax
    setl dl
    sub ecx, edx
    mov eax, ecx
  }
}

__forceinline void sign2GccAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: GCC 4.4
  // in: edx - x
  // out: edx - result
  // temp: eax
  __asm
  {
    xor eax, eax
    test edx, edx
    setg al
    shr edx, 31
    sub edx, eax
  }
}
#endif

// restrictions:
// - designed for 32 bits, can be easily modified for 64 bits

// explanation:
// The sign() function is usually defined as followed:
// if x < 0 => sign(x) = -1
// if x = 0 => sign(x) =  0
// if x > 0 => sign(x) = +1
//
// It's pretty straightforward to convert any negative number into -1, a number where all bits are set.
// All we have to do is a full arithmetic right shift that spreads the highest bit (=> 0xFFFFFFFF).
// For any 32 bit integer, a right shift of 31 is perfect. The simple code in line 4 stores
// the intermediate result 0 (non-negative input) or -1 (negative input) in minusOne.
//
// Note: In C, for unsigned data types an arithmetic shift is generated whereas for signed data types
// a logical shift will be used. The difference lies in the value assigned to the new bits on the left side:
// An arithmetic right shift copies the highest bit, but a logical right shift always insert a zero.
//
// It is a little bit tricky to get +1 for all positive numbers: the highest bit (bit 31) is always zero.
// If we negate any positive number then the highest bit will be set to one (line 7).
// Now we perform a logical right shift, which, in contrast to an arithmetic right shift, zeros the high bits (line 8),
// just as explained before. That's why negateX must be unsigned while minusOne is signed.
// Line 7 and line 8 produce 0 for any non-positive number.
//
// Keep in mind that zero is neither positive nor negative. Negating zero or shifting zero still yields zero,
// which is essential to get the right results for both minusOne and plusOne.
//
// Combining minusOne and plusOne gives us the following situation:
// if x < 0 => minusOne = -1, plusOne = 0
// if x = 0 => minusOne =  0, plusOne =  0
// if x > 0 => minusOne =  0, plusOne = +1
//
// It doesn't matter whether we OR (that means ||) or ADD (that means + ) minusOne and plusOne, I choose the latter way.
// The code looks a bit ugly in C because we need a few type conversion between int and unsigned int to force
// the compiler to generate the right code.
//
// I included a very elegant solution of computing the sign in line 17, that makes use of the way boolean
// values are implemented in C. My compilers convert the short C code into similarly short assembler code based on
// the SETx instructions of x86/x64 CPUs. Despite it's efficient register usage, it runs much slower than
// the first algorithm on my machines. However, this code runs out-of-the-box on 64 bit systems ...

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    signAsm(); signAsm();
    signAsm(); signAsm();
    signAsm(); signAsm();
    signAsm(); signAsm();
    signAsm(); signAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("sign: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
  
  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    sign2VcAsm(); sign2VcAsm();
    sign2VcAsm(); sign2VcAsm();
    sign2VcAsm(); sign2VcAsm();
    sign2VcAsm(); sign2VcAsm();
    sign2VcAsm(); sign2VcAsm();
  }
  elapsed = __rdtsc() - start;
  printf("sign2/VC: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    sign2GccAsm(); sign2GccAsm();
    sign2GccAsm(); sign2GccAsm();
    sign2GccAsm(); sign2GccAsm();
    sign2GccAsm(); sign2GccAsm();
    sign2GccAsm(); sign2GccAsm();
  }
  elapsed = __rdtsc() - start;
  printf("sign2/GCC: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int totalX = 0xFFFFFFFF;
           int x      = 0;
  unsigned int errors = 0;

  do 
  {
    int mySign = sign(x);

    if (x < 0 && mySign != -1)
    {
      printf("failed ! x=%d is negative but sign(%d) returned %d.\n", x,x,mySign);
      errors++;
    }
    if (x == 0 && mySign != 0)
    {
      printf("failed ! x=%d is zero but sign(%d) returned %d.\n", x,x,mySign);
      errors++;
    }
    if (x > 0 && mySign != +1)
    {
      printf("failed ! x=%d is positive but sign(%d) returned %d.\n", x,x,mySign);
      errors++;
    }

    // progress meter, the whole validation takes about a minute on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x == 0);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - sign: approx. 2.5 cycles per number
// - sign2/VC: approx. 13 cycles per number
// - sign2/GCC: approx. 11 cycles per number
//
// + Intel Core 2:
// - sign: approx. 2 cycles per number
// - sign2/VC: approx. 5 cycles per number
// - sign2/GCC: approx. 3.5 cycles per number
//
// + Intel Core i7:
// - sign: approx. 2.25 cycles per number
// - sign2/VC: approx. 5 cycles per number
// - sign2/GCC: approx. 3 cycles per number
