// Sign comparison // http://bits.stephan-brumme.com // references: // multiple sources, author unknown // code: bool sameSign(int x, int y) { return (x^y) >= 0; } bool sameSignFast32(int x, int y) { // XOR both numbers int temp = x^y; // shift sign bit to the lowest bit position and clear the rest temp >>= 31; // 64 bit: use 63 instead of 31 temp &= 1; // negate return !temp; } bool sameSignFloat(float a, float b) { int* x = (int*)&a; int* y = (int*)&b; return sameSignFast32(*x,*y); } template bool sameSignSimple(TypeX x, TypeY y) { return (x >= 0 && y >= 0) || (x < 0 && y < 0); } // assembler: #ifdef _MSC_VER __forceinline void sameSignAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops { // compiler: Visual C++ 2008 // in: eax - x // in: ecx - y // out: eax - result __asm { ; set sign flag xor eax, ecx ; clear eax mov eax, 0 ; al = sign flag setge al } } __forceinline void sameSignFast32Asm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops { // compiler: Visual C++ 2008 // in: eax - x // in: ecx - y // out: eax - result __asm { ; x = !x not eax ; x ^= y xor eax, ecx ; x >>= 31 sar eax, 31 ; x &= 1 and eax, 1 } } __forceinline void sameSignSimpleAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops { // compiler: Visual C++ 2008 // in: ecx - x // in: eax - y // out: eax - result __asm { ; x >= 0 cmp ecx, 0 jl $negative ; y >= 0 test eax, eax jge $same $different: ; return false xor eax, eax jmp $finish $negative: ; y < 0 ? (x is < 0) test eax, eax jge $different $same: ; return true mov eax, 1 $finish: } } #endif // restrictions: // - sameSignFast32 designed for 32 bits, can be easily modified for 64 bits // - sameSignFloat may not work with infinity and/or degenerated numbers // explanation: // The highest bit of an integer is called the sign bit. // It is set to 1 for all negative values or 0 for zero and all positive values. // // For two values x and y, the following holds true: signX XOR signY is only 1 if their sign differs. // If XOR is applied to all bits of these values (not just the sign bits), we can check the sign of the result: // It is only positive if both have the same sign. // // The obvious approach of sameSign suffers from the high latency of teh setge assembler instruction. // sameSignFast32 shifts the sign bit to the lowest bits and clears all other bits. // The resulting value (0 for same sign, 1 for different sign) must be negated before converting to a bool. // // The performance figures for float values are identical to integers' if these floats are stored in main memory. // validation: #include #ifdef _MSC_VER #include #endif int main(int, char**) { // Microsoft Visual C++ performance measurement #ifdef _MSC_VER printf("performance test ...\n"); const size_t maxLoop = 100000; const size_t unroll = 10; unsigned long long start = __rdtsc(); for (size_t i = maxLoop; i != 0; i--) { // unroll 10 times to keep loop overhead to a minimum sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); sameSignSimpleAsm(); } unsigned long long elapsed = __rdtsc() - start; printf("sameSignSimple: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll))); start = __rdtsc(); for (size_t i = maxLoop; i != 0; i--) { // unroll 10 times to keep loop overhead to a minimum sameSignAsm(); sameSignAsm(); sameSignAsm(); sameSignAsm(); sameSignAsm(); sameSignAsm(); sameSignAsm(); sameSignAsm(); sameSignAsm(); sameSignAsm(); } elapsed = __rdtsc() - start; printf("sameSign: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll))); start = __rdtsc(); for (size_t i = maxLoop; i != 0; i--) { // unroll 10 times to keep loop overhead to a minimum sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); sameSignFast32Asm(); } elapsed = __rdtsc() - start; printf("sameSignFast32: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll))); #endif // force compiler to generate assember code volatile int x = 1; volatile int y = -1; volatile int result; result = sameSign (x,y); result = sameSignFast32(x,y); result = sameSignSimple(x,y); result = sameSignFloat (x,y); // simple tests printf("\nintegers (simple):\n"); printf("sameSignSimple(%+2d,%+2d) = %s\n", +1,+1,sameSignSimple(+1,+1) == true ? "ok" : "FAILED"); printf("sameSignSimple(%+2d,%+2d) = %s\n", +1,-1,sameSignSimple(+1,-1) == false ? "ok" : "FAILED"); printf("sameSignSimple(%+2d,%+2d) = %s\n", -1,-1,sameSignSimple(-1,-1) == true ? "ok" : "FAILED"); printf("sameSignSimple(%+2d,%+2d) = %s\n", 0,+1,sameSignSimple( 0,+1) == true ? "ok" : "FAILED"); printf("sameSignSimple(%+2d,%+2d) = %s\n", 0,-1,sameSignSimple( 0,-1) == false ? "ok" : "FAILED"); printf("sameSignSimple(%+2d,%+2d) = %s\n", 0, 0,sameSignSimple( 0, 0) == true ? "ok" : "FAILED"); printf("\nintegers (XOR):\n"); printf("sameSign(%+2d,%+2d) = %s\n", +1,+1,sameSign(+1,+1) == true ? "ok" : "FAILED"); printf("sameSign(%+2d,%+2d) = %s\n", +1,-1,sameSign(+1,-1) == false ? "ok" : "FAILED"); printf("sameSign(%+2d,%+2d) = %s\n", -1,-1,sameSign(-1,-1) == true ? "ok" : "FAILED"); printf("sameSign(%+2d,%+2d) = %s\n", 0,+1,sameSign( 0,+1) == true ? "ok" : "FAILED"); printf("sameSign(%+2d,%+2d) = %s\n", 0,-1,sameSign( 0,-1) == false ? "ok" : "FAILED"); printf("sameSign(%+2d,%+2d) = %s\n", 0, 0,sameSign( 0, 0) == true ? "ok" : "FAILED"); printf("\nintegers (32 bit, fast):\n"); printf("sameSignFast32(%+2d,%+2d) = %s\n", +1,+1,sameSignFast32(+1,+1) == true ? "ok" : "FAILED"); printf("sameSignFast32(%+2d,%+2d) = %s\n", +1,-1,sameSignFast32(+1,-1) == false ? "ok" : "FAILED"); printf("sameSignFast32(%+2d,%+2d) = %s\n", -1,-1,sameSignFast32(-1,-1) == true ? "ok" : "FAILED"); printf("sameSignFast32(%+2d,%+2d) = %s\n", 0,+1,sameSignFast32( 0,+1) == true ? "ok" : "FAILED"); printf("sameSignFast32(%+2d,%+2d) = %s\n", 0,-1,sameSignFast32( 0,-1) == false ? "ok" : "FAILED"); printf("sameSignFast32(%+2d,%+2d) = %s\n", 0, 0,sameSignFast32( 0, 0) == true ? "ok" : "FAILED"); printf("\nfloats:\n"); printf("sameSignFloat(%+2d,%+2d) = %s\n", +1,+1,sameSignFloat(+1,+1) == true ? "ok" : "FAILED"); printf("sameSignFloat(%+2d,%+2d) = %s\n", +1,-1,sameSignFloat(+1,-1) == false ? "ok" : "FAILED"); printf("sameSignFloat(%+2d,%+2d) = %s\n", -1,-1,sameSignFloat(-1,-1) == true ? "ok" : "FAILED"); printf("sameSignFloat(%+2d,%+2d) = %s\n", 0,+1,sameSignFloat( 0,+1) == true ? "ok" : "FAILED"); printf("sameSignFloat(%+2d,%+2d) = %s\n", 0,-1,sameSignFloat( 0,-1) == false ? "ok" : "FAILED"); printf("sameSignFloat(%+2d,%+2d) = %s\n", 0, 0,sameSignFloat( 0, 0) == true ? "ok" : "FAILED"); return 0; } // performance:*5 // - constant execution time because branch free // // + Intel Pentium D: // - sameSignSimple: approx. 5 cycles per comparison // - sameSign: approx. 10 cycles per comparison // - sameSignFast32: approx. 4 cycles per comparison // // + Intel Core 2: // - sameSignSimple: approx. 10 cycles per comparison // - sameSign: approx. 3.75 cycles per comparison // - sameSignFast32: approx. 4 cycles per comparison // // + Intel Core i7: // - sameSignSimple: approx. 7 cycles per comparison // - sameSign: approx. 4 cycles per comparison // - sameSignFast32: approx. 4 cycles per comparison