// Detects zero bytes inside a 32 bit integer
// http://bits.stephan-brumme.com

// references:
// newsgroup posting of Alan Mycroft on April 27, 1987

// code:
bool hasZeroByte(unsigned int x)
{
  return ((x - 0x01010101) & ~x & 0x80808080) != 0;
}

bool hasZeroByteSimple(unsigned int x)
{
  if ((x & 0x000000FF) == 0)
    return true;
  if ((x & 0x0000FF00) == 0)
    return true;
  if ((x & 0x00FF0000) == 0)
    return true;
  if ((x & 0xFF000000) == 0)
    return true;

  return false;
}

// assembler:
#ifdef _MSC_VER
__forceinline void hasZeroByteAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // in: ecx - x
  // out: eax - result
  __asm
  {
    ; x - 0x01010101
    lea eax, [ecx-01010101h]
    ; ~x
    not ecx
    and eax, ecx
    and eax, 80808080h
    ; compare to zero
    neg eax
    sbb eax, eax
    ; set eax=1, boolean value for true
    neg eax
  }
}

__forceinline void hasZeroByteSimpleAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // in: ecx - x
  // out: eax - result
  __asm
  {
    /* dummy instruction to prevent input to be always 0 or 1 */
    /**/ mov eax, 12345678h
    ; if ((x & 0x000000FF) == 0)
    test al, al
    jnz $if2
  $found:
    ; 1 means true
    mov al, 1
    jmp $done
  $if2:
    ; if ((x & 0x0000FF00) == 0)
    test eax, 0000FF00h
    jz $found
    ; if ((x & 0x00FF0000) == 0)
    test eax, 00FF0000h
    jz $found
    ; if ((x & 0xFF000000) == 0)
    test  eax, 0FF000000h
    setz  al
  $done:
  }
}
#endif

// restrictions:
// - designed for 32 bits, easily extendable to 64 bits

// explanation:
// One of the most prominent example of detecting zeroes is strlen. To improve performance many bytes are examined in parallel.
//
// The expression x - 0x01010101 sets the highest bit of a byte if a single input byte is within the input set
// { 0x00, 0x81, 0x82, 0x83, ..., 0xFF } because the result will be { 0xFF, 0x80, 0x81, 0x82, ..., 0xFE }
//
// On the other hand, ~x sets the highest bit of a byte if a single input byte is within the input set
// { 0x00, 0x01, 0x02, 0x03, ..., 0x7F } because the result will be { 0xFF, 0xFE, 0xFD, 0xFC, ..., 0x80 }
//
// The only value contained in both input sets is 0x00, exactly what we are looking for.
// That means, only for x = 0 the highest bit is still set after ANDing: (x - 0x01010101) & ~x
// Please do not be confused by the fact that a result != 0 actually indicates a zero byte.
//
// Then, all bits except for the highest bit of each byte are reset to zero using AND 0x80808080.
// The resulting value is equal to zero if all (!) of these highest bits are zero.
// If at least one is still set to 1, caused by a zero byte, the comparison != 0 gives "true".
//
// hasZeroByteSimple shows the common approach to this problem.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    hasZeroByteAsm(); hasZeroByteAsm();
    hasZeroByteAsm(); hasZeroByteAsm();
    hasZeroByteAsm(); hasZeroByteAsm();
    hasZeroByteAsm(); hasZeroByteAsm();
    hasZeroByteAsm(); hasZeroByteAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("fast: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));

  start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    hasZeroByteSimpleAsm(); hasZeroByteSimpleAsm();
    hasZeroByteSimpleAsm(); hasZeroByteSimpleAsm();
    hasZeroByteSimpleAsm(); hasZeroByteSimpleAsm();
    hasZeroByteSimpleAsm(); hasZeroByteSimpleAsm();
    hasZeroByteSimpleAsm(); hasZeroByteSimpleAsm();
  }
  elapsed = __rdtsc() - start;
  printf("simple: %I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // analyze all numbers from 0 to 1^32 - 1
  printf("verifying ");
  unsigned int totalX = 0xFFFFFFFF;
  unsigned int x      = 0;
  unsigned int errors = 0;

  do 
  {
    bool zero = hasZeroByte(x);

    bool expectZero = false;
    if ((x & 0x000000FF) == 0)
      expectZero = true;
    else
    if ((x & 0x0000FF00) == 0)
      expectZero = true;
    else
    if ((x & 0x00FF0000) == 0)
      expectZero = true;
    else
    if ((x & 0xFF000000) == 0)
      expectZero = true;

    if (zero && !expectZero)
    {
      printf("failed ! x=%u has no zero byte, but hasZeroByte reported one.\n", x);
      errors++;
    }
    else
    if (expectZero && !zero)
    {
      printf("failed ! x=%u has a zero byte, but hasZeroByte reported none.\n", x);
      errors++;
    }

    // progress meter, the whole validation takes about a minute on my computer
    if ((x & 0x07FFFFFF) == 0)
      printf(".");

    x++;
  } while (x != 0);

  printf(" %u errors.\n", errors);
  return errors;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - hasZeroByte: approx. 6 cycles per number
// - hasZeroByteSimple: approx. 7 cycles per number
//
// + Intel Core 2:
// - hasZeroByte: approx. 3 cycles per number
// - hasZeroByteSimple: approx. 3 cycles per number
//
// + Intel Core i7:
// - hasZeroByte: approx. 3.5 cycles per number
// - hasZeroByteSimple: approx. 3 cycles per number
