// Extend bit width
// http://bits.stephan-brumme.com

// references:
// http://sol.gfxile.net/boolean.html

// code:
unsigned int extend(unsigned int value, unsigned int smallSize, unsigned int bigSize)
{
  // i.e. to extend from 5 to 8 bits, use extend(x,5,8)
  unsigned int leftShift  = bigSize - smallSize;
  unsigned int rightShift = bigSize - leftShift;

  // extend and fill lower bits smartly
  return (value << leftShift) | (value >> rightShift);
}

// assembler:
#ifdef _MSC_VER
__forceinline void extendAsm() // stripped down to core algorithm to avoid overhead of parameter pushes/pops
{
  // compiler: Visual C++ 2008
  // in: eax - x
  // in: ecx - bigSize
  // in: edx - smallSize
  // out: eax - result
  // temp: esi
  __asm
  {
    push esi
    mov esi, eax
    shr eax, cl
    mov cl, dl
    shl esi, cl
    or  eax, ebx
    pop esi
  }
}
#endif

// restrictions:
// - none

// explanation:
// Especially for graphic cards, values are often truncated to a few bits, i.e. 5 bits per color channel.
// When used to actually draw pixels, these values have to be extended to 8 bits while preserving three important properties:
// 1. scaling must be as linear as possible
// 2. zero must remain zero
// 3. the maximum possible input must be mapped to the maximum possible output
//
// In terms of a graphics designer, colors must not be falsified and especially White must remain White while Black remains Black.
//
// A simple left shift suffices properties 1 and 2. However, it fails when it comes to requirement 3.
// In the example given, 0x1F is the maximum value for 5 bits but a left shift by 3 bits generates 0xF8 which is clearly not 0xFF.
//
// The presented method not only performs the shift (line 4 and line 8) but also maps the highest bits to the lowest bits, too,
// and thus solves the problem (line 5 and line 8).
// Note: if the input bit width is less than half the output bit width, requirement 3 is not always fulfilled.
//
// Compilers convert lines 4 and 5 to constant values. The only actual code is to be found in line 8.

// validation:
#include <stdio.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif

int main(int, char**)
{
  // Microsoft Visual C++ performance measurement
#ifdef _MSC_VER
  printf("performance test ...\n");

  const size_t maxLoop = 100000;
  const size_t unroll  = 10;

  unsigned long long start = __rdtsc();
  for (size_t i = maxLoop; i != 0; i--)
  {
    // unroll 10 times to keep loop overhead to a minimum
    extendAsm(); extendAsm();
    extendAsm(); extendAsm();
    extendAsm(); extendAsm();
    extendAsm(); extendAsm();
    extendAsm(); extendAsm();
  }
  unsigned long long elapsed = __rdtsc() - start;
  printf("%I64d ticks => about %.3f ticks per call\n", elapsed, elapsed/(maxLoop*float(unroll)));
#endif

  // examples, "volatile" prevents the optimizing compiler from removing the unused variables
  volatile unsigned int five2eightMin = extend( 0, 5,8);
  volatile unsigned int five2eightMax = extend(31, 5,8);
  volatile unsigned int four2sixMin   = extend( 0, 4,6);
  volatile unsigned int four2sixMax   = extend(15, 4,6);

  return 0;
}

// performance:*
// - constant time, data independent
//
// + Intel Pentium D:
// - extend: approx. 5.5 cycles per number
//
// + Intel Core 2:
// - extend: approx. 3 cycles per number
//
// + Intel Core i7:
// - extend: approx. 5 cycles per number

