Thursday, June 17, 2010

Bit twidding vs lookup tables

In semiconductor and RTL design using a pipeline of operations is much faster than a LUT(Look up table). However in software the result is flipped and small scale LUTs cut the complex sequence of operations to a single simplified action. In short memory lookup tables are generally much faster despite the chance of cache and memory glitches. Since im a semiconductor background the result always seems to surprise me when I revisit it.

The output is
speed of twiddling: 1482
speed of a lookup table: 874

The test code was:
#include <iostream>
#include <algorithm>
#include <iterator>
using namespace std;

// ********************* BIT TWIDDLER ********************* //

#define SUM_MASK1 0x55555555
#define SUM_MASK2 0x33333333
#define SUM_MASK3 0x07070707
#define SUM_MASK4 0x000f000f
#define SUM_MASK5 0x0000001f

unsigned int SumOfOnesTwiddle(unsigned int x)
{
  x = (SUM_MASK1 & (x >>  1)) + (SUM_MASK1 & x);
  x = (SUM_MASK2 & (x >>  2)) + (SUM_MASK2 & x);
  x = (SUM_MASK3 & (x >>  4)) + (SUM_MASK3 & x);
  x = (SUM_MASK4 & (x >>  8)) + (SUM_MASK4 & x);
  x = (SUM_MASK5 & (x >> 16)) + (SUM_MASK5 & x);

  return x;
}

void SumOfOnesTwiddleArray(unsigned int* in, unsigned int* out, unsigned int size)
{
  for(int i=0;i<size;i++)
    out[i] = SumOfOnesTwiddle(in[i]);
}

// ********************* LOOKUP TABLE ********************* //

#define TABLE_SIZE  256
unsigned int table[TABLE_SIZE];

void computeTable()
{
  for(int i=0; i<TABLE_SIZE; i++)
      table[i] = SumOfOnesTwiddle(i);
}

unsigned int SumOfOnesTable(unsigned int x)
{
  return  table[(x    ) & 0xff] +
          table[(x>>8 ) & 0xff] +
          table[(x>>16) & 0xff] +
          table[(x>>24) & 0xff];
}

void SumOfOnesTableArray(unsigned int* in, unsigned int* out, unsigned int size)
{
  for(int i=0;i<size;i++)
    out[i] = SumOfOnesTable(in[i]);
}

//testing

template<class itor>
void printOut(itor start, itor end)
{
  cout << hex << "0x";
  copy (start, end, ostream_iterator<int>(cout, " 0x"));
  cout << endl << endl;
}

#define SIZE 5000
int main()
{
  unsigned int indata[SIZE];
  unsigned int outdata[SIZE];
  unsigned int outdata2[SIZE];

  clock_t start_time, end_time;
  computeTable();

  srand(time(NULL));
  for(int i=0;i < SIZE; i++)
    indata[i] = rand();

  printOut(indata, indata+SIZE);
  SumOfOnesTwiddleArray(indata, outdata, SIZE);
  SumOfOnesTableArray  (indata, outdata2, SIZE);
  printOut(outdata,  outdata+SIZE);
  printOut(outdata2, outdata2+SIZE);

  for(int i=0;i < SIZE; i++)
    if(outdata[i] != outdata2[i])
       cout << "BUG!" << endl;

  start_time = clock();
  for(int i=0;i < SIZE; i++)
    SumOfOnesTwiddleArray(indata, outdata, SIZE);
  end_time = clock();

  cout << dec << "speed of bit twiddling: " << (end_time - start_time) << endl;

  start_time = clock();
  for(int i=0;i < SIZE; i++)
    SumOfOnesTableArray(indata, outdata, SIZE);
  end_time = clock();
  cout << dec << "speed of a lookup table: " << (end_time - start_time) << endl;

}

No comments:

Post a Comment