The Coding Slim Jim: sorting

Showing posts with label sorting. Show all posts

Saturday, June 5, 2010

Sorting billions of numbers

There are several key facts to consider when sorting a massive number of numbers

Key factors to consider

* The memory usage profile and efficiency of the sort algorithm
- - sort algorithms that can run back and forth over pages while sorting 1 number are out
  - -- that means bubble sort, insertion sort, selection sort, quicksort are out
- - a sort algorithm that can naturally deal with chunked groups of data is ideal
  - -- the best candidate is merge sort, using 4 chunks of memory: 2 input and 2 output chunks
  - -- use insertion sort when the number of items in the merge list is below 32, since it is more efficient
* FileIO speed and rates
- - file I/O is slow and waiting for it is inefficient, so create 2 threads that:
  - -- reads in chunks of data when the input buffer drops below a threshold.
  - -- writes out chunks of data when the output buffer contains data.
  - -- these threads may need to handle multiple buffer files as well as the final in/out file.
* the number of cores and virtual cores on the system
- -- create inter/intra sorting worker threads equal to the number of virtual cores in the system so that the cpu is fully utilized.
* the physical paging sizing of memory and the allocation time and size of memory chunks
- - don't repeatedly allocate and free memory; instead keep a buffer pool with chunks of memory available
- - allocate chunks of memory that are below the size of a memory page.
- - allocate chunks of memory only when the buffer pool is almost empty
- - free chunks of memory if the buffer pool exceeds an upper limit.

Pseudo Code;
So the basic merge sort needs to be modified in a couple of ways to make it work

- First the FileIO thread will load buffers into the input buffers; these buffers have unsorted data inside chunks
- using multiple threads and if the instage buffer is running low, perform intra sort on each chunk, ie sort the data in the chunks and move it to the instage buffer pool marking them as a sequence of 1 chunk, ie mark the last chunk in the sequence with an end flag.
- using multiple threads, each thread will take (waiting if necessary) 2 input sequences out of the instage buffers
- -- remove the instage sequence from the instage buffer
- -- perform inter merge between the next in-line 2 chunks of the input sequences until one of the sequences' final chunks is fully processed
- -- when an input sequence's final chunk is found move all the remaining input chunks over to the output sequence (note the final chunk is left marked final)
- -- once the first output chunk of the new sequence is complete the sequence can be moved into the instage buffer. The final chunk is not needed.
- -- as each input chunk's merge is completed move it to the empty buffer pool.
- -- as each output chunk is completed append it to the output sequence and get a new chunk from the empty buffer pool
- -- if both final chunks of the input sequences are merged the output chunk needs to be marked as final

Thursday, June 3, 2010

In place merge sort is kinda like shooting yourself in the foot before a race.. memory is cheap so the additional movement of data that this thing creates is meaningless. Also this version isn't very efficient because it implements the rotate of the data array via 3 reversal steps.
The alternative is done with a gray counter and a modulus of the counter's remainder (ie the greatest common divisor between the total size, left and right halves)... clearly I'm not about to waste more time on it....

How it Works;
1) divide the list in half
2) perform mergesort on the left and right parts
3) merge the left and right parts
3a) to merge step through the array until a swap with the right is needed
3b) circular rotate the data around in the array by a shift right equal to the size of the right.
3c) set the point to the start of the old left list.

Properties;
Best, Average and Worst case speed is O(nlog(n)).
Memory is 1
In general it is unstable but stable versions do exist.

//compile with g++
#include <iostream>
#include <iomanip>
using namespace std;

void print(int* data, int size);

// Reverses the first `size` ints of `data` in place by swapping the
// outermost pair and walking both ends toward the middle.
void reverse(int* data, int size)
{
  int lo = 0;
  int hi = size - 1;
  while(lo < hi)
    {
      const int held = data[lo];
      data[lo] = data[hi];
      data[hi] = held;
      ++lo;
      --hi;
    }
}

// Sorts the first `size` ints of `data` into ascending order without a
// scratch buffer.
//
// The two halves are sorted recursively and then merged in place:
// `left` walks the front run while `mid` marks the head of the back run.
// Whenever the head of the back run is smaller than data[left], the runs
// [left, mid) and [mid, size) are exchanged with three calls to reverse();
// the two runs then trade roles and the walk carries on.
void inPlaceMergeSort(int* data, int size)
{
  if(size < 2)
    return;

  int mid = size/2;

  inPlaceMergeSort(data,       mid);
  inPlaceMergeSort(&data[mid], size - mid);

  int left = 0;
  while(left != mid)
    {
      if(data[mid] < data[left])
        {
          // rotate: reverse each run on its own, then the whole span
          reverse(&data[left], mid  - left);
          reverse(&data[mid],  size - mid);
          reverse(&data[left], size - left);
          // the old front run now sits at the tail; its head becomes mid
          mid = size - (mid - left);
        }
      else
        { left++; }
    }
}

//test
// Number of elements in the test array.
#define SIZE 20
// Bit-mixing helper used to build the "semi repeated" test data.
// Both arguments are parenthesized so compound expressions expand as written.
#define SCRAMBLE(x, y) ((0xa57b & ~(y)) + ((0x3829 & (x)) << 1))

// Reports whether the first `size` ints of `data` are in non-decreasing order.
// Prints "FAIL!" and returns false if any element is smaller than its
// predecessor; otherwise prints "PASS" and returns true.
bool check(int data[], int size)
{
  int idx = 1;
  while(idx < size && !(data[idx] < data[idx-1]))
    ++idx;

  const bool ordered = !(idx < size);
  std::cout << (ordered ? "PASS" : "FAIL!") << std::endl;
  return ordered;
}

// Writes the first `size` ints of `data` on one line, each right-aligned in
// a 5-character field and followed by a space, then ends the line.
void print(int* data, int size)
{
  int shown = 0;
  while(shown < size)
    {
      std::cout << std::setw(5) << data[shown] << " ";
      ++shown;
    }
  std::cout << std::endl;
}

// Prints `data`, sorts it with inPlaceMergeSort, prints it again and returns
// check()'s verdict. Every step covers exactly `size` elements, so the
// helper honours the length the caller passes in.
bool test(int* data, int size)
{
  print(data, size);
  inPlaceMergeSort(data, size);
  print(data, size);
  return check(data, size);
}

// Test driver: pushes four kinds of input through test() and prints an
// overall verdict ("ALL PASSED" or "FAILED").
// NOTE(review): srand/rand/time/RAND_MAX are used although only <iostream>
// and <iomanip> are included above - presumably <cstdlib>/<ctime> arrive
// transitively; confirm on the target compiler.
int main()
{
  int data[SIZE];
  bool allGood = true;

  // descending run SIZE, SIZE-1, ..., 1
  data[0] = 1;  // overwritten by the fill right below
  for(int slot = 0; slot < SIZE; ++slot)
    data[slot] = SIZE - slot;
  if(!test(data, SIZE))
    allGood = false;

  // scrambled values, each derived from its predecessor; data[0] is the seed
  data[0] = 1;
  for(int slot = 1; slot < SIZE; ++slot)
    data[slot] = SCRAMBLE(slot, data[slot-1]);
  if(!test(data, SIZE))
    allGood = false;

  // every element identical
  for(int slot = 0; slot < SIZE; ++slot)
    data[slot] = 5;
  if(!test(data, SIZE))
    allGood = false;

  // 100 rounds of random values; round r scales rand() into 0..r+1
  srand ( time(NULL) );

  for(int round = 0; round < 100; ++round)
    {
      const float ceiling = static_cast<float>(round+1);
      for(int slot = 0; slot < SIZE; ++slot)
        data[slot] = static_cast<int>(ceiling*(static_cast<float>(rand())/static_cast<float>(RAND_MAX)));
      if(!test(data, SIZE))
        allGood = false;
    }

  std::cout << (allGood ? "ALL PASSED" : "FAILED") << std::endl;
}

Saturday, May 29, 2010

Shell Sort

Shell sort

Basically the idea of shell sort is to move numbers in a large step first, and slowly reduce this step down to 1. The idea resembles simulated annealing. The whole reason for the additional outer loop is to reduce the overall randomness in the initial data by quickly and roughly grouping low value items to the left and high value items to the right.

The inner core of it is insertion sort which will correct the sorting of the data irrespective of what your choice of outer steps are so long as the last step is 1.

How it Works;
1) choose a step by which to do your insertion sort
2) perform insertion sort as normal but using the step to determine which data is side by side.

The understandable version of the algorithm works as follows:
1) break the list of data into rows and columns
2) perform insertion sort on each column
3) repeat with fewer columns until we have a single column.

In my code example I have created an easy to understand implementation of shell sort; it explicitly names the columns and rows

Properties;
Worst case speed is O(n^2) or O(nlog^2n) if you use the special sequence of steps.
Memory is 1
It is Unstable (due to the interleaving of the data several neighbour items are skipped)

//compile with g++
#include <iostream>
#include <iomanip>
using namespace std;

void print(int data[], int size);

// Shell sort of the first `size` ints of `data`, ascending.
// Runs a gapped insertion sort with gaps size/2, size/4, ..., 1; with the
// gap pinned to 1 this is ordinary insertion sort.
void shellSort(int data[], int size)
{
  int gap = size / 2;
  while(gap > 0)
    {
      for(int pos = gap; pos < size; ++pos)
        {
          const int moving = data[pos];
          int hole = pos;
          // slide larger elements of the same gap-chain up by one gap
          for(; hole >= gap && data[hole-gap] > moving; hole -= gap)
            data[hole] = data[hole-gap];

          if(hole != pos)
            data[hole] = moving;
        }
      gap = gap / 2;
    }
}

// Shell sort written out as "rows and columns": for each step the array is
// viewed as a grid with `step` columns (element index = row*step + column)
// and every column is insertion-sorted on its own. The step halves each
// pass until the final pass with a single column.
//
// The row count of a column is the number of indices column, column+step,
// ... that are below `size`, i.e. ceil((size-column)/step); every row from
// 1 up to that count is visited so the last row of a column is not skipped
// when (size-column) is not a multiple of step.
void shellSortUnderstandable(int data[], int size)
{
  for(int step = size/2; step > 0; step = step/2)
    {
      for(int column = 0; column < step; column++)
        {
          int rows = (size - column + step - 1) / step;
          for(int row = 1; row < rows; row++)
            {
              int tmp = data[row*step + column];

              // shift larger entries of this column down one row
              int srow;
              for(srow = row-1; srow >= 0; srow--)
                if(data[srow*step + column] > tmp)
                  data[(srow+1)*step + column] = data[(srow*step) + column];
                else
                  break;

              if(srow+1 != row)
                data[(srow+1)*step + column] = tmp;
            }
        }
    }
}

//test
// Bit-mixing helper used to build the "semi repeated" test data.
// Both arguments are parenthesized so compound expressions expand as written.
#define SCRAMBLE(x, y) ((0xa57b & ~(y)) + ((0x3829 & (x)) << 1))

// Reports whether the first `size` ints of `data` are in non-decreasing order.
// Prints "FAIL!" and returns false if any element is smaller than its
// predecessor; otherwise prints "PASS" and returns true.
bool check(int data[], int size)
{
  int idx = 1;
  while(idx < size && !(data[idx] < data[idx-1]))
    ++idx;

  const bool ordered = !(idx < size);
  std::cout << (ordered ? "PASS" : "FAIL!") << std::endl;
  return ordered;
}

// Writes the first `size` ints of `data` on one line, each right-aligned in
// a 5-character field and followed by a space, then ends the line.
void print(int data[], int size)
{
  int shown = 0;
  while(shown < size)
    {
      std::cout << std::setw(5) << data[shown] << " ";
      ++shown;
    }
  std::cout << std::endl;
}

bool test(int data[], int size)
{
  print(data, size);
  shellSort(data,size);
  print(data, size);
  return check(data, size);
}

#define SIZE 300
int main()
{
  int data[SIZE];
  bool pass = true;

  //easy data
  data[0] = 1;
  for(int i = 0; i < SIZE; i++)
    data[i] = SIZE - i;
  pass &= test(data, SIZE);

  //semi repeated data
  data[0] = 1;
  for(int i = 1; i < SIZE; i++)
    data[i] = SCRAMBLE(i, data[i-1]);
  pass &= test(data, SIZE);

  //the sort killer!
  for(int i = 0; i < SIZE; i++)
    data[i] = 5;
  pass &= test(data, SIZE);

  //and some randoms to catch anything i missed
  srand ( time(NULL) );

  for(int j = 1; j < 100; j++)
    {
      for(int i = 0; i < 3*j; i++)
        data[i] = (int)((float)(j+1)*((float)rand()/(float)RAND_MAX));
      pass &= test(data, 3*j);
    }

  if(pass)
    cout << "ALL PASSED" << endl;
  else
    cout << "FAILED" << endl;
}

Insertion Sort

How it Works;
0) Sorted data will be at the front of the array, starting 1 place after the start of the data,
1) take your current value out of the array, creating an empty point
2) set your insertion point to the tail of the sorted data.
3a) if the current item is lower than the insertion point item, move the insertion point's item up to the empty place in the array and repeat with the next possible insertion point
3b) if the current item is equal to or greater than the insertion point place the current item back into the arrays empty place and repeat from 1 with the next unsorted data.

Properties;
Worst and average case speed is O(n^2)
Memory is 1
It is Stable (unless you implement it badly)

//compile with g++
#include <iostream>
#include <iomanip>
using namespace std;

void print(int data[], int size);

// Insertion sort of the first `size` ints of `data`, ascending.
// Elements before `next` are already ordered; each new element is lifted
// out and larger neighbours slide up one place until its slot is found.
void insertionSort(int data[], int size)
{
  for(int next = 1; next < size; ++next)
    {
      const int lifted = data[next];
      int slot = next;
      for(; slot > 0 && data[slot-1] > lifted; --slot)
        data[slot] = data[slot-1];

      // only write back when the element actually moved
      if(slot != next)
        data[slot] = lifted;
    }
}

//test
// Number of elements in the test array.
#define SIZE 20
// Bit-mixing helper used to build the "semi repeated" test data.
// Both arguments are parenthesized so compound expressions expand as written.
#define SCRAMBLE(x, y) ((0xa57b & ~(y)) + ((0x3829 & (x)) << 1))

// Reports whether the first `size` ints of `data` are in non-decreasing order.
// Prints "FAIL!" and returns false if any element is smaller than its
// predecessor; otherwise prints "PASS" and returns true.
bool check(int data[], int size)
{
  int idx = 1;
  while(idx < size && !(data[idx] < data[idx-1]))
    ++idx;

  const bool ordered = !(idx < size);
  std::cout << (ordered ? "PASS" : "FAIL!") << std::endl;
  return ordered;
}

// Writes the first `size` ints of `data` on one line, each right-aligned in
// a 5-character field and followed by a space, then ends the line.
void print(int data[], int size)
{
  int shown = 0;
  while(shown < size)
    {
      std::cout << std::setw(5) << data[shown] << " ";
      ++shown;
    }
  std::cout << std::endl;
}

// Prints `data`, sorts it with insertionSort, prints it again and returns
// check()'s verdict. Every step covers exactly `size` elements, so the
// helper honours the length the caller passes in.
bool test(int data[], int size)
{
  print(data, size);
  insertionSort(data, size);
  print(data, size);
  return check(data, size);
}

// Test driver: pushes four kinds of input through test() and prints an
// overall verdict ("ALL PASSED" or "FAILED").
// NOTE(review): srand/rand/time/RAND_MAX are used although only <iostream>
// and <iomanip> are included above - presumably <cstdlib>/<ctime> arrive
// transitively; confirm on the target compiler.
int main()
{
  int data[SIZE];
  bool allGood = true;

  // descending run SIZE, SIZE-1, ..., 1
  data[0] = 1;  // overwritten by the fill right below
  for(int slot = 0; slot < SIZE; ++slot)
    data[slot] = SIZE - slot;
  if(!test(data, SIZE))
    allGood = false;

  // scrambled values, each derived from its predecessor; data[0] is the seed
  data[0] = 1;
  for(int slot = 1; slot < SIZE; ++slot)
    data[slot] = SCRAMBLE(slot, data[slot-1]);
  if(!test(data, SIZE))
    allGood = false;

  // every element identical
  for(int slot = 0; slot < SIZE; ++slot)
    data[slot] = 5;
  if(!test(data, SIZE))
    allGood = false;

  // 100 rounds of random values; round r scales rand() into 0..r+1
  srand ( time(NULL) );

  for(int round = 0; round < 100; ++round)
    {
      const float ceiling = static_cast<float>(round+1);
      for(int slot = 0; slot < SIZE; ++slot)
        data[slot] = static_cast<int>(ceiling*(static_cast<float>(rand())/static_cast<float>(RAND_MAX)));
      if(!test(data, SIZE))
        allGood = false;
    }

  std::cout << (allGood ? "ALL PASSED" : "FAILED") << std::endl;
}

Selection Sort

The algorithm works as follows:
How it Works;
1) Find the minimum value in the list
2) Swap the minimum item with first item
3) Repeat for all remaining positions

Properties;
Worst and average case speed is O(n^2)
Memory is 1
It is not Stable as implemented below (the swap can move an item past other items of equal value); stable variants exist

//compile with g++
#include <iostream>
#include <iomanip>
using namespace std;

void print(int data[], int size);

// Selection sort of the first `size` ints of `data`, ascending.
// For each position, finds the smallest remaining element and swaps it
// into place (no swap when it is already there).
void selectionSort(int data[], int size)
{
  int head = 0;
  while(head < size)
    {
      int smallest = head;
      for(int probe = head; probe < size; ++probe)
        smallest = (data[probe] < data[smallest]) ? probe : smallest;

      if(smallest != head)
        {
          const int carried = data[smallest];
          data[smallest] = data[head];
          data[head] = carried;
        }
      ++head;
    }
}

//test
// Number of elements in the test array.
#define SIZE 20
// Bit-mixing helper used to build the "semi repeated" test data.
// Both arguments are parenthesized so compound expressions expand as written.
#define SCRAMBLE(x, y) ((0xa57b & ~(y)) + ((0x3829 & (x)) << 1))

// Reports whether the first `size` ints of `data` are in non-decreasing order.
// Prints "FAIL!" and returns false if any element is smaller than its
// predecessor; otherwise prints "PASS" and returns true.
bool check(int data[], int size)
{
  int idx = 1;
  while(idx < size && !(data[idx] < data[idx-1]))
    ++idx;

  const bool ordered = !(idx < size);
  std::cout << (ordered ? "PASS" : "FAIL!") << std::endl;
  return ordered;
}

// Writes the first `size` ints of `data` on one line, each right-aligned in
// a 5-character field and followed by a space, then ends the line.
void print(int data[], int size)
{
  int shown = 0;
  while(shown < size)
    {
      std::cout << std::setw(5) << data[shown] << " ";
      ++shown;
    }
  std::cout << std::endl;
}

// Prints `data`, sorts it with selectionSort, prints it again and returns
// check()'s verdict. Every step covers exactly `size` elements, so the
// helper honours the length the caller passes in.
bool test(int data[], int size)
{
  print(data, size);
  selectionSort(data, size);
  print(data, size);
  return check(data, size);
}

// Test driver: pushes four kinds of input through test() and prints an
// overall verdict ("ALL PASSED" or "FAILED").
// NOTE(review): srand/rand/time/RAND_MAX are used although only <iostream>
// and <iomanip> are included above - presumably <cstdlib>/<ctime> arrive
// transitively; confirm on the target compiler.
int main()
{
  int data[SIZE];
  bool allGood = true;

  // descending run SIZE, SIZE-1, ..., 1
  data[0] = 1;  // overwritten by the fill right below
  for(int slot = 0; slot < SIZE; ++slot)
    data[slot] = SIZE - slot;
  if(!test(data, SIZE))
    allGood = false;

  // scrambled values, each derived from its predecessor; data[0] is the seed
  data[0] = 1;
  for(int slot = 1; slot < SIZE; ++slot)
    data[slot] = SCRAMBLE(slot, data[slot-1]);
  if(!test(data, SIZE))
    allGood = false;

  // every element identical
  for(int slot = 0; slot < SIZE; ++slot)
    data[slot] = 5;
  if(!test(data, SIZE))
    allGood = false;

  // 100 rounds of random values; round r scales rand() into 0..r+1
  srand ( time(NULL) );

  for(int round = 0; round < 100; ++round)
    {
      const float ceiling = static_cast<float>(round+1);
      for(int slot = 0; slot < SIZE; ++slot)
        data[slot] = static_cast<int>(ceiling*(static_cast<float>(rand())/static_cast<float>(RAND_MAX)));
      if(!test(data, SIZE))
        allGood = false;
    }

  std::cout << (allGood ? "ALL PASSED" : "FAILED") << std::endl;
}

Bubble Sort

Bubble sort

How it works;
Basically bubble sort works by stepping through the data and swapping the current and next data items if they are out of order.

Properties;
Worst case speed is O(n^2); the best case (already sorted data) is O(n), because the code below stops after the first pass that makes no swap
Memory is 1 (the temp swap location)

//compile with g++
#include <iostream>
#include <iomanip>
using namespace std;

void print(int data[], int size);

// Bubble sort of the first `size` ints of `data`, ascending.
// Sweeps over neighbouring pairs, swapping any that are out of order, and
// repeats until a whole sweep makes no swap.
void bubblesort(int data[], int size)
{
  bool swapped;
  do
    {
      swapped = false;
      for(int at = 1; at < size; ++at)
        {
          if(data[at-1] > data[at])
            {
              const int held = data[at-1];
              data[at-1] = data[at];
              data[at] = held;
              swapped = true;
            }
        }
    }
  while(swapped);
}

//test
// Number of elements in the test array.
#define SIZE 20
// Bit-mixing helper used to build the "semi repeated" test data.
// Both arguments are parenthesized so compound expressions expand as written.
#define SCRAMBLE(x, y) ((0xa57b & ~(y)) + ((0x3829 & (x)) << 1))

// Reports whether the first `size` ints of `data` are in non-decreasing order.
// Prints "FAIL!" and returns false if any element is smaller than its
// predecessor; otherwise prints "PASS" and returns true.
bool check(int data[], int size)
{
  int idx = 1;
  while(idx < size && !(data[idx] < data[idx-1]))
    ++idx;

  const bool ordered = !(idx < size);
  std::cout << (ordered ? "PASS" : "FAIL!") << std::endl;
  return ordered;
}

// Writes the first `size` ints of `data` on one line, each right-aligned in
// a 5-character field and followed by a space, then ends the line.
void print(int data[], int size)
{
  int shown = 0;
  while(shown < size)
    {
      std::cout << std::setw(5) << data[shown] << " ";
      ++shown;
    }
  std::cout << std::endl;
}

bool test(int data[], int size)
{
  print(data, SIZE);
  bubblesort(data,SIZE);
  print(data, SIZE);
  return check(data, SIZE);
}

int main()
{
  int data[SIZE];
  bool pass = true;

  //easy data
  data[0] = 1;
  for(int i = 0; i < SIZE; i++)
    data[i] = SIZE - i;
  pass &= test(data, SIZE);

  //semi repeated data
  data[0] = 1;
  for(int i = 0; i < SIZE; i++)
    data[i] = SCRAMBLE(i, data[i-1]);
  pass &= test(data, SIZE);

  //the sort killer!
  for(int i = 0; i < SIZE; i++)
    data[i] = 5;
  pass &= test(data, SIZE);

  //and some randoms to catch anything i missed
  srand ( time(NULL) );

  for(int j = 0; j < 100; j++)
    {
      for(int i = 0; i < SIZE; i++)
        data[i] = (int)((float)(j+1)*((float)rand()/(float)RAND_MAX));
      pass &= test(data, SIZE);
    }

  if(pass)
    cout << "ALL PASSED" << endl;
  else
    cout << "FAILED" << endl;
}

Friday, May 28, 2010

Quick sort

UPDATE: I noticed a lot of traffic coming to this entry.. since it seems popular I'll clean up and simplify the code some time later this week (currently it's 2010/06/17). Then repost it in a new entry and note the new link here..

Quick sort..

How it works.
1) First choose a pivot value.
2) Then divide the list into 2 sections; do the division by finding the left most item that exceeds the pivot and swapping it with the right most item that is less than the pivot.
3) Then recurse on the 2 halves.

There are several hidden fail points
1) if a bad pivot value is chosen then the list wont divide into 2
2) if the data becomes too similar, or even completely identical then the chances of a bad pivot increase.

As a result of the fail points care needs to be taken to correctly find the right and left swap items and where the list is to be split.

Speed:
Quick sort is O(nlog(n)) on average. and O(n^2) in the worst case.

//compile with g++
#include <iostream>
#include <iomanip>
using namespace std;

void print(int data[], int size);

// Quick sort over the half-open section data[sectionLow .. sectionHigh),
// done in place. A trace of every partition step is written to cout.
//   data        - array that holds the section
//   sectionLow  - index of the first element of the section
//   sectionHigh - index one past the last element (sectionHigh-1 is the
//                 last element touched)
void quicksort(int data[], int sectionLow, int sectionHigh)
{
  // a section with fewer than two elements needs no work
  if(sectionLow >= sectionHigh-1)
    return;

  int low  = sectionLow;
  int high = sectionHigh-1;
  int mid  = (low+high)/2;

  // the pivot is the value found at the middle index of the section
  int pivotValue = data[mid];

  while(low < high)
    {
      while(
            (low < high) &&
            (data[low]  < pivotValue) &&
            (data[high] > pivotValue)
            )
        {
          //O(N^2) avoidance.. move both indices together, otherwise one of
          // them starts to dominate when the data gets too similar
          low++;
          high--;
        }
      
      //at most one of these still has work to do.. it runs on to its stop
      while((low < high) && (data[low]  < pivotValue)) low++;
      while((low < high) && (data[high] > pivotValue)) high--;

      // trace: where the two indices stopped
      cout << low << "<>" << high << " ";

      //do the swap
      if(low < high)
        {
          int temp = data[low];
          data[low]  = data[high];
          data[high] = temp;

          //step past the swapped pair, otherwise the loop gets stuck when both are pivotValues
          low++;
          high--;
        }
    }

  cout << endl;

  // trace: final position of low
  cout << low << " ";
  //quick sort boundary conditions are problematic
  //make certain of where the split falls: data[low] goes to the left part
  //only when it is below the pivot
  if(data[low] < pivotValue)   mid = low+1;
  else                         mid = low;

  // trace: split point before clamping
  cout << mid << " ";

  //make certain that this isn't the start or end edge or we will end in an infinite loop
  if(mid == sectionHigh) mid--;
  if(mid == sectionLow)  mid++;

  // trace: split point after clamping
  cout << mid << endl;

  cout << "stage result: " << sectionLow
       << "<->" << mid
       << "<->" << sectionHigh << " pivot:" << pivotValue << endl;
  print(&(data[sectionLow]), sectionHigh-sectionLow);

  // recurse on [sectionLow, mid) and [mid, sectionHigh)
  quicksort(data, sectionLow, mid);
  quicksort(data, mid, sectionHigh);
}

//test
// Number of elements in the test array.
#define SIZE 20
// Bit-mixing helper used to build the "semi repeated" test data.
// Both arguments are parenthesized so compound expressions expand as written.
#define SCRAMBLE(x, y) ((0xa57b & ~(y)) + ((0x3829 & (x)) << 1))

// Reports whether the first `size` ints of `data` are in non-decreasing order.
// Prints "FAIL!" and returns false if any element is smaller than its
// predecessor; otherwise prints "PASS" and returns true.
bool check(int data[], int size)
{
  int idx = 1;
  while(idx < size && !(data[idx] < data[idx-1]))
    ++idx;

  const bool ordered = !(idx < size);
  std::cout << (ordered ? "PASS" : "FAIL!") << std::endl;
  return ordered;
}

// Writes the first `size` ints of `data` on one line, each right-aligned in
// a 5-character field and followed by a space, then ends the line.
void print(int data[], int size)
{
  int shown = 0;
  while(shown < size)
    {
      std::cout << std::setw(5) << data[shown] << " ";
      ++shown;
    }
  std::cout << std::endl;
}

bool test(int data[], int size)
{
  print(data, SIZE);
  quicksort(data,0, SIZE);
  print(data, SIZE);
  return check(data, SIZE);
}

int main()
{
  int data[SIZE];
  bool pass = true;

  //easy data
  data[0] = 1;
  for(int i = 0; i < SIZE; i++)
    data[i] = SIZE - i;
  pass &= test(data, SIZE);

  //semi repeated data
  data[0] = 1;
  for(int i = 0; i < SIZE; i++)
    data[i] = SCRAMBLE(i, data[i-1]);
  pass &= test(data, SIZE);

  //the sort killer!
  for(int i = 0; i < SIZE; i++)
    data[i] = 5;
  pass &= test(data, SIZE);

  //and some randoms to catch anything i missed
  srand ( time(NULL) );

  for(int j = 0; j < 100; j++)
    {
      for(int i = 0; i < SIZE; i++)
        data[i] = (int)((float)(j+1)*((float)rand()/(float)RAND_MAX));
      pass &= test(data, SIZE);
    }
  
  if(pass)
    cout << "ALL PASSED" << endl;
  else
    cout << "FAILED" << endl;
}