Saturday, August 13, 2016

google interview: efficiently find a number in a partial sorted list

// http://www.impactinterview.com/2009/10/140-google-interview-questions/#software_engineer
// You are given a list of numbers.  When you reach the end of the list you
// will come back to the beginning of the list (a circular list).  Write the
// most efficient algorithm to find the minimum # in this list.  Find any given
// # in the list.  The numbers in the list are always increasing but you
// don’t know where the circular list begins, ie: 38, 40, 55, 89, 6, 13, 20,
// 23, 36.

// First of all note very carefully this is for a *unidirectional circular linked 
// list* if we alter the design of the data structure there are much better ways 
// to do this.. but i believe that is out of bounds as a potential answer.

// hmm..  essentially the find min number algorithm is a component of the find 
// any number algorithm..  Since the question isn't written as find the min, 
// then fix the list start and then find any # ill assume that it was meant as 
// a helper step to get to the find algorithm..  so in the interest of getting 
// it done lets just cut to the find any # algorithm and code a few evolutions 
// showing efficiency improvements..

#include <iostream>
#include <vector>
#include <chrono>
#include <functional>

// ****************************************
// *************** CYCLIC LIST ************
// ****************************************

class List
{
    std::vector<int> data_;
    int offset_;

public:
    class Iterator
    {
        List* list_;
        int index_;

    public:
        Iterator() :
            list_(NULL),
            index_(-1)
        {}

        Iterator(List* list) :
            list_(list),
            index_(list->offset_)
        {}

        // WARNING DO NOT USE in find.. that was not spec'ed
        // this is for testing only
        Iterator(List* list, int offset) :
            list_(list),
            index_(offset)
        {}

        Iterator& operator++()
        {
            index_ = (index_ + 1) % list_->data_.size();
            return *this;
        }

        Iterator operator+(int delta)
        {
            int offset = (index_ + delta) % list_->data_.size();
            while (offset < 0) offset += list_->data_.size();

            return Iterator(list_, offset);
        }

        // WARNING DO NOT UE in find.. that was not spec'ed
        // this is for testing only
        Iterator& operator--()
        {
            index_ = (list_->data_.size() + index_ - 1) % list_->data_.size();
            return *this;
        }

        int operator*()
        {
            return list_->data_[index_];
        }

        bool operator==(const Iterator& other) const
        {
            return other.list_ == list_ and
                other.index_ == index_;
        }

        bool operator!=(const Iterator& other) const
        {
            return not (other == *this);
        }

        bool valid() const
        {
            return list_ != NULL;
        }
    };

    List(std::initializer_list<int> data,
         int offset = 0) :
        data_(data),
        offset_(offset)
    {}

    List(int size) :
        data_(size),
        offset_(rand() % size)
    {
        // random fill
        int value = 0;
        for (int i = 0; i < size; ++i)
        {
            value += 2 + rand() % 5;
            data_[i] = value;
        }
    }

    bool empty() const
    {
        return data_.empty();
    }

    Iterator begin()
    {
        return Iterator(this);
    }

    Iterator random()
    {
        return Iterator(this, rand() % data_.size());
    }
};

// ****************************************
// *************** FIND NAIVE *************
// ****************************************

List::Iterator find_naive(List& data, int target)
{
    if (data.empty()) return List::Iterator();

    List::Iterator start = data.begin();
    List::Iterator i = start;

    do
    {
        if (*i == target) return i;
        ++i;
    }
    while (i != start);

    return List::Iterator();
}

// ****************************************
// ************* FIND NAIVE V2 ************
// ****************************************

// Attempt to simply the code a bit so that we handle the cyclic iteration and
// partial sorted issue cleaner for latter scaling

List::Iterator find_naive2(List& data, int target)
{
    if (data.empty()) return  List::Iterator();

    //termination is an issue because start *is* the end ..
    // simplify this by checking the start..
    List::Iterator end = data.begin();
    if (target == *end) return end;

    // then start at the first item after and now "start" is really the end
    List::Iterator i = end + 1;

    while (i != end and *i != target) ++i;
    if (*i == target) return i;

    return  List::Iterator();
}

// Of course you could try a divide and conquer idea..  but your dealing with a
// list that you want to divide in half.  Dividing it it half means you need to
// step the iterator the mid point..  So assuming you know the size of the list
// the total search on a fully sorted list will *always* cost u n/2 + n/4 + n/8
// ...  moves. Which totals to n-1 (ie if n=8 is 4+2+1=7)..  So the performance
// is actually worst on average for the naive version ... of course this might 
// not be true because the cost of iterators might be very low compared to the 
// compare.. but i dont think it is... so im skipping it..

// ****************************************
// ******** SKIP TO SORTED SECTION ********
// ****************************************

// What we can really do is try to take advantage of the sorted order of data
// and terminate early..  In order to do that what you have to do is skip to
// the correct sorted part for your target and then search... (hence the find
// min number stuff before)

// Keep in mind this gets much less robust to problems with the data..  ie
// if the list isnt sort (and parted) before hand..

List::Iterator find_skip_start(List& data, int target)
{
    // handle the empty dataset case..
    if (data.empty()) return  List::Iterator();

    //termination is an issue because start *is* the end ..
    // simplify this by checking the start..
    List::Iterator end = data.begin();
    if (target == *end) return end;

    // then start at the first item after and now "start" is really the end
    List::Iterator i = end + 1;

    // if the target is in the second part
    if (target < *end)
    {
        // skip to the second half
        int split_edge = *end;
        while (*i >= split_edge and i != end) ++i;
    }

    // then search for the target
    while (*i < target and i != end) ++i;

    if (*i == target) return i;
    return  List::Iterator();
}

// ****************************************
// ******* DOUBLE SKIP IN SECTION *********
// ****************************************

// Now because its sorted you dont have to check items every time you move
// forward.  You can step and then roll back if you went to far..  this makes
// some big assumptions about the cost of iterator movement vs integer
// compares.. my "List" class isnt great but it works out.. and in 
// theory should cut out every second compare..

List::Iterator find_double_skip(List& data, int target)
{
    // handle the empty dataset case..
    if (data.empty()) return  List::Iterator();

    //termination is an issue because start *is* the end ..
    // simplify this by checking the start..
    List::Iterator end = data.begin();
    if (target == *end) return end;

    // then start at the first item after and now "start" is really the end
    List::Iterator i  = end + 1;

    if (target < *end)
    {
        int split_edge = *end;

        // double skip to second half
        List::Iterator iprev;
        while (*i >= split_edge and i != end)
        {
            ++i;
            iprev = i;
            if (i != end) ++i;
        }

        // check if we need to back up the double step.
        if (iprev.valid() and *iprev < split_edge) i = iprev;
    }

    {
        // double skip till target found or exceeded
        List::Iterator iprev;
        while (*i < target and i != end)
        {
            ++i;
            iprev = i;
            if (i != end) ++i;
        }

        // check if found the target at either of the pointers..
        if (iprev.valid() and *iprev == target) return iprev;
        if (*i == target) return i;
    }

    return  List::Iterator();
}

// ****************************************
// ********** N SKIP IN SECTION ***********
// ****************************************

// Optimization..  of course if u can do better with steps of 2 then u can
// generalize it to any number of steps..  for example using steps of 20..  now
// we are really playing balance games..  trading the speeds of various
// comparisons, assignments etc..  Then u have to tune the size of skip for
// your machine(s)

List::Iterator find_n_skip(List& data, int target)
{
    const int skip = 20;

    // handle the empty dataset case..
    if (data.empty()) return  List::Iterator();

    //termination is an issue because start *is* the end ..
    // simplify this by checking the start..
    List::Iterator end = data.begin();
    if (target == *end) return end;

    // then start at the first item after and now "start" is really the end
    List::Iterator i  = end + 1;

    if (target < *end)
    {
        List::Iterator iprev = i;
        int split_edge = *end;
        // skip to second half
        while (*i >= split_edge and i != end)
        {
            ++i;
            iprev = i;
            int steps = skip;
            while (--steps > 0 and i != end) ++i;
        }

        // when found return to the last known ok point and search forward 1 by 1
        i = iprev;
        while (*i >= split_edge and i != end) ++i;
    }

    {
        List::Iterator iprev = i;
        while (*i < target and i != end)
        {
            ++i;
            iprev = i;
            int steps = skip;
            while (--steps > 0 and i != end) ++i;
        }

        // when found return to the last known ok point and search forward 1 by 1
        i = iprev;
        while (*i < target and i != end) ++i;

        if (*i == target) return i;
    }

    return  List::Iterator();
}

void test(std::string dataTag,
          std::string algoTag,
          std::function<List::Iterator (List& , int)> func,
          List& data,
          int target,
          bool hasTarget)
{
    auto start = std::chrono::system_clock::now();
    List::Iterator result = func(data, target);
    auto end   = std::chrono::system_clock::now();

    std::cout << algoTag << " for " << dataTag << ": ";
    if (not result.valid())
        std::cout << (hasTarget ? "FAIL" : "pass");
    else
        std::cout << ((hasTarget and (target == *result)) ? "pass" : "FAIL");

    std::cout << " -- time:"
              << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count()
              << " nSec" << std::endl;
}

void testSet(std::string tag,
             List& data,
             int target,
             bool hasTarget)
{
    test(tag, "naive ",  find_naive,       data, target, hasTarget);
    test(tag, "naive2",  find_naive2,      data, target, hasTarget);
    test(tag, "skip  ",  find_skip_start,  data, target, hasTarget);
    test(tag, "2xskip",  find_double_skip, data, target, hasTarget);
    test(tag, "nskip ",  find_n_skip,      data, target, hasTarget);
    std::cout << "\n";
}

int main()
{
    srand(std::chrono::system_clock::now().time_since_epoch().count());

    {
        List list({10,11,12,1,2,3,4,5,6,7,8});
        testSet("small     ", list, 4, true);
    }

    {
        List list(1000000);
        testSet("big rand  ", list, *list.random(), true);
        testSet("big nrand ", list, *list.random()-1, false);  // note that  random always is +2 ~ +7 so -1 means i will not be in the data..

        testSet("big early ", list, *(list.begin() + 10), true);    // i hope.. its random so it might not be..
        testSet("big nearly", list, *(list.begin() + 10)-1, false); // check early exit in first half on fail

        testSet("big last  ", list, *(--list.begin()), true);
        testSet("big nlast ", list, -1, false);
    }
}



naive  for small     : pass -- time:3207 nSec
naive2 for small     : pass -- time:2175 nSec
skip   for small     : pass -- time:1996 nSec
2xskip for small     : pass -- time:2151 nSec
nskip  for small     : pass -- time:4232 nSec

naive  for big rand  : pass -- time:14229998 nSec
naive2 for big rand  : pass -- time:13228995 nSec
skip   for big rand  : pass -- time:12786059 nSec
2xskip for big rand  : pass -- time:10874261 nSec
nskip  for big rand  : pass -- time:9004453 nSec

naive  for big nrand : pass -- time:43750054 nSec
naive2 for big nrand : pass -- time:37382249 nSec
skip   for big nrand : pass -- time:17939681 nSec
2xskip for big nrand : pass -- time:15509062 nSec
nskip  for big nrand : pass -- time:12927864 nSec

naive  for big early : pass -- time:738 nSec
naive2 for big early : pass -- time:611 nSec
skip   for big early : pass -- time:632 nSec
2xskip for big early : pass -- time:708 nSec
nskip  for big early : pass -- time:1064 nSec

naive  for big nearly: pass -- time:28980196 nSec
naive2 for big nearly: pass -- time:27551947 nSec
skip   for big nearly: pass -- time:1034 nSec
2xskip for big nearly: pass -- time:719 nSec
nskip  for big nearly: pass -- time:1217 nSec

naive  for big last  : pass -- time:27341796 nSec
naive2 for big last  : pass -- time:27780499 nSec
skip   for big last  : pass -- time:28165415 nSec
2xskip for big last  : pass -- time:24593599 nSec
nskip  for big last  : pass -- time:22455148 nSec

naive  for big nlast : pass -- time:27195820 nSec
naive2 for big nlast : pass -- time:26584976 nSec
skip   for big nlast : pass -- time:8640245 nSec
2xskip for big nlast : pass -- time:7304119 nSec
nskip  for big nlast : pass -- time:6601818 nSec

No comments:

Post a Comment