The Coding Slim Jim: google interview: find number in multi disk list better than O(logn)

// http://www.impactinterview.com/2009/10/140-google-interview-questions/#software_engineer
//
// Find or determine non existence of a number in a sorted list
// of N numbers where the numbers range over M, M>> N and N large
// enough to span multiple disks. Algorithm to beat O(log n) bonus
// points for constant time algorithm.

// the const time algo is called a "bloom filter" if you work with large
// databases you (hopefully) learn this..  what a bloom filter does on insert
// is hash your key in 3 ways and then light up flags in 3 indexes..  on lookup
// it hashs again and checks the 3 places..  if the any one the flags is off
// then you have a item that doesnt exist..  otherwise commence your normal
// logN search

#include <iostream>
#include <vector>
#include <stdint.h>
#include <memory>
#include <cstring>
#include <cstdlib>
#include <chrono>

#define BLOOMFILTER

class PagedVector
{
public:
    typedef uint64_t                Key;         // ideally this would be a template param
    typedef std::vector<Key>        Keys;
    typedef std::shared_ptr<Keys>   KeysHandle;
    typedef std::vector<KeysHandle> PagedKeys;

    const uint32_t filterSize = 256;             // ideally this would be tunable to reduce misses/mem use
    const uint32_t filterMask = filterSize - 1;

private:
    std::size_t threashold_;

    PagedKeys pages_;

    std::vector<bool> presence_[3]; // the bloom filter

    // the divide part of a divide and conqurer search
    template <typename Iterator,
              typename Compare,
              typename Splitter>
    struct Divider
    {
        Compare&  comparer_;
        Splitter& splitter_;

        Divider(Compare&  comparer,
                Splitter& splitter) :
            comparer_(comparer),
            splitter_(splitter)
        {}

        void operator()(Iterator& a,
                        Iterator& b)
        {
            // should never happen unless we are empty..
            if (b==a) return;

            Iterator n = splitter_(a,b);

            if (comparer_(n))
            {
                b = n;
                return;
            }
            a = n + 1;
        }
    };

    // All divide and conqurer searches do 1 thing..  they split the data in
    // some way and repeat until the desired location is found..  In this
    // system we have mutliple uses of the search.. they are used to search and find insertion points

    template <typename Iterator,
              typename Compare,
              typename Splitter>
    Iterator search(Iterator begin,
                    Iterator end,
                    Compare  comparer,
                    Splitter splitter)
    {
        // point at the match or the point to insert in if it doesnt match
        Divider<Iterator, Compare, Splitter> Divider(comparer, splitter);

        // narrows the sub range in which to keep searching when the range is 1 item we are done..
        while (1)
        {
            Divider(begin,end);
            if (end == begin)
                return begin;
        }
    }

    struct FindInPage
    {
        Key target_;

        FindInPage(Key target) :
            target_(target)
        {}

        bool operator()(const PagedKeys::iterator& a)
        {
            // ok slight strangeness here.. pages are a range of numers
            //  t < [b,e] then true
            //  t > [b,e] then false
            // but
            // t > b, t < e its equal..  so it has to be absolete less than!

            const KeysHandle& pageHandle = *a;
            if (pageHandle->size() > 0)
            {
                return target_ <= (*pageHandle)[pageHandle->size()-1];

            }
            return true; // empty... should be the last if it exists..
        }
    };

    struct SplitPage
    {
        Key target_;

        SplitPage()
        {}

        PagedKeys::iterator operator()(const PagedKeys::iterator& a,
                                       const PagedKeys::iterator& b)
        {
            std::size_t offset = (b-a)/2;
            return a + offset;
        }
    };

    struct FindInVector
    {
        Key target_;

        FindInVector(Key target) :
            target_(target)
        {}

        bool operator()(const Keys::iterator& a)
        {
            return target_ <= (*a);
        }
    };

    struct SplitVector
    {
        Key target_;

        SplitVector(Key target) :
            target_(target)
        {}

        Keys::iterator operator()(const Keys::iterator& a,
                                  const Keys::iterator& b)
        {
            // std::size_t offset = (b-a)/2;
            // return a + offset;

            if ((b-a) <= 2)
            {
                std::size_t offset = (b-a)/2;
                return a + offset;
            }

            Key high = *(b-1);
            Key low  = *a;

            // since we have the values...
            if (target_ <= low)  return a;
            if (target_ >= high) return b-1;

            double splitPercent;
            splitPercent = static_cast<double>(target_ - low) / static_cast<double>(high-low);
            splitPercent = (splitPercent < 0.0) ? 0.0 : splitPercent;

            std::size_t range  = (b-a);
            std::size_t offset = range * splitPercent;

            return  a + offset;
        }
    };

    // retun a fax iterator to the page and page offset where the item is or should be
    std::pair<PagedKeys::iterator, Keys::iterator> find(Key target)
    {
        std::pair<PagedKeys::iterator, Keys::iterator> iter;

        iter.first = search(pages_.begin(),
                            pages_.end(),
                            FindInPage(target),
                            SplitPage());

        if (iter.first == pages_.end())
            iter.first = pages_.end() - 1;

        KeysHandle& page = *(iter.first);

        // find the location to add into the list
        iter.second = search(page->begin(),
                             page->end(),
                             FindInVector(target),
                             SplitVector(target));
        return iter;
    }

public:
    PagedVector(std::size_t threashold) :
        threashold_(threashold),
        pages_(),
        presence_()
    {
        for (int i = 0; i < 3; ++i)
            presence_[i].resize(filterSize);

        // make it a bit more easy on the boarder conditions
        KeysHandle newPage(new Keys);
        pages_.push_back(newPage);
    }

    // add the target to the paged data struture
    void insert(Key target)
    {
#ifdef BLOOMFILTER
        // not the best way but assume we are doing a item by item insert
        uint64_t hash = std::hash<Key>()(target);

        // light the bloomfilters presence bits according to the hash
        presence_[0][ hash        & filterMask] = true;
        presence_[1][(hash >> 8)  & filterMask] = true;
        presence_[2][(hash >> 16) & filterMask] = true;
#endif

        // find the relevent part of the list
        std::pair<PagedKeys::iterator, Keys::iterator> iter = find(target);

        KeysHandle& page = *(iter.first);

        page->insert(iter.second, target);

        // then threshold check.
        if (page->size() > threashold_)
        {
            // ok over the threshold.. so divide the list in half
            KeysHandle newPage(new Keys);
            uint32_t spliceAt = page->size() / 2;
            uint32_t sizeAfter = page->size() - spliceAt;
            newPage->resize(sizeAfter);
            std::memcpy(&((*newPage)[0]), &((*page)[spliceAt]), sizeAfter * sizeof(Key));
            page->resize(spliceAt);

            pages_.insert(iter.first + 1, newPage);
        }
    }

    bool exists(Key target)
    {
#ifdef BLOOMFILTER
        uint64_t hash = std::hash<Key>()(target);

        // a quick O(1) pre check before the more costly real check
        if (not (presence_[0][ hash        & filterMask] and
                 presence_[1][(hash >> 8)  & filterMask] and
                 presence_[2][(hash >> 16) & filterMask]))
            return false;
#endif

        // ok technically the question was about this.. so no std:: here..
        // basically have to implement a better then log(N) search
        std::pair<PagedKeys::iterator, Keys::iterator> iter = find(target);

        return *(iter.second) == target;
    }

    const PagedKeys& pages() const
    {
        return pages_;
    }

    void print(std::ostream& os) const
    {
        for ( KeysHandle page : pages_)
        {
            os << "\n ########################################## \n";
            for ( Key item : *page)
            {
                os << " " << item;
            }
        }
    }
};

std::ostream& operator<<(std::ostream& os, const PagedVector& list)
{
    list.print(os);
    return os;
}

void validate_sorted(PagedVector& theList)
{
    bool failed = false;
    bool first = true;
    PagedVector::Key prev;

    for ( PagedVector::KeysHandle page : theList.pages() )
    {
        for ( PagedVector::Key item : *page)
        {
            if (first)
            {
                first = false;
                prev = item;
            }
            else
            {
                if (prev > item)
                    failed = true;
                prev = item;
            }
        }
    }
    std::cout << (failed ? "FAILED" : "passed")
              << "\n";
}

void test_random()
{
    PagedVector theList(5000);

    auto istart = std::chrono::system_clock::now();
    for (int i = 0; i < 1000000;  ++i)
    {
        PagedVector::Key item = (std::rand() % 100000)*2;

        // std::cout << "adding:" << item << "\n";
        theList.insert(item);
    }
    auto iend = std::chrono::system_clock::now();

    std::cout << "inserted time:"
              << std::chrono::duration_cast<std::chrono::milliseconds>(iend - istart).count()
              << "mSec \n";
    // std::cout << theList << "\n";

    int count = 0;
    int exist = 0;

    auto cstart = std::chrono::system_clock::now();
    for (int i = 0; i < 1000000;  ++i)
    {
        PagedVector::Key item = std::rand() % 200000;
        if (theList.exists(item))
            ++exist;
        ++count;

        // std::cout << " " << item << " "
        //           << (theList.exists(item) ? "exists" : "missing")
        //           << "\n";
    }
    auto cend = std::chrono::system_clock::now();

    std::cout << "checking time:"
              << std::chrono::duration_cast<std::chrono::milliseconds>(cend - cstart).count()
              << "mSec\n\n";

    // expecting a 50% hit rate
    std::cout << "checked:"
              << count
              << " exist:" << exist
              << "\n";

    validate_sorted(theList);
}

int main()
{
    test_random();
}

Output with a bloomfilter is:

inserted time:2613mSec 
checking time:735mSec

checked:1000000 exist:499965
passed

The output without the filter is

inserted time:2584mSec 
checking time:1292mSec

checked:1000000 exist:499965
passed

The Coding Slim Jim

Wednesday, July 20, 2016

google interview: find number in multi disk list better than O(logn)

No comments:

Post a Comment