// http://www.impactinterview.com/2009/10/140-google-interview-questions/#software_engineer
//
// Find or determine non existence of a number in a sorted list
// of N numbers where the numbers range over M, M>> N and N large
// enough to span multiple disks. Algorithm to beat O(log n) bonus
// points for constant time algorithm.
// the const time algo is called a "bloom filter" if you work with large
// databases you (hopefully) learn this.. what a bloom filter does on insert
// is hash your key in 3 ways and then light up flags in 3 indexes.. on lookup
// it hashs again and checks the 3 places.. if the any one the flags is off
// then you have a item that doesnt exist.. otherwise commence your normal
// logN search
#include <iostream>
#include <vector>
#include <stdint.h>
#include <memory>
#include <cstring>
#include <cstdlib>
#include <chrono>
#define BLOOMFILTER
class PagedVector
{
public:
typedef uint64_t Key; // ideally this would be a template param
typedef std::vector<Key> Keys;
typedef std::shared_ptr<Keys> KeysHandle;
typedef std::vector<KeysHandle> PagedKeys;
const uint32_t filterSize = 256; // ideally this would be tunable to reduce misses/mem use
const uint32_t filterMask = filterSize - 1;
private:
std::size_t threashold_;
PagedKeys pages_;
std::vector<bool> presence_[3]; // the bloom filter
// the divide part of a divide and conqurer search
template <typename Iterator,
typename Compare,
typename Splitter>
struct Divider
{
Compare& comparer_;
Splitter& splitter_;
Divider(Compare& comparer,
Splitter& splitter) :
comparer_(comparer),
splitter_(splitter)
{}
void operator()(Iterator& a,
Iterator& b)
{
// should never happen unless we are empty..
if (b==a) return;
Iterator n = splitter_(a,b);
if (comparer_(n))
{
b = n;
return;
}
a = n + 1;
}
};
// All divide and conqurer searches do 1 thing.. they split the data in
// some way and repeat until the desired location is found.. In this
// system we have mutliple uses of the search.. they are used to search and find insertion points
template <typename Iterator,
typename Compare,
typename Splitter>
Iterator search(Iterator begin,
Iterator end,
Compare comparer,
Splitter splitter)
{
// point at the match or the point to insert in if it doesnt match
Divider<Iterator, Compare, Splitter> Divider(comparer, splitter);
// narrows the sub range in which to keep searching when the range is 1 item we are done..
while (1)
{
Divider(begin,end);
if (end == begin)
return begin;
}
}
struct FindInPage
{
Key target_;
FindInPage(Key target) :
target_(target)
{}
bool operator()(const PagedKeys::iterator& a)
{
// ok slight strangeness here.. pages are a range of numers
// t < [b,e] then true
// t > [b,e] then false
// but
// t > b, t < e its equal.. so it has to be absolete less than!
const KeysHandle& pageHandle = *a;
if (pageHandle->size() > 0)
{
return target_ <= (*pageHandle)[pageHandle->size()-1];
}
return true; // empty... should be the last if it exists..
}
};
struct SplitPage
{
Key target_;
SplitPage()
{}
PagedKeys::iterator operator()(const PagedKeys::iterator& a,
const PagedKeys::iterator& b)
{
std::size_t offset = (b-a)/2;
return a + offset;
}
};
struct FindInVector
{
Key target_;
FindInVector(Key target) :
target_(target)
{}
bool operator()(const Keys::iterator& a)
{
return target_ <= (*a);
}
};
struct SplitVector
{
Key target_;
SplitVector(Key target) :
target_(target)
{}
Keys::iterator operator()(const Keys::iterator& a,
const Keys::iterator& b)
{
// std::size_t offset = (b-a)/2;
// return a + offset;
if ((b-a) <= 2)
{
std::size_t offset = (b-a)/2;
return a + offset;
}
Key high = *(b-1);
Key low = *a;
// since we have the values...
if (target_ <= low) return a;
if (target_ >= high) return b-1;
double splitPercent;
splitPercent = static_cast<double>(target_ - low) / static_cast<double>(high-low);
splitPercent = (splitPercent < 0.0) ? 0.0 : splitPercent;
std::size_t range = (b-a);
std::size_t offset = range * splitPercent;
return a + offset;
}
};
// retun a fax iterator to the page and page offset where the item is or should be
std::pair<PagedKeys::iterator, Keys::iterator> find(Key target)
{
std::pair<PagedKeys::iterator, Keys::iterator> iter;
iter.first = search(pages_.begin(),
pages_.end(),
FindInPage(target),
SplitPage());
if (iter.first == pages_.end())
iter.first = pages_.end() - 1;
KeysHandle& page = *(iter.first);
// find the location to add into the list
iter.second = search(page->begin(),
page->end(),
FindInVector(target),
SplitVector(target));
return iter;
}
public:
PagedVector(std::size_t threashold) :
threashold_(threashold),
pages_(),
presence_()
{
for (int i = 0; i < 3; ++i)
presence_[i].resize(filterSize);
// make it a bit more easy on the boarder conditions
KeysHandle newPage(new Keys);
pages_.push_back(newPage);
}
// add the target to the paged data struture
void insert(Key target)
{
#ifdef BLOOMFILTER
// not the best way but assume we are doing a item by item insert
uint64_t hash = std::hash<Key>()(target);
// light the bloomfilters presence bits according to the hash
presence_[0][ hash & filterMask] = true;
presence_[1][(hash >> 8) & filterMask] = true;
presence_[2][(hash >> 16) & filterMask] = true;
#endif
// find the relevent part of the list
std::pair<PagedKeys::iterator, Keys::iterator> iter = find(target);
KeysHandle& page = *(iter.first);
page->insert(iter.second, target);
// then threshold check.
if (page->size() > threashold_)
{
// ok over the threshold.. so divide the list in half
KeysHandle newPage(new Keys);
uint32_t spliceAt = page->size() / 2;
uint32_t sizeAfter = page->size() - spliceAt;
newPage->resize(sizeAfter);
std::memcpy(&((*newPage)[0]), &((*page)[spliceAt]), sizeAfter * sizeof(Key));
page->resize(spliceAt);
pages_.insert(iter.first + 1, newPage);
}
}
bool exists(Key target)
{
#ifdef BLOOMFILTER
uint64_t hash = std::hash<Key>()(target);
// a quick O(1) pre check before the more costly real check
if (not (presence_[0][ hash & filterMask] and
presence_[1][(hash >> 8) & filterMask] and
presence_[2][(hash >> 16) & filterMask]))
return false;
#endif
// ok technically the question was about this.. so no std:: here..
// basically have to implement a better then log(N) search
std::pair<PagedKeys::iterator, Keys::iterator> iter = find(target);
return *(iter.second) == target;
}
const PagedKeys& pages() const
{
return pages_;
}
void print(std::ostream& os) const
{
for ( KeysHandle page : pages_)
{
os << "\n ########################################## \n";
for ( Key item : *page)
{
os << " " << item;
}
}
}
};
std::ostream& operator<<(std::ostream& os, const PagedVector& list)
{
list.print(os);
return os;
}
void validate_sorted(PagedVector& theList)
{
bool failed = false;
bool first = true;
PagedVector::Key prev;
for ( PagedVector::KeysHandle page : theList.pages() )
{
for ( PagedVector::Key item : *page)
{
if (first)
{
first = false;
prev = item;
}
else
{
if (prev > item)
failed = true;
prev = item;
}
}
}
std::cout << (failed ? "FAILED" : "passed")
<< "\n";
}
void test_random()
{
PagedVector theList(5000);
auto istart = std::chrono::system_clock::now();
for (int i = 0; i < 1000000; ++i)
{
PagedVector::Key item = (std::rand() % 100000)*2;
// std::cout << "adding:" << item << "\n";
theList.insert(item);
}
auto iend = std::chrono::system_clock::now();
std::cout << "inserted time:"
<< std::chrono::duration_cast<std::chrono::milliseconds>(iend - istart).count()
<< "mSec \n";
// std::cout << theList << "\n";
int count = 0;
int exist = 0;
auto cstart = std::chrono::system_clock::now();
for (int i = 0; i < 1000000; ++i)
{
PagedVector::Key item = std::rand() % 200000;
if (theList.exists(item))
++exist;
++count;
// std::cout << " " << item << " "
// << (theList.exists(item) ? "exists" : "missing")
// << "\n";
}
auto cend = std::chrono::system_clock::now();
std::cout << "checking time:"
<< std::chrono::duration_cast<std::chrono::milliseconds>(cend - cstart).count()
<< "mSec\n\n";
// expecting a 50% hit rate
std::cout << "checked:"
<< count
<< " exist:" << exist
<< "\n";
validate_sorted(theList);
}
int main()
{
test_random();
}
Output with a bloomfilter is:inserted time:2613mSec checking time:735mSec checked:1000000 exist:499965 passed
The output without the filter is
inserted time:2584mSec checking time:1292mSec checked:1000000 exist:499965 passed
No comments:
Post a Comment