// http://www.impactinterview.com/2009/10/140-google-interview-questions/#software_engineer // // Find or determine non existence of a number in a sorted list // of N numbers where the numbers range over M, M>> N and N large // enough to span multiple disks. Algorithm to beat O(log n) bonus // points for constant time algorithm. // the const time algo is called a "bloom filter" if you work with large // databases you (hopefully) learn this.. what a bloom filter does on insert // is hash your key in 3 ways and then light up flags in 3 indexes.. on lookup // it hashs again and checks the 3 places.. if the any one the flags is off // then you have a item that doesnt exist.. otherwise commence your normal // logN search #include <iostream> #include <vector> #include <stdint.h> #include <memory> #include <cstring> #include <cstdlib> #include <chrono> #define BLOOMFILTER class PagedVector { public: typedef uint64_t Key; // ideally this would be a template param typedef std::vector<Key> Keys; typedef std::shared_ptr<Keys> KeysHandle; typedef std::vector<KeysHandle> PagedKeys; const uint32_t filterSize = 256; // ideally this would be tunable to reduce misses/mem use const uint32_t filterMask = filterSize - 1; private: std::size_t threashold_; PagedKeys pages_; std::vector<bool> presence_[3]; // the bloom filter // the divide part of a divide and conqurer search template <typename Iterator, typename Compare, typename Splitter> struct Divider { Compare& comparer_; Splitter& splitter_; Divider(Compare& comparer, Splitter& splitter) : comparer_(comparer), splitter_(splitter) {} void operator()(Iterator& a, Iterator& b) { // should never happen unless we are empty.. if (b==a) return; Iterator n = splitter_(a,b); if (comparer_(n)) { b = n; return; } a = n + 1; } }; // All divide and conqurer searches do 1 thing.. they split the data in // some way and repeat until the desired location is found.. In this // system we have mutliple uses of the search.. they are used to search and find insertion points template <typename Iterator, typename Compare, typename Splitter> Iterator search(Iterator begin, Iterator end, Compare comparer, Splitter splitter) { // point at the match or the point to insert in if it doesnt match Divider<Iterator, Compare, Splitter> Divider(comparer, splitter); // narrows the sub range in which to keep searching when the range is 1 item we are done.. while (1) { Divider(begin,end); if (end == begin) return begin; } } struct FindInPage { Key target_; FindInPage(Key target) : target_(target) {} bool operator()(const PagedKeys::iterator& a) { // ok slight strangeness here.. pages are a range of numers // t < [b,e] then true // t > [b,e] then false // but // t > b, t < e its equal.. so it has to be absolete less than! const KeysHandle& pageHandle = *a; if (pageHandle->size() > 0) { return target_ <= (*pageHandle)[pageHandle->size()-1]; } return true; // empty... should be the last if it exists.. } }; struct SplitPage { Key target_; SplitPage() {} PagedKeys::iterator operator()(const PagedKeys::iterator& a, const PagedKeys::iterator& b) { std::size_t offset = (b-a)/2; return a + offset; } }; struct FindInVector { Key target_; FindInVector(Key target) : target_(target) {} bool operator()(const Keys::iterator& a) { return target_ <= (*a); } }; struct SplitVector { Key target_; SplitVector(Key target) : target_(target) {} Keys::iterator operator()(const Keys::iterator& a, const Keys::iterator& b) { // std::size_t offset = (b-a)/2; // return a + offset; if ((b-a) <= 2) { std::size_t offset = (b-a)/2; return a + offset; } Key high = *(b-1); Key low = *a; // since we have the values... if (target_ <= low) return a; if (target_ >= high) return b-1; double splitPercent; splitPercent = static_cast<double>(target_ - low) / static_cast<double>(high-low); splitPercent = (splitPercent < 0.0) ? 0.0 : splitPercent; std::size_t range = (b-a); std::size_t offset = range * splitPercent; return a + offset; } }; // retun a fax iterator to the page and page offset where the item is or should be std::pair<PagedKeys::iterator, Keys::iterator> find(Key target) { std::pair<PagedKeys::iterator, Keys::iterator> iter; iter.first = search(pages_.begin(), pages_.end(), FindInPage(target), SplitPage()); if (iter.first == pages_.end()) iter.first = pages_.end() - 1; KeysHandle& page = *(iter.first); // find the location to add into the list iter.second = search(page->begin(), page->end(), FindInVector(target), SplitVector(target)); return iter; } public: PagedVector(std::size_t threashold) : threashold_(threashold), pages_(), presence_() { for (int i = 0; i < 3; ++i) presence_[i].resize(filterSize); // make it a bit more easy on the boarder conditions KeysHandle newPage(new Keys); pages_.push_back(newPage); } // add the target to the paged data struture void insert(Key target) { #ifdef BLOOMFILTER // not the best way but assume we are doing a item by item insert uint64_t hash = std::hash<Key>()(target); // light the bloomfilters presence bits according to the hash presence_[0][ hash & filterMask] = true; presence_[1][(hash >> 8) & filterMask] = true; presence_[2][(hash >> 16) & filterMask] = true; #endif // find the relevent part of the list std::pair<PagedKeys::iterator, Keys::iterator> iter = find(target); KeysHandle& page = *(iter.first); page->insert(iter.second, target); // then threshold check. if (page->size() > threashold_) { // ok over the threshold.. so divide the list in half KeysHandle newPage(new Keys); uint32_t spliceAt = page->size() / 2; uint32_t sizeAfter = page->size() - spliceAt; newPage->resize(sizeAfter); std::memcpy(&((*newPage)[0]), &((*page)[spliceAt]), sizeAfter * sizeof(Key)); page->resize(spliceAt); pages_.insert(iter.first + 1, newPage); } } bool exists(Key target) { #ifdef BLOOMFILTER uint64_t hash = std::hash<Key>()(target); // a quick O(1) pre check before the more costly real check if (not (presence_[0][ hash & filterMask] and presence_[1][(hash >> 8) & filterMask] and presence_[2][(hash >> 16) & filterMask])) return false; #endif // ok technically the question was about this.. so no std:: here.. // basically have to implement a better then log(N) search std::pair<PagedKeys::iterator, Keys::iterator> iter = find(target); return *(iter.second) == target; } const PagedKeys& pages() const { return pages_; } void print(std::ostream& os) const { for ( KeysHandle page : pages_) { os << "\n ########################################## \n"; for ( Key item : *page) { os << " " << item; } } } }; std::ostream& operator<<(std::ostream& os, const PagedVector& list) { list.print(os); return os; } void validate_sorted(PagedVector& theList) { bool failed = false; bool first = true; PagedVector::Key prev; for ( PagedVector::KeysHandle page : theList.pages() ) { for ( PagedVector::Key item : *page) { if (first) { first = false; prev = item; } else { if (prev > item) failed = true; prev = item; } } } std::cout << (failed ? "FAILED" : "passed") << "\n"; } void test_random() { PagedVector theList(5000); auto istart = std::chrono::system_clock::now(); for (int i = 0; i < 1000000; ++i) { PagedVector::Key item = (std::rand() % 100000)*2; // std::cout << "adding:" << item << "\n"; theList.insert(item); } auto iend = std::chrono::system_clock::now(); std::cout << "inserted time:" << std::chrono::duration_cast<std::chrono::milliseconds>(iend - istart).count() << "mSec \n"; // std::cout << theList << "\n"; int count = 0; int exist = 0; auto cstart = std::chrono::system_clock::now(); for (int i = 0; i < 1000000; ++i) { PagedVector::Key item = std::rand() % 200000; if (theList.exists(item)) ++exist; ++count; // std::cout << " " << item << " " // << (theList.exists(item) ? "exists" : "missing") // << "\n"; } auto cend = std::chrono::system_clock::now(); std::cout << "checking time:" << std::chrono::duration_cast<std::chrono::milliseconds>(cend - cstart).count() << "mSec\n\n"; // expecting a 50% hit rate std::cout << "checked:" << count << " exist:" << exist << "\n"; validate_sorted(theList); } int main() { test_random(); }Output with a bloomfilter is:
inserted time:2613mSec checking time:735mSec checked:1000000 exist:499965 passed
The output without the filter is
inserted time:2584mSec checking time:1292mSec checked:1000000 exist:499965 passed
No comments:
Post a Comment