//http://www.impactinterview.com/2009/10/140-google-interview-questions/#software_engineer // You have a stream of infinite queries (ie: real time Google search queries // that people are entering). Describe how you would go about finding a good // estimate of 1000 samples from this never ending set of data and then write // code for it. #include <iostream> #include <iomanip> #include <vector> #include <ctime> // Reservoir Sampling // did it here http://code-slim-jim.blogspot.jp/2010/06/reservoir-sampling.html // but wow 2010 is so long ago.. // ok so the explanation: // you have stream and u need to sample from N "good" items it.. assumably // "good" means uniform here so until you have the first N items u just take // every thing // // now the tricky part when u get the N+1 item each item is therefore supposed // to have a N/(N+1) probability of being in the output so the new item has a // N/(N+1) chance of selection and all the existing items have a 1/N chance of // begin rejected.. // // then we can generalize to the (N+x)-th sample. when u get the N+x item each // item is therefore supposed to have a N/(N+x) probability of being in the // output so the new item has a N/(N+x) chance of selection and all the // existing items have a 1/N chance of begin rejected template <typename Type> class ReservoirSample { public: typedef std::vector<Type> Samples; private: std::size_t size_; std::size_t count_; std::vector<Type> samples_; public: ReservoirSample(unsigned int size) : size_(size), count_(0), samples_() {} float rand() { return static_cast<float>(std::rand() % 1000) / 1000.0; } void sample(Type& item) { count_ += 1; if (samples_.size() < size_) { samples_.push_back(item); } else { float chance = static_cast<float>(size_)/static_cast<float>(count_); if (rand() < chance) { std::size_t idx = size_ * rand(); samples_[idx] = item; } } } std::vector<Type>& samples() { return samples_; } }; void test(int selection, int streamSize) { // testing random stuff can be a pain.. you either de-random it // or you repeat and confirm the expected distrubution (uniform...) const int cycles = 100000; std::vector<int> freq(streamSize,0); for (int j = 0; j < cycles; ++j) { ReservoirSample<int> sampler(selection); for (int i = 0; i < streamSize; ++i) { sampler.sample(i); } const std::vector<int>& samples = sampler.samples(); for ( std::vector<int>::const_iterator sit = samples.begin(); sit != samples.end(); ++sit) { ++(freq[*sit]); } } // each number will come up once in a cycle and "selection" are choosen out of "streamSize" float expected = (static_cast<float>(cycles) * static_cast<float>(selection)) / static_cast<float>(streamSize); // and lets give it +/- 5% .. (which will fail now and then..) float expectMin = 0.95 * expected; float expectMax = 1.05 * expected; for (int i = 0; i < streamSize; ++i) { bool passed = freq[i] > expectMin and freq[i] < expectMax; std::cout << std::setw(3) << i << ":" << freq[i] << " " << (passed ? "passed" : "FAILED") << "\n"; } std::cout << "\n"; } int main() { std::srand(std::time(0)); test(1,30); test(3,30); test(7,30); test(15,30); }
0:3307 passed 1:3302 passed 2:3237 passed 3:3344 passed 4:3308 passed 5:3233 passed 6:3317 passed 7:3257 passed 8:3230 passed 9:3407 passed 10:3310 passed 11:3267 passed 12:3303 passed 13:3414 passed 14:3267 passed 15:3327 passed 16:3333 passed 17:3371 passed 18:3351 passed 19:3364 passed 20:3306 passed 21:3364 passed 22:3402 passed 23:3440 passed 24:3326 passed 25:3390 passed 26:3351 passed 27:3451 passed 28:3289 passed 29:3432 passed 0:10056 passed 1:9997 passed 2:10060 passed 3:10012 passed 4:9919 passed 5:10021 passed 6:9984 passed 7:9945 passed 8:9987 passed 9:10040 passed 10:9919 passed 11:9965 passed 12:9918 passed 13:9904 passed 14:9864 passed 15:9876 passed 16:10043 passed 17:9945 passed 18:10064 passed 19:10025 passed 20:9878 passed 21:10021 passed 22:10108 passed 23:9796 passed 24:10100 passed 25:10025 passed 26:10206 passed 27:10003 passed 28:10172 passed 29:10147 passed 0:23484 passed 1:23126 passed 2:23259 passed 3:23345 passed 4:23290 passed 5:23339 passed 6:23497 passed 7:23348 passed 8:23290 passed 9:23179 passed 10:23603 passed 11:23154 passed 12:23223 passed 13:23333 passed 14:23394 passed 15:23193 passed 16:23546 passed 17:23219 passed 18:23401 passed 19:23201 passed 20:23246 passed 21:23240 passed 22:23621 passed 23:23224 passed 24:23410 passed 25:23376 passed 26:23518 passed 27:23268 passed 28:23348 passed 29:23325 passed 0:49663 passed 1:49911 passed 2:50401 passed 3:49617 passed 4:49691 passed 5:50278 passed 6:49804 passed 7:49940 passed 8:50497 passed 9:49687 passed 10:49582 passed 11:50552 passed 12:49871 passed 13:49670 passed 14:50304 passed 15:50184 passed 16:49988 passed 17:49965 passed 18:50077 passed 19:49945 passed 20:50290 passed 21:50173 passed 22:49972 passed 23:50051 passed 24:49943 passed 25:50053 passed 26:50288 passed 27:49852 passed 28:50009 passed 29:49742 passed
No comments:
Post a Comment