## Saturday, July 2, 2016

### google interview: uniform sampling from an infinite stream

//http://www.impactinterview.com/2009/10/140-google-interview-questions/#software_engineer
// You have a stream of infinite queries (ie: real time Google search queries
// that people are entering).  Describe how you would go about finding a good
// estimate of 1000 samples from this never ending set of data and then write
// code for it.

#include <iostream>
#include <iomanip>
#include <vector>
#include <ctime>

// Reservoir Sampling
// did it here http://code-slim-jim.blogspot.jp/2010/06/reservoir-sampling.html
// but wow 2010 is so long ago..

// ok so the explanation:
// you have stream and u need to sample from N "good" items it..  assumably
// "good" means uniform here so until you have the first N items u just take
// every thing
//
// now the tricky part when u get the N+1 item each item is therefore supposed
// to have a N/(N+1) probability of being in the output so the new item has a
// N/(N+1) chance of selection and all the existing items have a 1/N chance of
// begin rejected..
//
// then we can generalize to the (N+x)-th sample.  when u get the N+x item each
// item is therefore supposed to have a N/(N+x) probability of being in the
// output so the new item has a N/(N+x) chance of selection and all the
// existing items have a 1/N chance of begin rejected

template <typename Type>
class ReservoirSample
{
public:
typedef std::vector<Type> Samples;

private:
std::size_t       size_;
std::size_t       count_;
std::vector<Type> samples_;

public:
ReservoirSample(unsigned int size) :
size_(size),
count_(0),
samples_()
{}

float rand()
{
return static_cast<float>(std::rand() % 1000) / 1000.0;
}

void sample(Type& item)
{
count_ += 1;
if (samples_.size() < size_)
{
samples_.push_back(item);
}
else
{
float chance = static_cast<float>(size_)/static_cast<float>(count_);

if (rand() < chance)
{
std::size_t idx = size_ * rand();
samples_[idx] = item;
}
}
}

std::vector<Type>& samples()
{
return samples_;
}
};

void test(int selection,
int streamSize)
{
// testing random stuff can be a pain.. you either de-random it
// or you repeat and confirm the expected distrubution (uniform...)
const int cycles = 100000;
std::vector<int> freq(streamSize,0);

for (int j = 0; j < cycles; ++j)
{
ReservoirSample<int> sampler(selection);

for (int i = 0; i < streamSize; ++i)
{
sampler.sample(i);
}

const std::vector<int>& samples = sampler.samples();
for ( std::vector<int>::const_iterator sit = samples.begin();
sit != samples.end();
++sit)
{
++(freq[*sit]);
}
}

// each number will come up once in a cycle and "selection" are choosen out of "streamSize"
float expected
= (static_cast<float>(cycles)
* static_cast<float>(selection))
/ static_cast<float>(streamSize);

// and lets give it +/- 5% .. (which will fail now and then..)
float expectMin = 0.95 * expected;
float expectMax = 1.05 * expected;

for (int i = 0; i < streamSize; ++i)
{
bool passed = freq[i] > expectMin and freq[i] < expectMax;
std::cout << std::setw(3) << i << ":" << freq[i]
<< " " << (passed ? "passed" : "FAILED")
<< "\n";
}
std::cout << "\n";
}

int main()
{
std::srand(std::time(0));

test(1,30);
test(3,30);
test(7,30);
test(15,30);
}


0:3307 passed
1:3302 passed
2:3237 passed
3:3344 passed
4:3308 passed
5:3233 passed
6:3317 passed
7:3257 passed
8:3230 passed
9:3407 passed
10:3310 passed
11:3267 passed
12:3303 passed
13:3414 passed
14:3267 passed
15:3327 passed
16:3333 passed
17:3371 passed
18:3351 passed
19:3364 passed
20:3306 passed
21:3364 passed
22:3402 passed
23:3440 passed
24:3326 passed
25:3390 passed
26:3351 passed
27:3451 passed
28:3289 passed
29:3432 passed

0:10056 passed
1:9997 passed
2:10060 passed
3:10012 passed
4:9919 passed
5:10021 passed
6:9984 passed
7:9945 passed
8:9987 passed
9:10040 passed
10:9919 passed
11:9965 passed
12:9918 passed
13:9904 passed
14:9864 passed
15:9876 passed
16:10043 passed
17:9945 passed
18:10064 passed
19:10025 passed
20:9878 passed
21:10021 passed
22:10108 passed
23:9796 passed
24:10100 passed
25:10025 passed
26:10206 passed
27:10003 passed
28:10172 passed
29:10147 passed

0:23484 passed
1:23126 passed
2:23259 passed
3:23345 passed
4:23290 passed
5:23339 passed
6:23497 passed
7:23348 passed
8:23290 passed
9:23179 passed
10:23603 passed
11:23154 passed
12:23223 passed
13:23333 passed
14:23394 passed
15:23193 passed
16:23546 passed
17:23219 passed
18:23401 passed
19:23201 passed
20:23246 passed
21:23240 passed
22:23621 passed
23:23224 passed
24:23410 passed
25:23376 passed
26:23518 passed
27:23268 passed
28:23348 passed
29:23325 passed

0:49663 passed
1:49911 passed
2:50401 passed
3:49617 passed
4:49691 passed
5:50278 passed
6:49804 passed
7:49940 passed
8:50497 passed
9:49687 passed
10:49582 passed
11:50552 passed
12:49871 passed
13:49670 passed
14:50304 passed
15:50184 passed
16:49988 passed
17:49965 passed
18:50077 passed
19:49945 passed
20:50290 passed
21:50173 passed
22:49972 passed
23:50051 passed
24:49943 passed
25:50053 passed
26:50288 passed
27:49852 passed
28:50009 passed
29:49742 passed