Saturday, July 2, 2016

google interview: uniform sampling from an infinite stream

//http://www.impactinterview.com/2009/10/140-google-interview-questions/#software_engineer
// You have a stream of infinite queries (ie: real time Google search queries
// that people are entering).  Describe how you would go about finding a good
// estimate of 1000 samples from this never ending set of data and then write
// code for it.

#include <iostream>
#include <iomanip>
#include <vector>
#include <ctime>

// Reservoir Sampling
// did it here http://code-slim-jim.blogspot.jp/2010/06/reservoir-sampling.html
// but wow 2010 is so long ago..

// ok so the explanation:
// you have stream and u need to sample from N "good" items it..  assumably
// "good" means uniform here so until you have the first N items u just take
// every thing
//
// now the tricky part when u get the N+1 item each item is therefore supposed
// to have a N/(N+1) probability of being in the output so the new item has a
// N/(N+1) chance of selection and all the existing items have a 1/N chance of
// begin rejected..
//
// then we can generalize to the (N+x)-th sample.  when u get the N+x item each
// item is therefore supposed to have a N/(N+x) probability of being in the
// output so the new item has a N/(N+x) chance of selection and all the
// existing items have a 1/N chance of begin rejected


template <typename Type>
class ReservoirSample
{
 public:
    typedef std::vector<Type> Samples;

 private:
    std::size_t       size_;
    std::size_t       count_;
    std::vector<Type> samples_;

 public:
    ReservoirSample(unsigned int size) :
        size_(size),
        count_(0),
        samples_()
    {}

    float rand()
    {
        return static_cast<float>(std::rand() % 1000) / 1000.0;
    }

    void sample(Type& item)
    {
        count_ += 1;
        if (samples_.size() < size_)
        {
            samples_.push_back(item);
        }
        else
        {
            float chance = static_cast<float>(size_)/static_cast<float>(count_);

            if (rand() < chance)
            {
                std::size_t idx = size_ * rand();
                samples_[idx] = item;
            }
        }
    }

    std::vector<Type>& samples()
    {
        return samples_;
    }
};

void test(int selection,
          int streamSize)
{
    // testing random stuff can be a pain.. you either de-random it
    // or you repeat and confirm the expected distrubution (uniform...)
    const int cycles = 100000;
    std::vector<int> freq(streamSize,0);

    for (int j = 0; j < cycles; ++j)
    {
        ReservoirSample<int> sampler(selection);

        for (int i = 0; i < streamSize; ++i)
        {
            sampler.sample(i);
        }

        const std::vector<int>& samples = sampler.samples();
        for ( std::vector<int>::const_iterator sit = samples.begin();
              sit != samples.end();
              ++sit)
        {
            ++(freq[*sit]);
        }
    }

    // each number will come up once in a cycle and "selection" are choosen out of "streamSize"
    float expected
        = (static_cast<float>(cycles)
           * static_cast<float>(selection))
        / static_cast<float>(streamSize);

    // and lets give it +/- 5% .. (which will fail now and then..)
    float expectMin = 0.95 * expected;
    float expectMax = 1.05 * expected;

    for (int i = 0; i < streamSize; ++i)
    {
        bool passed = freq[i] > expectMin and freq[i] < expectMax;
        std::cout << std::setw(3) << i << ":" << freq[i]
                  << " " << (passed ? "passed" : "FAILED")
                  << "\n";
    }
    std::cout << "\n";
}

int main()
{
    std::srand(std::time(0));

    test(1,30);
    test(3,30);
    test(7,30);
    test(15,30);
}

0:3307 passed
  1:3302 passed
  2:3237 passed
  3:3344 passed
  4:3308 passed
  5:3233 passed
  6:3317 passed
  7:3257 passed
  8:3230 passed
  9:3407 passed
 10:3310 passed
 11:3267 passed
 12:3303 passed
 13:3414 passed
 14:3267 passed
 15:3327 passed
 16:3333 passed
 17:3371 passed
 18:3351 passed
 19:3364 passed
 20:3306 passed
 21:3364 passed
 22:3402 passed
 23:3440 passed
 24:3326 passed
 25:3390 passed
 26:3351 passed
 27:3451 passed
 28:3289 passed
 29:3432 passed

  0:10056 passed
  1:9997 passed
  2:10060 passed
  3:10012 passed
  4:9919 passed
  5:10021 passed
  6:9984 passed
  7:9945 passed
  8:9987 passed
  9:10040 passed
 10:9919 passed
 11:9965 passed
 12:9918 passed
 13:9904 passed
 14:9864 passed
 15:9876 passed
 16:10043 passed
 17:9945 passed
 18:10064 passed
 19:10025 passed
 20:9878 passed
 21:10021 passed
 22:10108 passed
 23:9796 passed
 24:10100 passed
 25:10025 passed
 26:10206 passed
 27:10003 passed
 28:10172 passed
 29:10147 passed

  0:23484 passed
  1:23126 passed
  2:23259 passed
  3:23345 passed
  4:23290 passed
  5:23339 passed
  6:23497 passed
  7:23348 passed
  8:23290 passed
  9:23179 passed
 10:23603 passed
 11:23154 passed
 12:23223 passed
 13:23333 passed
 14:23394 passed
 15:23193 passed
 16:23546 passed
 17:23219 passed
 18:23401 passed
 19:23201 passed
 20:23246 passed
 21:23240 passed
 22:23621 passed
 23:23224 passed
 24:23410 passed
 25:23376 passed
 26:23518 passed
 27:23268 passed
 28:23348 passed
 29:23325 passed

  0:49663 passed
  1:49911 passed
  2:50401 passed
  3:49617 passed
  4:49691 passed
  5:50278 passed
  6:49804 passed
  7:49940 passed
  8:50497 passed
  9:49687 passed
 10:49582 passed
 11:50552 passed
 12:49871 passed
 13:49670 passed
 14:50304 passed
 15:50184 passed
 16:49988 passed
 17:49965 passed
 18:50077 passed
 19:49945 passed
 20:50290 passed
 21:50173 passed
 22:49972 passed
 23:50051 passed
 24:49943 passed
 25:50053 passed
 26:50288 passed
 27:49852 passed
 28:50009 passed
 29:49742 passed

No comments:

Post a Comment