Skip to content

Commit

Permalink
add data sampling
Browse files: browse the repository at this point in the history
  • Loading branch information
qyliu-hkust committed Aug 23, 2024
1 parent 80d0c5b commit 4d1064a
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 13 deletions.
20 changes: 13 additions & 7 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ auto bench_pgm(const std::vector<uint64_t>& data, const std::vector<uint64_t>& q
volatile uint64_t res = 0;

std::cout << "Construct PGM index eps_l=" << Epsilon << " eps_i=" << EpsilonRecursive << std::endl;
pgm::PGMIndex<uint64_t, Epsilon, EpsilonRecursive, true, 16, float> index_branchless(data.begin(), data.end()-1);
pgm::PGMIndex<uint64_t, Epsilon, EpsilonRecursive, true, 0, float> index_branchless(data.begin(), data.end()-1);

// branchless PGM without last-mile search
auto start = std::chrono::high_resolution_clock::now();
Expand Down Expand Up @@ -131,8 +131,9 @@ auto bench_pgm(const std::vector<uint64_t>& data, const std::vector<uint64_t>& q
// to avoid influence of hot cache
std::vector<uint64_t> data_cpy(data);
std::vector<uint64_t> queries_cpy(queries);

std::cout << "Construct PGM index eps_l=" << Epsilon << " eps_i=" << EpsilonRecursive << std::endl;
pgm::PGMIndex<uint64_t, Epsilon, EpsilonRecursive, false, 16, float> index(data_cpy.begin(), data_cpy.end()-1);
pgm::PGMIndex<uint64_t, Epsilon, EpsilonRecursive, false, 0, float> index(data_cpy.begin(), data_cpy.end()-1);
// branchy PGM without last-mile search
start = std::chrono::high_resolution_clock::now();
for (auto q : queries_cpy) {
Expand Down Expand Up @@ -166,13 +167,18 @@ int main(int argc, const char * argv[]) {
// bench_search_repeat(20, 500, "/Users/liuqiyu/Desktop/bench_search_result_new.csv");
// exit(0);

const std::string fname = "/Users/liuqiyu/Desktop/SOSD_data/books_800M_uint64";
const size_t nq = 200;
const std::string fname = "/Users/liuqiyu/Desktop/SOSD_data/osm_cellids_800M_uint64";
const size_t nq = 500;
const size_t repeat = 10;

std::cout << "Load data from " << fname << std::endl;
auto data = benchmark::load_data<uint64_t>(fname);
std::sort(data.begin(), data.end()-1);
auto data = benchmark::load_data<uint64_t>(fname, true, 200000000);
std::sort(data.begin(), data.end());

auto data_stats = benchmark::get_data_stats(data);
std::cout << "mean: " << data_stats.mean
<< " variance: " << data_stats.var
<< " hardness ratio: " << data_stats.var/data_stats.mean << std::endl;

std::vector<std::pair<size_t, stats>> bench_results;

Expand Down Expand Up @@ -243,7 +249,7 @@ int main(int argc, const char * argv[]) {
}

// start from 7 cold cache config
std::ofstream ofs("/Users/liuqiyu/Desktop/bench_pgm_result_books_repeat_10_0250.csv");
std::ofstream ofs("/Users/liuqiyu/Desktop/bench_pgm_result_osm_repeat_10_2103.csv");
ofs << "round,eps_l,eps_i,levels,lls,ils,latency_branchy_i,latency_branchy_l,latency_branchless_i,latency_branchless_l" << std::endl;

for (auto br : bench_results) {
Expand Down
21 changes: 15 additions & 6 deletions utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ static uint64_t timing(std::function<void()> fn) {

// Loads values from binary file into vector.
template <typename T>
static std::vector<T> load_data(const std::string& filename,
bool print = true) {
static std::vector<T> load_data(const std::string& filename, bool print = true, size_t sample_size = 0) {
std::vector<T> data;
const uint64_t ns = timing([&] {
std::ifstream in(filename, std::ios::binary);
Expand All @@ -46,8 +45,16 @@ static std::vector<T> load_data(const std::string& filename,
<< ms << " ms (" << static_cast<double>(data.size()) / 1000 / ms
<< " M values/s)" << std::endl;
}

return data;

if (sample_size > 0) {
std::random_device rd;
std::mt19937 gen(rd());
std::vector<T> sample;
std::sample(data.begin(), data.end()-1, std::back_inserter(sample), sample_size, gen);
return sample;
} else {
return data;
}
}

template <typename K>
Expand All @@ -58,8 +65,10 @@ auto get_data_stats(const std::vector<K>& data) {
}
const auto n = gaps.size();
double mean = std::accumulate(gaps.begin(), gaps.end(), 0.0)/n;
double sq_sum = std::inner_product(gaps.begin(), gaps.end(), gaps.begin(), 0.0);
double var = sq_sum/n - mean*mean;
double sq_sum = std::accumulate(gaps.begin(), gaps.end(), 0.0, [mean](double acc, double val) {
return acc + (val - mean) * (val - mean);
});
double var = sq_sum/n;
struct data_stats {double mean; double var;};
return data_stats {mean, var};
}
Expand Down

0 comments on commit 4d1064a

Please sign in to comment.