We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GPU: A100 gcc: 9.2
I my tests, I found that the insert_or_assign API will meet value error when the capacity is large. And here is the code to reproduce the error:
insert_or_assign
#include <math.h> #include <stdio.h> #include <stdlib.h> #include <algorithm> #include <array> #include <iostream> #include <thread> #include <vector> #include <map> #include <unordered_map> #include "merlin_hashtable.cuh" #include "merlin/types.cuh" #include "test_util.cuh" using K = uint64_t; using V = float; using S = uint64_t; using i64 = int64_t; using u64 = uint64_t; using f32 = float; using EvictStrategy = nv::merlin::EvictStrategy; using TableOptions = nv::merlin::HashTableOptions; template <class K, class S> struct ExportIfPredFunctor { __forceinline__ __device__ bool operator()(const K& key, S& score, const K& pattern, const S& threshold) { return score < threshold; } }; void test_export_batch_if_with_limited_size() { constexpr uint64_t CAP = 1llu << 24; size_t n0 = (1llu << 23) - 163; size_t n1 = (1llu << 23) + 221; size_t n2 = (1llu << 23) - 17; size_t dim = 64; size_t table_size = 0; i64 pattern = 0; u64 threshold = 40; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); TableOptions options; options.init_capacity = CAP; options.max_capacity = CAP; options.dim = dim; options.max_hbm_for_vectors = nv::merlin::GB(100); using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>; std::unique_ptr<Table> table = std::make_unique<Table>(); table->init(options); size_t* d_cnt = nullptr; CUDA_CHECK(cudaMallocAsync(&d_cnt, sizeof(size_t), stream)); CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream)); test_util::KVMSBuffer<i64, f32, u64> buffer0; buffer0.Reserve(n0, dim, stream); buffer0.ToRange(0, 1, stream); buffer0.Setscore((u64)15, stream); { test_util::KVMSBuffer<i64, f32, u64> buffer0_ev; buffer0_ev.Reserve(n0, dim, stream); buffer0_ev.ToZeros(stream); table->insert_or_assign(n0, buffer0.keys_ptr(), buffer0.values_ptr(), buffer0.scores_ptr(), stream, true, false); //table->insert_and_evict(n0, buffer0.keys_ptr(), buffer0.values_ptr(), buffer0.scores_ptr(), // buffer0_ev.keys_ptr(), buffer0_ev.values_ptr(), buffer0_ev.scores_ptr(), // d_cnt, stream, true, false); table_size = table->size(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); MERLIN_EXPECT_TRUE(table_size == n0, "Invalid table size."); } test_util::KVMSBuffer<i64, f32, u64> buffer1; buffer1.Reserve(n1, dim, stream); buffer1.ToRange(n0, 1, stream); buffer1.Setscore((u64)30, stream); { test_util::KVMSBuffer<i64, f32, u64> buffer1_ev; buffer1_ev.Reserve(n0, dim, stream); buffer1_ev.ToZeros(stream); table->insert_or_assign(n1, buffer1.keys_ptr(), buffer1.values_ptr(), buffer1.scores_ptr(), stream, true, false); //table->insert_and_evict(n0, buffer1.keys_ptr(), buffer1.values_ptr(), buffer1.scores_ptr(), // buffer1_ev.keys_ptr(), buffer1_ev.values_ptr(), buffer1_ev.scores_ptr(), // d_cnt, stream, true, false); table_size = table->size(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } test_util::KVMSBuffer<i64, f32, u64> buffer2; buffer2.Reserve(n2, dim, stream); buffer2.ToRange(n0 + n1, 1, stream); buffer2.Setscore((u64)45, stream); { test_util::KVMSBuffer<i64, f32, u64> buffer2_ev; buffer2_ev.Reserve(n0, dim, stream); buffer2_ev.ToZeros(stream); table->insert_or_assign(n2, buffer2.keys_ptr(), buffer2.values_ptr(), buffer2.scores_ptr(), stream, true, false); //table->insert_and_evict(n0, buffer2.keys_ptr(), buffer2.values_ptr(), buffer2.scores_ptr(), // buffer2_ev.keys_ptr(), buffer2_ev.values_ptr(), buffer2_ev.scores_ptr(), // d_cnt, stream, true, false); table_size = table->size(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); printf("final size: %zu, capacity: %zu\n", table_size, table->capacity()); } size_t h_cnt2 = 0; test_util::KVMSBuffer<i64, f32, u64> buffer_out; buffer_out.Reserve(CAP, dim, stream); buffer_out.ToZeros(stream); CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); bool use_pin = false; uint64_t t0 = test_util::getTimestamp(); cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); table->export_batch_if<ExportIfPredFunctor>(pattern, threshold, static_cast<size_t>(CAP), 0, d_cnt, buffer_out.keys_ptr(!use_pin), buffer_out.values_ptr(!use_pin), buffer_out.scores_ptr(!use_pin), stream); cudaEventRecord(stop); CUDA_CHECK(cudaMemcpyAsync(&h_cnt2, d_cnt, sizeof(size_t), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); printf("final h_cnt2: %zu\n", h_cnt2); float cu_cost = 0; cudaEventElapsedTime(&cu_cost, start, stop); uint64_t t1 = test_util::getTimestamp(); printf("final h_cnt2: %zu, cost: %zu, cu_cost: %f\n", h_cnt2, t1 - t0, cu_cost); if (!use_pin) { buffer_out.SyncData(false, stream); } CUDA_CHECK(cudaStreamSynchronize(stream)); std::unordered_map<i64, u64> record; for (size_t i = 0; i < h_cnt2; i++) { i64 key = buffer_out.keys_ptr(false)[i]; u64 score = buffer_out.scores_ptr(false)[i]; MERLIN_EXPECT_TRUE(score < threshold, ""); record[key] = score; for (int j = 0; j < dim; j++) { f32 value = buffer_out.values_ptr(false)[i * dim + j]; //MERLIN_EXPECT_TRUE(key == static_cast<i64>(value), ""); if (key != static_cast<i64>(value) && j == 0) { printf("----> check key(%lld) != value(%f) at i: %zu, j: %d\n", key, value, i, j); } } } MERLIN_EXPECT_TRUE(record.size() == h_cnt2, ""); printf("record: %zu\n", record.size()); printf("n0+n1: %zu\n", n0+n1); printf("n0+n1+n2: %zu\n", n0+n1+n2); printf("done\n"); } int main() { test_export_batch_if_with_limited_size(); return 0; }
In the test cast, it expects value equals to key for every vec. But it will get invalid outputs in stdout.
final size: 16777216, capacity: 16777216 final h_cnt2: 8388625 final h_cnt2: 8388625, cost: 24, cu_cost: 24.695711 ----> check key(16777235) != value(16777236.000000) at i: 376767, j: 0 ----> check key(16777259) != value(16777260.000000) at i: 387306, j: 0 ----> check key(16777241) != value(16777240.000000) at i: 490218, j: 0 ----> check key(16777267) != value(16777268.000000) at i: 647383, j: 0 ----> check key(16777239) != value(16777240.000000) at i: 1051866, j: 0 ----> check key(16777243) != value(16777244.000000) at i: 1094834, j: 0 ----> check key(16777251) != value(16777252.000000) at i: 1239658, j: 0 ----> check key(16777223) != value(16777224.000000) at i: 1326925, j: 0 ----> check key(16777273) != value(16777272.000000) at i: 1553255, j: 0 ----> check key(16777245) != value(16777244.000000) at i: 2216367, j: 0 ----> check key(16777229) != value(16777228.000000) at i: 3559115, j: 0 ----> check key(16777271) != value(16777272.000000) at i: 4042540, j: 0 ----> check key(16777237) != value(16777236.000000) at i: 4183398, j: 0 ----> check key(16777233) != value(16777232.000000) at i: 4776781, j: 0 ----> check key(16777225) != value(16777224.000000) at i: 5460733, j: 0 ----> check key(16777227) != value(16777228.000000) at i: 5616921, j: 0 ----> check key(16777221) != value(16777220.000000) at i: 6148795, j: 0 ----> check key(16777261) != value(16777260.000000) at i: 6203695, j: 0 ----> check key(16777249) != value(16777248.000000) at i: 6227725, j: 0 ----> check key(16777255) != value(16777256.000000) at i: 6274915, j: 0 ----> check key(16777257) != value(16777256.000000) at i: 6398638, j: 0 ----> check key(16777217) != value(16777216.000000) at i: 6431965, j: 0 ----> check key(16777263) != value(16777264.000000) at i: 6547061, j: 0 ----> check key(16777247) != value(16777248.000000) at i: 6741382, j: 0 ----> check key(16777231) != value(16777232.000000) at i: 7109209, j: 0 ----> check key(16777253) != value(16777252.000000) at i: 7387730, j: 0 ----> check key(16777269) != value(16777268.000000) at i: 8151589, j: 0 ----> check key(16777219) != value(16777220.000000) at i: 8273935, j: 0 record: 8388625 n0+n1: 16777274 n0+n1+n2: 25165865
The error will not occur if using insert_and_evict or the capacity is 1<<23 or smaller.
insert_and_evict
The text was updated successfully, but these errors were encountered:
No branches or pull requests
GPU: A100
gcc: 9.2
I my tests, I found that the
insert_or_assign
API will meet value error when the capacity is large. And here is the code to reproduce the error:In the test cast, it expects value equals to key for every vec. But it will get invalid outputs in stdout.
The error will not occur if using
insert_and_evict
or the capacity is 1<<23 or smaller.The text was updated successfully, but these errors were encountered: