Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] Value error when using insert_or_assign in 1<<24 capacity #203

Open
Lifann opened this issue Aug 20, 2024 · 0 comments
Open

[BUG] Value error when using insert_or_assign in 1<<24 capacity #203

Lifann opened this issue Aug 20, 2024 · 0 comments

Comments

@Lifann
Copy link
Collaborator

Lifann commented Aug 20, 2024

GPU: A100
gcc: 9.2

I my tests, I found that the insert_or_assign API will meet value error when the capacity is large. And here is the code to reproduce the error:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <vector>
#include <map>
#include <unordered_map>
#include "merlin_hashtable.cuh"
#include "merlin/types.cuh"
#include "test_util.cuh"

using K = uint64_t;
using V = float;
using S = uint64_t;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    return score < threshold;
  }
};

void test_export_batch_if_with_limited_size() {
  constexpr uint64_t CAP = 1llu << 24;
  size_t n0 = (1llu << 23) - 163;
  size_t n1 = (1llu << 23) + 221;
  size_t n2 = (1llu << 23) - 17;
  size_t dim = 64;
  size_t table_size = 0;
  i64 pattern = 0;
  u64 threshold = 40;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  TableOptions options;
  options.init_capacity = CAP;
  options.max_capacity = CAP;
  options.dim = dim;
  options.max_hbm_for_vectors = nv::merlin::GB(100);
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  size_t* d_cnt = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_cnt, sizeof(size_t), stream));
  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));

  test_util::KVMSBuffer<i64, f32, u64> buffer0;
  buffer0.Reserve(n0, dim, stream);
  buffer0.ToRange(0, 1, stream);
  buffer0.Setscore((u64)15, stream);
  {
    test_util::KVMSBuffer<i64, f32, u64> buffer0_ev;
    buffer0_ev.Reserve(n0, dim, stream);
    buffer0_ev.ToZeros(stream);
    table->insert_or_assign(n0, buffer0.keys_ptr(), buffer0.values_ptr(), buffer0.scores_ptr(), stream, true, false);
    //table->insert_and_evict(n0, buffer0.keys_ptr(), buffer0.values_ptr(), buffer0.scores_ptr(),
    //                        buffer0_ev.keys_ptr(), buffer0_ev.values_ptr(), buffer0_ev.scores_ptr(),
    //                        d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    MERLIN_EXPECT_TRUE(table_size == n0, "Invalid table size.");
  }

  test_util::KVMSBuffer<i64, f32, u64> buffer1;
  buffer1.Reserve(n1, dim, stream);
  buffer1.ToRange(n0, 1, stream);
  buffer1.Setscore((u64)30, stream);
        {
    test_util::KVMSBuffer<i64, f32, u64> buffer1_ev;
    buffer1_ev.Reserve(n0, dim, stream);
    buffer1_ev.ToZeros(stream);
    table->insert_or_assign(n1, buffer1.keys_ptr(), buffer1.values_ptr(), buffer1.scores_ptr(), stream, true, false);
    //table->insert_and_evict(n0, buffer1.keys_ptr(), buffer1.values_ptr(), buffer1.scores_ptr(),
    //                        buffer1_ev.keys_ptr(), buffer1_ev.values_ptr(), buffer1_ev.scores_ptr(),
    //                        d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
        }

  test_util::KVMSBuffer<i64, f32, u64> buffer2;
  buffer2.Reserve(n2, dim, stream);
  buffer2.ToRange(n0 + n1, 1, stream);
  buffer2.Setscore((u64)45, stream);
        {
    test_util::KVMSBuffer<i64, f32, u64> buffer2_ev;
    buffer2_ev.Reserve(n0, dim, stream);
    buffer2_ev.ToZeros(stream);
    table->insert_or_assign(n2, buffer2.keys_ptr(), buffer2.values_ptr(), buffer2.scores_ptr(), stream, true, false);
    //table->insert_and_evict(n0, buffer2.keys_ptr(), buffer2.values_ptr(), buffer2.scores_ptr(),
    //                        buffer2_ev.keys_ptr(), buffer2_ev.values_ptr(), buffer2_ev.scores_ptr(),
    //                        d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    printf("final size: %zu, capacity: %zu\n", table_size, table->capacity());
        }

  size_t h_cnt2 = 0;

  test_util::KVMSBuffer<i64, f32, u64> buffer_out;
  buffer_out.Reserve(CAP, dim, stream);
  buffer_out.ToZeros(stream);

  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  bool use_pin = false;

  uint64_t t0 = test_util::getTimestamp();
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  table->export_batch_if<ExportIfPredFunctor>(pattern, threshold,
                                              static_cast<size_t>(CAP), 0,
                                              d_cnt, buffer_out.keys_ptr(!use_pin),
                                              buffer_out.values_ptr(!use_pin),
                                              buffer_out.scores_ptr(!use_pin),
                                              stream);
  cudaEventRecord(stop);
  CUDA_CHECK(cudaMemcpyAsync(&h_cnt2, d_cnt, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("final h_cnt2: %zu\n", h_cnt2);

  float cu_cost = 0;
  cudaEventElapsedTime(&cu_cost, start, stop);
  uint64_t t1 = test_util::getTimestamp();
  printf("final h_cnt2: %zu, cost: %zu, cu_cost: %f\n", h_cnt2, t1 - t0, cu_cost);

  if (!use_pin) {
    buffer_out.SyncData(false, stream);
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));

  std::unordered_map<i64, u64> record;
  for (size_t i = 0; i < h_cnt2; i++) {
    i64 key = buffer_out.keys_ptr(false)[i];
    u64 score = buffer_out.scores_ptr(false)[i];
    MERLIN_EXPECT_TRUE(score < threshold, "");
    record[key] = score;
    for (int j = 0; j < dim; j++) {
      f32 value = buffer_out.values_ptr(false)[i * dim + j];
      //MERLIN_EXPECT_TRUE(key == static_cast<i64>(value), "");
      if (key != static_cast<i64>(value) && j == 0) {
        printf("----> check key(%lld) != value(%f) at i: %zu, j: %d\n", key, value, i, j);
      }
    }
  }
  MERLIN_EXPECT_TRUE(record.size() == h_cnt2, "");
  printf("record: %zu\n", record.size());
  printf("n0+n1: %zu\n", n0+n1);
  printf("n0+n1+n2: %zu\n", n0+n1+n2);
  printf("done\n");
}


int main() {
  test_export_batch_if_with_limited_size();
  return 0;
}

In the test cast, it expects value equals to key for every vec. But it will get invalid outputs in stdout.

final size: 16777216, capacity: 16777216
final h_cnt2: 8388625
final h_cnt2: 8388625, cost: 24, cu_cost: 24.695711
----> check key(16777235) != value(16777236.000000) at i: 376767, j: 0
----> check key(16777259) != value(16777260.000000) at i: 387306, j: 0
----> check key(16777241) != value(16777240.000000) at i: 490218, j: 0
----> check key(16777267) != value(16777268.000000) at i: 647383, j: 0
----> check key(16777239) != value(16777240.000000) at i: 1051866, j: 0
----> check key(16777243) != value(16777244.000000) at i: 1094834, j: 0
----> check key(16777251) != value(16777252.000000) at i: 1239658, j: 0
----> check key(16777223) != value(16777224.000000) at i: 1326925, j: 0
----> check key(16777273) != value(16777272.000000) at i: 1553255, j: 0
----> check key(16777245) != value(16777244.000000) at i: 2216367, j: 0
----> check key(16777229) != value(16777228.000000) at i: 3559115, j: 0
----> check key(16777271) != value(16777272.000000) at i: 4042540, j: 0
----> check key(16777237) != value(16777236.000000) at i: 4183398, j: 0
----> check key(16777233) != value(16777232.000000) at i: 4776781, j: 0
----> check key(16777225) != value(16777224.000000) at i: 5460733, j: 0
----> check key(16777227) != value(16777228.000000) at i: 5616921, j: 0
----> check key(16777221) != value(16777220.000000) at i: 6148795, j: 0
----> check key(16777261) != value(16777260.000000) at i: 6203695, j: 0
----> check key(16777249) != value(16777248.000000) at i: 6227725, j: 0
----> check key(16777255) != value(16777256.000000) at i: 6274915, j: 0
----> check key(16777257) != value(16777256.000000) at i: 6398638, j: 0
----> check key(16777217) != value(16777216.000000) at i: 6431965, j: 0
----> check key(16777263) != value(16777264.000000) at i: 6547061, j: 0
----> check key(16777247) != value(16777248.000000) at i: 6741382, j: 0
----> check key(16777231) != value(16777232.000000) at i: 7109209, j: 0
----> check key(16777253) != value(16777252.000000) at i: 7387730, j: 0
----> check key(16777269) != value(16777268.000000) at i: 8151589, j: 0
----> check key(16777219) != value(16777220.000000) at i: 8273935, j: 0
record: 8388625
n0+n1: 16777274
n0+n1+n2: 25165865

The error will not occur if using insert_and_evict or the capacity is 1<<23 or smaller.

@Lifann Lifann changed the title [BUG] Value error of insert_or_assign [BUG] Value error when using insert_or_assign in 1<<24 capacity Aug 20, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant