Skip to content

Commit

Permalink
[GpuGraph] fix kernel overflow (PaddlePaddle#138)
Browse files Browse the repository at this point in the history
* optimize mem in  uniq slot feature

* cherry-pick var slot_feature

* fix kernel overflow && add max feature num flag

Co-authored-by: huwei02 <[email protected]>
  • Loading branch information
Thunderbrook and huwei02 authored Oct 18, 2022
1 parent 4e8b290 commit dde1573
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 6 deletions.
5 changes: 3 additions & 2 deletions paddle/fluid/distributed/ps/table/common_graph_table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
DECLARE_bool(graph_load_in_parallel);
DECLARE_bool(graph_get_neighbor_id);
DECLARE_int32(gpugraph_storage_mode);
DECLARE_uint64(gpugraph_slot_feasign_max_num);

namespace paddle {
namespace distributed {
Expand Down Expand Up @@ -2010,8 +2011,8 @@ int GraphTable::parse_feature(int idx,
thread_local std::vector<paddle::string::str_ptr> fea_fields;
fea_fields.clear();
c = feature_separator_.at(0);
paddle::string::split_string_ptr(fields[1].ptr, fields[1].len, c, &fea_fields);

paddle::string::split_string_ptr(fields[1].ptr, fields[1].len, c, &fea_fields, FLAGS_gpugraph_slot_feasign_max_num);
std::string name = fields[0].to_string();
auto it = feat_id_map[idx].find(name);
if (it != feat_id_map[idx].end()) {
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/framework/fleet/heter_ps/feature_value.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,11 @@ __global__ void PullDedupCopy(const size_t N,
const int64_t* slot_lens,
uint64_t max_val_size,
const int* slot_dims,
const int hidden,
const size_t hidden,
const int* key2slot,
const uint32_t* restore_idx,
TAccess accessor) {
CUDA_KERNEL_LOOP(idx, N) {
CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) {
int i = idx / hidden;
int off = idx % hidden;

Expand Down Expand Up @@ -158,7 +158,7 @@ __global__ void PushMergeCopyAtomic(const size_t N,
const uint32_t* d_restore_idx,
size_t grad_value_size,
TAccess accessor) {
CUDA_KERNEL_LOOP(idx, N) {
CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) {
int i = idx / hidden;
int off = idx % hidden;
// filter 0 keys
Expand Down Expand Up @@ -224,7 +224,7 @@ __global__ void PushMergeCopy(const size_t N,
const uint32_t* d_sort_cnt,
size_t grad_value_size,
TAccess accessor) {
CUDA_KERNEL_LOOP(idx, N) {
CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) {
int i = idx / hidden;
int off = idx % hidden;
// filter 0 keys
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/platform/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,10 @@ PADDLE_DEFINE_EXPORTED_uint64(
gpugraph_merge_grads_segment_size,
128,
"segment size with segment gradient merge, default 128");
PADDLE_DEFINE_EXPORTED_uint64(
gpugraph_slot_feasign_max_num,
5,
"max feasign number in one slot, default 5");
PADDLE_DEFINE_EXPORTED_int32(
gpugraph_dedup_pull_push_mode,
0,
Expand Down
36 changes: 36 additions & 0 deletions paddle/utils/string/string_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,42 @@ inline int split_string_ptr(const char* str,
return num;
}

// Splits the buffer [str, str + len) on `delim`, appending each token to
// `values` as a non-owning (pointer, length) view into the original buffer,
// and stops once `max_num` tokens have been produced (bounds memory use for
// pathologically long inputs, e.g. oversized slot-feature lists).
//
// Observable semantics of this implementation:
//   - a delimiter at the very start of the buffer yields one empty token;
//   - after a token, a run of consecutive delimiters is collapsed;
//   - trailing delimiters do not produce a trailing empty token.
//
// Returns the number of tokens appended to `values`.
inline int split_string_ptr(const char* str,
                            size_t len,
                            char delim,
                            std::vector<str_ptr>* values,
                            int max_num) {
  if (len == 0) {  // `len` is unsigned, so "<= 0" was just "== 0"
    return 0;
  }

  int num = 0;
  const char* p = str;
  const char* end = str + len;
  const char* last = str;
  while (p < end) {
    if (*p != delim) {
      ++p;
      continue;
    }
    values->emplace_back(last, static_cast<size_t>(p - last));
    ++num;
    ++p;
    if (num >= max_num) {
      return num;
    }
    // Skip a run of consecutive delimiters. The scan MUST stay bounded by
    // `end`: the input is a (ptr, len) span with no NUL-terminator
    // guarantee, so dereferencing `*p` at or past `end` is an
    // out-of-bounds read (previously triggered whenever the input ended
    // with one or more delimiters).
    while (p < end && *p == delim) {
      ++p;
    }
    last = p;
  }
  if (p > last) {
    values->emplace_back(last, static_cast<size_t>(p - last));
    ++num;
  }
  return num;
}
// A helper class for reading lines from file. A line buffer is maintained. It
// doesn't need to know the maximum possible length of a line.

Expand Down

0 comments on commit dde1573

Please sign in to comment.