add show_mem (PaddlePaddle#249)
Co-authored-by: root <[email protected]>
2 people authored and danleifeng committed Sep 12, 2023
1 parent 937ebbe commit e7cb818
Showing 6 changed files with 48 additions and 37 deletions.
34 changes: 0 additions & 34 deletions paddle/fluid/framework/data_feed.cu
@@ -1540,40 +1540,6 @@ int GraphDataGenerator::FillSlotFeature(uint64_t *d_walk, size_t key_num) {
  return 0;
}

int GraphDataGenerator::FillFeatureBuf(uint64_t *d_walk,
                                       uint64_t *d_feature,
                                       size_t key_num) {
  platform::CUDADeviceGuard guard(gpuid_);

  auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
  int ret = gpu_graph_ptr->get_feature_of_nodes(
      gpuid_,
      d_walk,
      d_feature,
      key_num,
      slot_num_,
      reinterpret_cast<int *>(d_slot_feature_num_map_->ptr()),
      fea_num_per_node_);
  return ret;
}

int GraphDataGenerator::FillFeatureBuf(
    std::shared_ptr<phi::Allocation> d_walk,
    std::shared_ptr<phi::Allocation> d_feature) {
  platform::CUDADeviceGuard guard(gpuid_);

  auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
  int ret = gpu_graph_ptr->get_feature_of_nodes(
      gpuid_,
      reinterpret_cast<uint64_t *>(d_walk->ptr()),
      reinterpret_cast<uint64_t *>(d_feature->ptr()),
      buf_size_,
      slot_num_,
      reinterpret_cast<int *>(d_slot_feature_num_map_->ptr()),
      fea_num_per_node_);
  return ret;
}

// For deepwalk mode, try to insert into the table; 0 means the insert succeeded, 1 means it failed.
// For sage mode, try to insert into the table; if the table runs out of room, clear it and re-insert. The return value is not used in this mode.
int GraphDataGenerator::InsertTable(
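To make the contract in the comment above concrete, here is a minimal sketch of the two insertion behaviors. The InsertTable body is collapsed in this diff, so ToyTable and insert_table_sketch are invented for illustration; they are not PaddlePaddle types.

#include <cstddef>
#include <cstdint>
#include <unordered_set>

// Toy stand-in for the GPU-side hash table; invented for illustration.
struct ToyTable {
  std::unordered_set<uint64_t> keys_;
  size_t capacity_ = 1024;

  bool try_insert(const uint64_t* keys, size_t n) {
    if (keys_.size() + n > capacity_) return false;  // table is full
    keys_.insert(keys, keys + n);
    return true;
  }
  void clear() { keys_.clear(); }
};

// deepwalk mode: a single attempt; returns 0 on success, 1 on failure.
// sage mode: when the table runs out of room, clear it and re-insert;
// the caller ignores the return value in this mode.
int insert_table_sketch(ToyTable* table, const uint64_t* keys, size_t n,
                        bool sage_mode) {
  if (!sage_mode) {
    return table->try_insert(keys, n) ? 0 : 1;
  }
  if (!table->try_insert(keys, n)) {
    table->clear();
    table->try_insert(keys, n);
  }
  return 0;
}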
3 changes: 0 additions & 3 deletions paddle/fluid/framework/data_feed.h
@@ -943,9 +943,6 @@ class GraphDataGenerator {
  int FillInferBuf();
  void DoWalkandSage();
  int FillSlotFeature(uint64_t* d_walk);
  int FillFeatureBuf(uint64_t* d_walk, uint64_t* d_feature, size_t key_num);
  int FillFeatureBuf(std::shared_ptr<phi::Allocation> d_walk,
                     std::shared_ptr<phi::Allocation> d_feature);
  void FillOneStep(uint64_t* start_ids,
                   int etype_id,
                   uint64_t* walk,
40 changes: 40 additions & 0 deletions paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h
@@ -134,6 +134,46 @@ inline void debug_gpu_memory_info(const char* desc) {
<< "desc=" << desc;
}
}

inline void show_gpu_mem(const char* desc) {
  CudaDeviceRestorer r;

  int device_num = 0;
  auto err = cudaGetDeviceCount(&device_num);
  PADDLE_ENFORCE_EQ(
      err,
      cudaSuccess,
      platform::errors::InvalidArgument("cudaGetDeviceCount failed!"));

  size_t avail{0};
  size_t total{0};
  for (int i = 0; i < device_num; ++i) {
    err = cudaSetDevice(i);
    PADDLE_ENFORCE_EQ(
        err,
        cudaSuccess,
        platform::errors::InvalidArgument("cudaSetDevice failed!"));
    err = cudaMemGetInfo(&avail, &total);
    PADDLE_ENFORCE_EQ(
        err,
        cudaSuccess,
        platform::errors::InvalidArgument("cudaMemGetInfo failed!"));
    VLOG(0) << "[" << desc << "] hbm on device " << i << ", "
            << "avail=" << avail / 1024.0 / 1024.0 / 1024.0 << "g, "
            << "total=" << total / 1024.0 / 1024.0 / 1024.0 << "g";
  }
}

inline void show_cpu_mem(const char* desc) {
  // Values reported in MB; assumes the 4096-byte page size that
  // /proc/self/statm uses on typical Linux systems.
  long virtual_mem = 0, resident_mem = 0;

  FILE* fp = fopen("/proc/self/statm", "r");
  if (fp != NULL) {
    if (fscanf(fp, "%ld %ld", &virtual_mem, &resident_mem) == 2) {
      virtual_mem = virtual_mem * 4096 / 1000000;
      resident_mem = resident_mem * 4096 / 1000000;
    }
    fclose(fp);
  }

  VLOG(0) << "[" << desc << "] cpu mem used " << resident_mem << "MB, "
          << "virtual " << virtual_mem << "MB";
}

}; // namespace framework
}; // namespace paddle
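A note on the unit math in show_cpu_mem: /proc/self/statm reports sizes in pages, and the helper above multiplies by a hard-coded 4096-byte page before dividing by 10^6 to get decimal MB. The standalone sketch below performs the same readout but queries the page size via sysconf rather than assuming it; this is illustrative only and not part of the commit.

#include <cstdio>
#include <unistd.h>

int main() {
  long pages_virtual = 0, pages_resident = 0;
  std::FILE* fp = std::fopen("/proc/self/statm", "r");
  if (fp != nullptr) {
    if (std::fscanf(fp, "%ld %ld", &pages_virtual, &pages_resident) == 2) {
      const long page_size = sysconf(_SC_PAGESIZE);  // typically 4096 on Linux
      std::printf("virtual=%ldMB resident=%ldMB\n",
                  pages_virtual * page_size / 1000000,
                  pages_resident * page_size / 1000000);
    }
    std::fclose(fp);
  }
  return 0;
}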
6 changes: 6 additions & 0 deletions paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
@@ -752,6 +752,12 @@ void GraphGpuWrapper::finalize() {
  reinterpret_cast<GpuPsGraphTable *>(graph_table)->show_table_collisions();
}

void GraphGpuWrapper::show_mem(const char* msg) {
  show_cpu_mem(msg);
  show_gpu_mem(msg);
}

// edge table
void GraphGpuWrapper::upload_batch(int table_type,
                                   int slice_num,
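A hypothetical call site for the new entry point, following the GetInstance() pattern already used in data_feed.cu above. Only show_mem and GetInstance come from this codebase; the surrounding helper and tag string are invented for the example.

#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"

// log_mem_after_upload is a hypothetical helper; the tag string only
// labels the VLOG output emitted by show_cpu_mem/show_gpu_mem.
void log_mem_after_upload() {
  auto gpu_graph_ptr = paddle::framework::GraphGpuWrapper::GetInstance();
  gpu_graph_ptr->show_mem("after upload_batch");
}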
1 change: 1 addition & 0 deletions paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h
@@ -198,6 +198,7 @@ class GraphGpuWrapper {
  std::unordered_map<int, int>& get_graph_type_to_index();
  std::string& get_node_type_size(std::string first_node_type);
  std::string& get_edge_type_size();
  void show_mem(const char* msg);

  std::unordered_map<std::string, int> edge_to_id, node_to_id;
  std::vector<std::string> id_to_feature, id_to_edge;
1 change: 1 addition & 0 deletions paddle/fluid/pybind/fleet_py.cc
@@ -389,6 +389,7 @@ void BindGraphGpuWrapper(py::module* m) {
                         bool>(
              &GraphGpuWrapper::load_edge_file))
      .def("load_node_and_edge", &GraphGpuWrapper::load_node_and_edge)
      .def("show_mem", &GraphGpuWrapper::show_mem)
      .def("upload_batch",
           py::overload_cast<int, int, const std::string&>(
               &GraphGpuWrapper::upload_batch))
