Skip to content

Commit

Permalink
add excluded_train_pair and infer_node_type (PaddlePaddle#187)
Browse files (browse the repository at this point in the history)
  • Loading branch information
huwei02 authored and zmxdream committed Dec 24, 2022
1 parent e2aeb29 commit 20852e4
Show file tree
Hide file tree
Showing 8 changed files with 212 additions and 83 deletions.
209 changes: 148 additions & 61 deletions paddle/fluid/framework/data_feed.cu

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions paddle/fluid/framework/data_feed.h
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,9 @@ class GraphDataGenerator {
int FillFeatureBuf(std::shared_ptr<phi::Allocation> d_walk,
std::shared_ptr<phi::Allocation> d_feature);
void FillOneStep(uint64_t* start_ids,
int etype_id,
uint64_t* walk,
uint8_t *walk_ntype,
int len,
NeighborSampleResult& sample_res, // NOLINT
int cur_degree,
Expand Down Expand Up @@ -999,6 +1001,8 @@ class GraphDataGenerator {
std::shared_ptr<phi::Allocation> d_train_metapath_keys_;

std::shared_ptr<phi::Allocation> d_walk_;
std::shared_ptr<phi::Allocation> d_walk_ntype_;
std::shared_ptr<phi::Allocation> d_excluded_train_pair_;
std::shared_ptr<phi::Allocation> d_feature_list_;
std::shared_ptr<phi::Allocation> d_feature_;
std::shared_ptr<phi::Allocation> d_len_per_row_;
Expand Down Expand Up @@ -1038,6 +1042,7 @@ class GraphDataGenerator {
std::vector<std::vector<std::shared_ptr<phi::Allocation>>> graph_edges_vec_;
std::vector<std::vector<std::vector<int>>> edges_split_num_vec_;

int excluded_train_pair_len_;
int64_t reindex_table_size_;
int sage_batch_count_;
int sage_batch_num_;
Expand Down Expand Up @@ -1067,6 +1072,8 @@ class GraphDataGenerator {
int total_row_;
size_t infer_node_start_;
size_t infer_node_end_;
std::set<int> infer_node_type_index_set_;
std::string infer_node_type_;
};

class DataFeed {
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/framework/data_feed.proto
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ message GraphConfig {
optional string samples = 12;
optional int64 train_table_cap = 13 [ default = 80000 ];
optional int64 infer_table_cap = 14 [ default = 80000 ];
optional string excluded_train_pair = 15;
optional string infer_node_type = 16;
}

message DataFeedDesc {
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,7 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2(
return result;
}
// only for graphsage
NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type(
int gpu_id,
int edge_type_len,
Expand Down
57 changes: 40 additions & 17 deletions paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ void GraphGpuWrapper::set_device(std::vector<int> ids) {
}

void GraphGpuWrapper::init_conf(const std::string &first_node_type,
const std::string &meta_path) {
const std::string &meta_path,
const std::string &excluded_train_pair) {
static std::mutex mutex;
{
std::lock_guard<std::mutex> lock(mutex);
Expand All @@ -45,30 +46,52 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
paddle::string::split_string<std::string>(first_node_type, ";");
VLOG(2) << "node_types: " << first_node_type;
for (auto &type : node_types) {
auto iter = feature_to_id.find(type);
auto iter = node_to_id.find(type);
PADDLE_ENFORCE_NE(iter,
feature_to_id.end(),
node_to_id.end(),
platform::errors::NotFound(
"(%s) is not found in feature_to_id.", type));
VLOG(2) << "feature_to_id[" << type << "] = " << iter->second;
"(%s) is not found in node_to_id.", type));
VLOG(2) << "node_to_id[" << type << "] = " << iter->second;
first_node_type_.push_back(iter->second);
}
meta_path_.resize(first_node_type_.size());
auto meta_paths = paddle::string::split_string<std::string>(meta_path, ";");

for (size_t i = 0; i < meta_paths.size(); i++) {
auto path = meta_paths[i];
auto nodes = paddle::string::split_string<std::string>(path, "-");
auto edges = paddle::string::split_string<std::string>(path, "-");
for (auto &edge : edges) {
auto iter = edge_to_id.find(edge);
PADDLE_ENFORCE_NE(iter,
edge_to_id.end(),
platform::errors::NotFound(
"(%s) is not found in edge_to_id.", edge));
VLOG(2) << "edge_to_id[" << edge << "] = " << iter->second;
meta_path_[i].push_back(iter->second);
if (edge_to_node_map_.find(iter->second) == edge_to_node_map_.end()) {
auto nodes = paddle::string::split_string<std::string>(edge, "2");
uint64_t src_node_id = node_to_id.find(nodes[0])->second;
uint64_t dst_node_id = node_to_id.find(nodes[1])->second;
edge_to_node_map_[iter->second] = src_node_id << 32 | dst_node_id;
}
}
}

auto paths = paddle::string::split_string<std::string>(excluded_train_pair, ";");
VLOG(2) << "excluded_train_pair[" << excluded_train_pair << "]";
for (auto &path: paths) {
auto nodes = paddle::string::split_string<std::string>(path, "2");
for (auto &node : nodes) {
auto iter = edge_to_id.find(node);
auto iter = node_to_id.find(node);
PADDLE_ENFORCE_NE(iter,
edge_to_id.end(),
platform::errors::NotFound(
"(%s) is not found in edge_to_id.", node));
VLOG(2) << "edge_to_id[" << node << "] = " << iter->second;
meta_path_[i].push_back(iter->second);
excluded_train_pair_.push_back(iter->second);
}
}

int max_dev_id = 0;
for (size_t i = 0; i < device_id_mapping.size(); i++) {
if (device_id_mapping[i] > max_dev_id) {
Expand All @@ -85,11 +108,11 @@ void GraphGpuWrapper::init_conf(const std::string &first_node_type,
auto &finish_node_type = finish_node_type_[i];
finish_node_type.clear();

for (size_t idx = 0; idx < feature_to_id.size(); idx++) {
for (size_t idx = 0; idx < node_to_id.size(); idx++) {
infer_node_type_start[idx] = 0;
}
for (auto &type : node_types) {
auto iter = feature_to_id.find(type);
auto iter = node_to_id.find(type);
node_type_start[iter->second] = 0;
infer_node_type_start[iter->second] = 0;
}
Expand Down Expand Up @@ -188,7 +211,7 @@ void GraphGpuWrapper::init_metapath(std::string cur_metapath,
int first_node_idx;
std::string first_node =
paddle::string::split_string<std::string>(cur_metapath_, "2")[0];
auto it = feature_to_id.find(first_node);
auto it = node_to_id.find(first_node);
first_node_idx = it->second;
d_graph_train_total_keys_.resize(thread_num);
h_graph_train_keys_len_.resize(thread_num);
Expand Down Expand Up @@ -309,8 +332,8 @@ void GraphGpuWrapper::set_up_types(const std::vector<std::string> &edge_types,
}
id_to_feature = node_types;
for (size_t table_id = 0; table_id < node_types.size(); table_id++) {
int res = feature_to_id.size();
feature_to_id[node_types[table_id]] = res;
int res = node_to_id.size();
node_to_id[node_types[table_id]] = res;
}
table_feat_mapping.resize(node_types.size());
this->table_feat_conf_feat_name.resize(node_types.size());
Expand Down Expand Up @@ -394,7 +417,7 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) {

std::string params = "n" + name;

if (feature_to_id.find(name) != feature_to_id.end()) {
if (node_to_id.find(name) != node_to_id.end()) {
reinterpret_cast<GpuPsGraphTable *>(graph_table)
->cpu_graph_table_->Load(std::string(filepath), params);
}
Expand Down Expand Up @@ -422,8 +445,8 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name,
std::string feat_name,
std::string feat_dtype,
int feat_shape) {
if (feature_to_id.find(table_name) != feature_to_id.end()) {
int idx = feature_to_id[table_name];
if (node_to_id.find(table_name) != node_to_id.end()) {
int idx = node_to_id[table_name];
if (table_feat_mapping[idx].find(feat_name) ==
table_feat_mapping[idx].end()) {
int res = table_feat_mapping[idx].size();
Expand Down Expand Up @@ -776,7 +799,7 @@ std::string &GraphGpuWrapper::get_node_type_size(std::string first_node_type) {
auto &type_to_index = get_graph_type_to_index();
std::vector<std::string> node_type_size;
for (auto node : uniq_first_node_) {
auto it = feature_to_id.find(node);
auto it = node_to_id.find(node);
auto first_node_idx = it->second;
size_t f_idx = type_to_index[first_node_idx];
int type_total_key_size = graph_all_type_total_keys[f_idx].size();
Expand Down
10 changes: 8 additions & 2 deletions paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ class GraphGpuWrapper {
}
static std::shared_ptr<GraphGpuWrapper> s_instance_;
void init_conf(const std::string& first_node_type,
const std::string& meta_path);
const std::string& meta_path,
const std::string& excluded_train_pair);
void initialize();
void finalize();
void set_device(std::vector<int> ids);
Expand Down Expand Up @@ -160,7 +161,7 @@ class GraphGpuWrapper {
std::string& get_node_type_size(std::string first_node_type);
std::string& get_edge_type_size();

std::unordered_map<std::string, int> edge_to_id, feature_to_id;
std::unordered_map<std::string, int> edge_to_id, node_to_id;
std::vector<std::string> id_to_feature, id_to_edge;
std::vector<std::unordered_map<std::string, int>> table_feat_mapping;
std::vector<std::vector<std::string>> table_feat_conf_feat_name;
Expand All @@ -175,6 +176,7 @@ class GraphGpuWrapper {
std::string feature_separator_ = std::string(" ");
bool conf_initialized_ = false;
std::vector<int> first_node_type_;
std::vector<uint8_t> excluded_train_pair_;
std::vector<std::vector<int>> meta_path_;

std::vector<std::set<int>> finish_node_type_;
Expand All @@ -187,6 +189,10 @@ class GraphGpuWrapper {
std::vector<size_t> h_graph_train_keys_len_;
std::vector<std::vector<std::shared_ptr<phi::Allocation>>>
d_graph_all_type_total_keys_;
std::map<uint64_t, // edge_id
uint64_t // src_node_id << 32 | dst_node_id
> edge_to_node_map_;

std::vector<std::vector<uint64_t>> h_graph_all_type_keys_len_;
std::string slot_feature_separator_ = std::string(" ");

Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1444,7 +1444,7 @@ void PSGPUWrapper::SparseTableToHbm() {
gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_);
gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID());
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
auto node_to_id = gpu_graph_ptr->feature_to_id;
auto node_to_id = gpu_graph_ptr->node_to_id;
auto edge_to_id = gpu_graph_ptr->edge_to_id;
std::vector<uint64_t> vec_data = gpu_graph_ptr->get_graph_total_keys();

Expand Down
7 changes: 5 additions & 2 deletions python/paddle/fluid/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,8 +1122,11 @@ def set_graph_config(self, config):
"train_table_cap", 800000
)
self.proto_desc.graph_config.infer_table_cap = config.get(
"infer_table_cap", 800000
)
"infer_table_cap", 800000)
self.proto_desc.graph_config.excluded_train_pair = config.get(
"excluded_train_pair", "")
self.proto_desc.graph_config.infer_node_type = config.get(
"infer_node_type", "")
self.dataset.set_gpu_graph_mode(True)

def set_pass_id(self, pass_id):
Expand Down

0 comments on commit 20852e4

Please sign in to comment.