Skip to content

Commit

Permalink
Merge pull request #21 from songbell/bell/merge_with_remote_tensor_WIP
Browse files Browse the repository at this point in the history
Bell/merge with remote tensor wip
  • Loading branch information
WeldonWangwang authored Aug 29, 2024
2 parents 1988738 + 38cded5 commit c4a0b80
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 15 deletions.
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,8 @@ struct network {
std::map<std::string, std::vector<int64_t>> tp_host_times;
#ifdef GPU_DEBUG_CONFIG
int64_t iteration = 0;
int all_reduce_num_per_iter = 0;
int all_gather_num_per_iter = 0;
#endif
};
} // namespace cldnn
60 changes: 45 additions & 15 deletions src/plugins/intel_gpu/src/graph/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,19 +443,35 @@ network::network(program::ptr program, stream::ptr stream, uint16_t stream_id, o

network::~network() {
GPU_DEBUG_IF(debug_configuration::get_instance()->host_time_profiling) {
if (tp_host_times["sync_tensor"].size() >= 2) {
double first = static_cast<double>(tp_host_times["sync_tensor"][0]);
if (tp_host_times["sync_tensor_all_reduce"].size() >= 2) {
double first = static_cast<double>(tp_host_times["sync_tensor_all_reduce"][0]);
double avg = static_cast<double>(
std::accumulate(tp_host_times["sync_tensor"].begin() + 1, tp_host_times["sync_tensor"].end(), (size_t)0, std::plus<size_t>()));
avg /= (tp_host_times["sync_tensor"].size() - 1);
std::accumulate(
tp_host_times["sync_tensor_all_reduce"].begin() + 1, tp_host_times["sync_tensor_all_reduce"].end(), (size_t)0, std::plus<size_t>()));
avg /= (tp_host_times["sync_tensor_all_reduce"].size() - 1);
std::string resolution = " us";
if (avg > 1000.0) {
resolution = " ms";
avg /= 1000.0;
first /= 1000.0;
}
GPU_DEBUG_COUT << "Network[" << net_id << "] First infer total sync tensor host time: " << first << resolution << std::endl;
GPU_DEBUG_COUT << "Network[" << net_id << "] total sync tensor avg host time: " << avg << resolution << std::endl;
GPU_DEBUG_COUT << "Network[" << net_id << "] First infer total sync tensor all reduce host time: " << first << resolution << std::endl;
GPU_DEBUG_COUT << "Network[" << net_id << "] total sync tensor all reduce avg host time: " << avg << resolution << std::endl;
}
if (tp_host_times["sync_tensor_all_gather"].size() >= 2) {
double first = static_cast<double>(tp_host_times["sync_tensor_all_gather"][0]);
double avg = static_cast<double>(
std::accumulate(
tp_host_times["sync_tensor_all_gather"].begin() + 1, tp_host_times["sync_tensor_all_gather"].end(), (size_t)0, std::plus<size_t>()));
avg /= (tp_host_times["sync_tensor_all_gather"].size() - 1);
std::string resolution = " us";
if (avg > 1000.0) {
resolution = " ms";
avg /= 1000.0;
first /= 1000.0;
}
GPU_DEBUG_COUT << "Network[" << net_id << "] First infer total sync tensor all gather host time: " << first << resolution << std::endl;
GPU_DEBUG_COUT << "Network[" << net_id << "] total sync tensor all gather avg host time: " << avg << resolution << std::endl;
}
if (tp_host_times["concat"].size() >= 2) {
double first = static_cast<double>(tp_host_times["concat"][0]);
Expand All @@ -472,6 +488,10 @@ network::~network() {
GPU_DEBUG_COUT << "Network[" << net_id << "] total concat avg host time: " << avg << resolution << std::endl;
}
}
#ifdef GPU_DEBUG_CONFIG
GPU_DEBUG_COUT << "all reduce operations per infer: " << all_reduce_num_per_iter << std::endl;
GPU_DEBUG_COUT << "all gather operations per infer: " << all_gather_num_per_iter << std::endl;
#endif
if (_program != nullptr)
_program->cancel_compilation_context();
_memory_pool->clear_pool_for_network(net_id);
Expand Down Expand Up @@ -967,7 +987,6 @@ void network::add_to_exec_order(const primitive_id& id) {

std::map<primitive_id, network_output> network::execute(const std::vector<event::ptr>& dependencies) {
execute_impl(dependencies);

auto output_ids = get_output_ids();
std::map<primitive_id, network_output> result;
for (auto& id : output_ids) {
Expand Down Expand Up @@ -1066,9 +1085,8 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
const size_t flush_frequency = needs_flushing ? 16 : 0;
size_t executed_prims = 0;
std::map<std::string, std::vector<int64_t>> tp_host_times_each_iter;
tp_host_times["sync_tensor"];
tp_host_times["concat"];
tp_host_times_each_iter["sync_tensor"];
tp_host_times_each_iter["sync_tensor_all_reduce"];
tp_host_times_each_iter["sync_tensor_all_gather"];
tp_host_times_each_iter["concat"];
for (auto& inst : _exec_order) {
auto start = std::chrono::high_resolution_clock::now();
Expand Down Expand Up @@ -1244,7 +1262,10 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
auto end = std::chrono::high_resolution_clock::now();
GPU_DEBUG_IF(debug_configuration::get_instance()->host_time_profiling) {
if (inst->get_node().is_type<sync_tensor>()) {
tp_host_times_each_iter["sync_tensor"].push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - start).count());
if (inst->get_impl_params()->need_add)
tp_host_times_each_iter["sync_tensor_all_reduce"].push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - start).count());
else
tp_host_times_each_iter["sync_tensor_all_gather"].push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - start).count());
} else {
tp_host_times_each_iter["concat"].push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - start).count());
}
Expand All @@ -1253,17 +1274,26 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
}
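// Per-iteration statistics: when host-time profiling is enabled, sum the host times
// collected during this iteration for each op type and append the total to tp_host_times.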
GPU_DEBUG_IF(debug_configuration::get_instance()->host_time_profiling) {
if (tp_host_times_each_iter["sync_tensor"].size() >= 1) {
const auto begin = std::begin(tp_host_times_each_iter["sync_tensor"]);
const auto end = std::end(tp_host_times_each_iter["sync_tensor"]);
tp_host_times["sync_tensor"].push_back(std::accumulate(begin, end, (size_t)0, std::plus<size_t>()));
if (tp_host_times_each_iter["sync_tensor_all_reduce"].size() >= 1) {
const auto begin = std::begin(tp_host_times_each_iter["sync_tensor_all_reduce"]);
const auto end = std::end(tp_host_times_each_iter["sync_tensor_all_reduce"]);
tp_host_times["sync_tensor_all_reduce"].push_back(std::accumulate(begin, end, (size_t)0, std::plus<size_t>()));
}
if (tp_host_times_each_iter["sync_tensor_all_gather"].size() >= 1) {
const auto begin = std::begin(tp_host_times_each_iter["sync_tensor_all_gather"]);
const auto end = std::end(tp_host_times_each_iter["sync_tensor_all_gather"]);
tp_host_times["sync_tensor_all_gather"].push_back(std::accumulate(begin, end, (size_t)0, std::plus<size_t>()));
}
if (tp_host_times_each_iter["concat"].size() >= 1) {
const auto begin = std::begin(tp_host_times_each_iter["concat"]);
const auto end = std::end(tp_host_times_each_iter["concat"]);
tp_host_times["concat"].push_back(std::accumulate(begin, end, (size_t)0, std::plus<size_t>()));
}
}
#ifdef GPU_DEBUG_CONFIG
all_reduce_num_per_iter = tp_host_times_each_iter["sync_tensor_all_reduce"].size();
all_gather_num_per_iter = tp_host_times_each_iter["sync_tensor_all_gather"].size();
#endif
// print '-data_shape' option for benchmark_app
GPU_DEBUG_IF(debug_config->print_input_data_shapes == 1) {
std::stringstream data_shape_str;
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::hint::num_requests, 0),
std::make_tuple(ov::hint::enable_cpu_pinning, false),
std::make_tuple(ov::hint::dynamic_quantization_group_size, 0),
std::make_tuple(ov::device::priorities, ""), // temporarily used for TP

std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM),
std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM),
Expand Down

0 comments on commit c4a0b80

Please sign in to comment.