diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 0163f0c2e49e..83c26933be45 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -25,6 +25,7 @@
 #define TVM_RUNTIME_PROFILING_H_

 #include
+#include
 #include
 #include
 #include
@@ -192,6 +193,11 @@ class ReportNode : public Object {
    * because these metrics include the overhead of the executor.
    */
   Map<String, Map<String, ObjectRef>> device_metrics;
+  /*! Configuration used for this profiling run. Includes the number of threads and the executor.
+   *
+   * Values must be an object type that can be used with device_metrics.
+   */
+  Map<String, ObjectRef> configuration;
   /*! \brief Output `calls` in CSV format.
    *
    * Note that this does not include `device_metrics`, it only includes per-call metrics.
@@ -255,9 +261,11 @@ class Report : public ObjectRef {
   /*! Construct a Report from a set of calls (with associated metrics) and per-device metrics.
    * \param calls Function calls and associated metrics.
    * \param device_metrics Per-device metrics for overall execution.
+   * \param configuration Configuration data specific to this profiling run.
    */
   explicit Report(Array<Map<String, ObjectRef>> calls,
-                  Map<String, Map<String, ObjectRef>> device_metrics);
+                  Map<String, Map<String, ObjectRef>> device_metrics,
+                  Map<String, ObjectRef> configuration);

   /*! Deserialize a Report from a JSON object. Needed for sending the report over RPC.
    * \param json Serialized json report from `ReportNode::AsJSON`.
@@ -366,8 +374,10 @@
    * \param devs The list of devices the profiler will be running on. Should
    * include all devices used by profiled operators.
    * \param metric_collectors Additional `MetricCollector`s to use with this profiler.
+   * \param configuration Additional configuration data to add to the output profiling report.
    */
-  explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors);
+  explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors,
+                    std::unordered_map<String, ObjectRef> configuration = {});
   /*! \brief Start the profiler.
    *
    * This function should only be called once per object.
@@ -400,7 +410,7 @@
    * \returns A `Report` that can either be formatted as CSV (with `.AsCSV`)
    * or as a human readable table (with `.AsTable`).
    */
-  profiling::Report Report(bool aggregate = true, bool sort = true);
+  profiling::Report Report();
   /*! \brief Check if the profiler is currently running.
    * \returns Whether or not the profiler is running.
    */
@@ -412,6 +422,7 @@
   std::vector calls_;
   std::stack in_flight_;
   std::vector<MetricCollector> collectors_;
+  std::unordered_map<String, ObjectRef> configuration_;
 };

 /* \brief A duration in time. */
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 573779037827..347d8b9f94f1 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -36,7 +36,10 @@ class Report(Object):
     """

     def __init__(
-        self, calls: Sequence[Dict[str, Object]], device_metrics: Dict[str, Dict[str, Object]]
+        self,
+        calls: Sequence[Dict[str, Object]],
+        device_metrics: Dict[str, Dict[str, Object]],
+        configuration: Dict[str, Object],
     ):
         """Construct a profiling report from a list of metrics and per-device metrics.

@@ -47,8 +50,12 @@ def __init__(

         device_metrics : Dict[str, Dict[str, Object]]
             Per device metrics.
+
+        configuration : Dict[str, Object]
+            Configuration of TVM for this profiling run. Includes the number
+            of threads and the executor used.
""" - self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics) + self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics, configuration) def csv(self): """Convert this profiling report into CSV format. diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py index 6d1ac753e27e..adb8bf5cc97c 100644 --- a/python/tvm/utils/roofline.py +++ b/python/tvm/utils/roofline.py @@ -400,7 +400,10 @@ def roofline_from_existing( new_calls.append(call) else: new_calls.append(call) - return profiling.Report(new_calls, report.device_metrics) + new_configuration = dict(report.configuration.items()) + new_configuration["Estimated Peak FMA FLOP/s"] = profiling.Ratio(peak_flops) + new_configuration["Estimated Peak Bandwidth (byte/second)"] = profiling.Ratio(peak_bandwidth) + return profiling.Report(new_calls, report.device_metrics, new_configuration) def roofline_analysis( diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index e97e5f41bfc2..23811e219078 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -521,6 +521,7 @@ struct ReportNodeTrait { static void VisitAttrs(runtime::profiling::ReportNode* report, AttrVisitor* attrs) { attrs->Visit("calls", &report->calls); attrs->Visit("device_metrics", &report->device_metrics); + attrs->Visit("configuration", &report->configuration); } static constexpr std::nullptr_t SEqualReduce = nullptr; static constexpr std::nullptr_t SHashReduce = nullptr; diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index bd3b0db0403f..4a950153954f 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -294,7 +294,7 @@ class GraphExecutorDebug : public GraphExecutor { */ profiling::Report Profile(Array collectors) { std::vector cs(collectors.begin(), collectors.end()); - profiling::Profiler prof(devices_, cs); + profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}}); // warm up. 1 iteration does not seem enough. for (int i = 0; i < 3; i++) { diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 9499a6e7a5bb..9f95bf18f74b 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -105,8 +105,9 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start); namespace profiling { -Profiler::Profiler(std::vector devs, std::vector metric_collectors) - : devs_(devs), collectors_(metric_collectors) { +Profiler::Profiler(std::vector devs, std::vector metric_collectors, + std::unordered_map configuration) + : devs_(devs), collectors_(metric_collectors), configuration_(configuration) { is_running_ = false; std::vector wrapped_devs; for (auto dev : devs) { @@ -117,6 +118,9 @@ Profiler::Profiler(std::vector devs, std::vector metric } // reset the thread pool so that PAPI eventset hooks are set in all threads. threading::ResetThreadPool(); + + configuration_[String("Number of threads")] = + ObjectRef(make_object(threading::NumThreads())); } void Profiler::Start() { @@ -279,7 +283,7 @@ String ReportNode::AsCSV() const { } namespace { -void print_metric(std::ostream& os, ObjectRef o) { +void metric_as_json(std::ostream& os, ObjectRef o) { if (o.as()) { os << "{\"string\":" << "\"" << Downcast(o) << "\"" @@ -309,13 +313,14 @@ String ReportNode::AsJSON() const { // value we want to print. Instead we construct the json by hand because it // is easier. 
s << "{"; + s << "\"calls\":["; for (size_t i = 0; i < calls.size(); i++) { size_t j = 0; s << "{"; for (const auto& kv : calls[i]) { s << "\"" << kv.first << "\":"; - print_metric(s, kv.second); + metric_as_json(s, kv.second); if (j < calls[i].size() - 1) { s << ","; } @@ -326,7 +331,8 @@ String ReportNode::AsJSON() const { s << ","; } } - s << "],"; + s << "],"; // end calls + s << "\"device_metrics\":{"; size_t i = 0; for (const auto& dev_kv : device_metrics) { @@ -334,7 +340,7 @@ String ReportNode::AsJSON() const { s << "\"" << dev_kv.first << "\":{"; for (const auto& metric_kv : dev_kv.second) { s << "\"" << metric_kv.first << "\":"; - print_metric(s, metric_kv.second); + metric_as_json(s, metric_kv.second); if (j < dev_kv.second.size() - 1) { s << ","; } @@ -346,7 +352,20 @@ String ReportNode::AsJSON() const { } i++; } - s << "}}"; + s << "},"; // end device metrics + + s << "\"configuration\":{"; + size_t k = 0; + for (const auto& kv : configuration) { + s << "\"" << kv.first << "\":"; + metric_as_json(s, kv.second); + if (k < configuration.size() - 1) { + s << ","; + } + k++; + } + s << "}"; // end configuration + s << "}"; return s.str(); } @@ -395,6 +414,35 @@ ObjectRef AggregateMetric(const std::vector& metrics) { } } +static String print_metric(ObjectRef metric) { + std::string val; + if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::fixed << metric.as()->value; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::fixed << std::setprecision(2) << metric.as()->microseconds; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s << std::fixed << std::setprecision(2) << metric.as()->percent; + val = s.str(); + } else if (metric.as()) { + std::stringstream s; + s.imbue(std::locale("")); // for 1000s seperators + s << std::setprecision(2) << metric.as()->ratio; + val = s.str(); + } else if (metric.as()) { + val = Downcast(metric); + } else { + LOG(FATAL) << "Cannot print metric of type " << metric->GetTypeKey(); + } + return val; +} + String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) const { // aggregate calls by op hash (or op name if hash is not set) + argument shapes std::vector> aggregated_calls; @@ -533,30 +581,7 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con // fill empty data with empty strings cols[i].push_back(""); } else { - std::string val; - if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::fixed << (*it).second.as()->value; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::fixed << std::setprecision(2) << (*it).second.as()->microseconds; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s << std::fixed << std::setprecision(2) << (*it).second.as()->percent; - val = s.str(); - } else if ((*it).second.as()) { - std::stringstream s; - s.imbue(std::locale("")); // for 1000s seperators - s << std::setprecision(2) << (*it).second.as()->ratio; - val = s.str(); - } else if ((*it).second.as()) { - val = Downcast((*it).second); - } - cols[i].push_back(val); + cols[i].push_back(print_metric((*it).second)); } } } @@ -592,6 +617,12 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con } s << std::endl; } + + // Add configuration information. 
+  s << std::endl << "Configuration" << std::endl << "-------------" << std::endl;
+  for (auto kv : configuration) {
+    s << kv.first << ": " << print_metric(kv.second) << std::endl;
+  }
   return s.str();
 }

@@ -599,7 +630,7 @@ std::string DeviceString(Device dev) {
   return DeviceName(dev.device_type) + std::to_string(dev.device_id);
 }

-Report Profiler::Report(bool aggregate, bool sort) {
+Report Profiler::Report() {
   // sync all timers and normalize rows
   std::vector> rows;
   for (auto& cf : calls_) {
@@ -638,14 +669,16 @@
     converted_rows.push_back(row);
   }

-  return profiling::Report(converted_rows, device_metrics);
+  return profiling::Report(converted_rows, device_metrics, configuration_);
 }

 Report::Report(Array<Map<String, ObjectRef>> calls,
-               Map<String, Map<String, ObjectRef>> device_metrics) {
+               Map<String, Map<String, ObjectRef>> device_metrics,
+               Map<String, ObjectRef> configuration) {
   auto node = make_object<ReportNode>();
   node->calls = std::move(calls);
   node->device_metrics = std::move(device_metrics);
+  node->configuration = std::move(configuration);
   data_ = std::move(node);
 }

@@ -697,6 +730,7 @@ Report Report::FromJSON(String json) {
   std::string key;
   Array<Map<String, ObjectRef>> calls;
   Map<String, Map<String, ObjectRef>> device_metrics;
+  Map<String, ObjectRef> configuration;

   reader.BeginObject();
   while (reader.NextObjectItem(&key)) {
@@ -713,10 +747,12 @@
         device_metrics.Set(device_name, parse_metrics(&reader));
       }
       // reader.EndObject();
+    } else if (key == "configuration") {
+      configuration = parse_metrics(&reader);
     }
   }

-  return Report(calls, device_metrics);
+  return Report(calls, device_metrics, configuration);
 }

 TVM_REGISTER_OBJECT_TYPE(DurationNode);
@@ -855,8 +891,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,

 TVM_REGISTER_GLOBAL("runtime.profiling.Report")
     .set_body_typed([](Array<Map<String, ObjectRef>> calls,
-                       Map<String, Map<String, ObjectRef>> device_metrics) {
-      return Report(calls, device_metrics);
+                       Map<String, Map<String, ObjectRef>> device_metrics,
+                       Map<String, ObjectRef> configuration) {
+      return Report(calls, device_metrics, configuration);
     });

 TVM_REGISTER_GLOBAL("runtime.profiling.Count").set_body_typed([](int64_t count) {
diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc
index 393d1b399878..0ace910b5c53 100644
--- a/src/runtime/vm/profiler/vm.cc
+++ b/src/runtime/vm/profiler/vm.cc
@@ -58,9 +58,9 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
       // on remotes, we accept a nullptr for collectors.
       if (collectors.defined()) {
         std::vector<MetricCollector> cs(collectors.begin(), collectors.end());
-        prof_ = profiling::Profiler(devices, cs);
+        prof_ = profiling::Profiler(devices, cs, {{String("Executor"), String("VM")}});
       } else {
-        prof_ = profiling::Profiler(devices, {});
+        prof_ = profiling::Profiler(devices, {}, {{String("Executor"), String("VM")}});
       }

       auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self);
@@ -77,7 +77,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
         return report;
       });
     } else if (name == "profile_rpc") {
-      // We cannot return a Report over RPC because TMV RPC mechanism only
+      // We cannot return a Report over RPC because TVM RPC mechanism only
      // supports a subset of Object classes. Instead we serialize it on the
      // remote (here) and deserialize it on the other end.
       return TypedPackedFunc([sptr_to_self, this](std::string arg_name) {
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index 919057f08d27..a701be7c3c9b 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -69,6 +69,7 @@ def test_vm(target, dev):
     assert "Total" in str(report)
     assert "AllocTensorReg" in str(report)
     assert "AllocStorage" in str(report)
+    assert report.configuration["Executor"] == "VM"

     csv = read_csv(report)
     assert "Hash" in csv.keys()
@@ -102,6 +103,7 @@ def test_graph_executor(target, dev):
     assert "fused_nn_softmax" in str(report)
     assert "Total" in str(report)
     assert "Hash" in str(report)
+    assert "Graph" in str(report)


 @tvm.testing.parametrize_targets("cuda", "llvm")
@@ -147,6 +149,7 @@ def test_json():
     parsed = json.loads(report.json())
     assert "device_metrics" in parsed
     assert "calls" in parsed
+    assert "configuration" in parsed
     assert "Duration (us)" in parsed["calls"][0]
     assert "microseconds" in parsed["calls"][0]["Duration (us)"]
     assert len(parsed["calls"]) > 0
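
Usage sketch (reviewer note, not part of the patch): the snippet below mirrors the updated tests to show how the new `configuration` field is expected to surface from Python. `mod`, `params`, and `data` are placeholder names for an already-built Relay module, its parameters, and an input tensor; the "Executor" and "Number of threads" keys are the ones added by this change.

    # Illustrative only; `mod`, `params`, and `data` are placeholders.
    import json

    import tvm
    from tvm import relay
    from tvm.runtime import profiler_vm

    exe = relay.vm.compile(mod, target="llvm", params=params)
    vm = profiler_vm.VirtualMachineProfiler(exe, tvm.cpu())
    report = vm.profile(data, func_name="main")

    # Keys added by this change: the executor kind and the thread count.
    assert report.configuration["Executor"] == "VM"
    print(report.configuration["Number of threads"])

    # The configuration block also round-trips through ReportNode::AsJSON/FromJSON.
    assert "configuration" in json.loads(report.json())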