From af244d715740a2c8bd51cf78d72902b5f3ffab1f Mon Sep 17 00:00:00 2001 From: Qiumin Xu Date: Mon, 9 Apr 2018 11:47:14 -0700 Subject: [PATCH 1/6] Add per host profile for viewing pod scale traces --- .../profile/input_pipeline_analyzer/BUILD | 2 +- .../input-pipeline-analyzer.html | 23 +- .../profile/overview_page/overview-page.html | 21 +- tensorboard/plugins/profile/profile_plugin.py | 97 +++++-- .../profile/tf_op_profile/tf-op-profile.html | 22 +- .../tf-profile-dashboard.html | 250 ++++++++++++------ 6 files changed, 256 insertions(+), 159 deletions(-) diff --git a/tensorboard/plugins/profile/input_pipeline_analyzer/BUILD b/tensorboard/plugins/profile/input_pipeline_analyzer/BUILD index 7e72c5f2139..4687bd5efc4 100644 --- a/tensorboard/plugins/profile/input_pipeline_analyzer/BUILD +++ b/tensorboard/plugins/profile/input_pipeline_analyzer/BUILD @@ -10,7 +10,7 @@ tf_web_library( path = "/tf-input-pipeline", visibility = ["//visibility:public"], deps = [ - "//tensorboard/components/tf_imports:plottable", "//tensorboard/components/vz_line_chart", + "@org_polymer_paper_button", ], ) diff --git a/tensorboard/plugins/profile/input_pipeline_analyzer/input-pipeline-analyzer.html b/tensorboard/plugins/profile/input_pipeline_analyzer/input-pipeline-analyzer.html index a55d57408a4..90bc029a425 100644 --- a/tensorboard/plugins/profile/input_pipeline_analyzer/input-pipeline-analyzer.html +++ b/tensorboard/plugins/profile/input_pipeline_analyzer/input-pipeline-analyzer.html @@ -208,15 +208,6 @@ Polymer({ is: 'input-pipeline-analyzer', properties: { - _requestManager: { - type: Object, - readOnly: true, - value: () => new tf_backend.RequestManager(), - }, - run: { - type: String, - observer: '_reloadToolData', - }, _data: { type: Object, observer: '_updateView', @@ -249,25 +240,13 @@ onClick: function(e) { this.set('_show_host_side_table', !this._show_host_side_table); }, - _reloadToolData: function(run) { - if (!run) return; - this._requestManager.request(tf_backend.addParams( - tf_backend.getRouter().pluginRoute('profile', '/data'), - {tag: 'input_pipeline_analyzer', run}) - ).catch(error => { - console.error(error); - }).then((data) => { - if (data) { - this.set('_data', data); - } - }); - }, _getToggleButtonText: function(show_host_side_table) { return (show_host_side_table ? 'Hide' : 'Show') + ' Input Op Statistics'; }, /* Update view according to new data */ _updateView: function() { + if (this._data == null) return; var deviceJson = this._data[0]; var hostJson = this._data[1]; var recommendationJson = this._data[2]; diff --git a/tensorboard/plugins/profile/overview_page/overview-page.html b/tensorboard/plugins/profile/overview_page/overview-page.html index ea4639f267c..911aeb02f15 100644 --- a/tensorboard/plugins/profile/overview_page/overview-page.html +++ b/tensorboard/plugins/profile/overview_page/overview-page.html @@ -162,15 +162,6 @@ Polymer({ is: 'overview-page', properties: { - _requestManager: { - type: Object, - readOnly: true, - value: () => new tf_backend.RequestManager(), - }, - run: { - type: String, - observer: '_reloadToolData', - }, _data: { type: Object, observer: '_updateView', @@ -206,17 +197,6 @@ _build_target: String, _statement: String, - _reloadToolData: function(run) { - this._requestManager.request(tf_backend.addParams( - tf_backend.getRouter().pluginRoute('profile', '/data'), - {tag: 'overview_page', run}) - ).then((data) => { - if (data) { - this.set('_data', data); - } - }); - }, - /* Toggles _show_top_ops_table */ onClickTopOps: function(e) { this.set('_show_top_ops_table', !this._show_top_ops_table); @@ -229,6 +209,7 @@ /* Updates view according to new data */ _updateView: function() { + if (this._data == null) return; var generalAnalysisJson = this._data[0]; var inputAnalysisJson = this._data[1]; var runEnvironmentJson = this._data[2]; diff --git a/tensorboard/plugins/profile/profile_plugin.py b/tensorboard/plugins/profile/profile_plugin.py index 172de8c6484..da1b46a6ca5 100644 --- a/tensorboard/plugins/profile/profile_plugin.py +++ b/tensorboard/plugins/profile/profile_plugin.py @@ -38,6 +38,7 @@ LOGDIR_ROUTE = '/logdir' DATA_ROUTE = '/data' TOOLS_ROUTE = '/tools' +HOSTS_ROUTE = '/hosts' # Available profiling tools -> file name of the tool data. _FILE_NAME = 'TOOL_FILE_NAME' @@ -45,7 +46,7 @@ 'trace_viewer': 'trace', 'op_profile': 'op_profile.json', 'input_pipeline_analyzer': 'input_pipeline.json', - 'overview_page': 'overview_page.json', + 'overview_page': 'overview_page.json' } # Tools that consume raw data. @@ -92,13 +93,19 @@ def index_impl(self): In the plugin log directory, each directory contains profile data for a single run (identified by the directory name), and files in the run directory contains data for different tools. The file that contains profile - for a specific tool "x" will have a fixed name TOOLS["x"]. + for a specific tool "x" will have a suffix name TOOLS["x"]. Example: log/ run1/ - trace + plugins/ + profile/ + host1.trace + host2.trace run2/ - trace + plugins/ + profile/ + host1.trace + host2.trace Returns: A map from runs to tool names e.g. @@ -110,11 +117,11 @@ def index_impl(self): # run1/ # plugins/ # profile/ - # trace + # host1.trace # run2/ # plugins/ # profile/ - # trace + # host2.trace run_to_tools = {} if not tf.gfile.IsDirectory(self.plugin_logdir): return run_to_tools @@ -124,9 +131,15 @@ def index_impl(self): continue run_to_tools[run] = [] for tool in TOOLS: - tool_filename = TOOLS[tool] - if tf.gfile.Exists(os.path.join(run_dir, tool_filename)): - run_to_tools[run].append(tool) + tool_pattern = '*' + TOOLS[tool] + path = os.path.join(run_dir, tool_pattern) + try: + files = tf.gfile.Glob(path); + if len(files) >= 1: + run_to_tools[run].append(tool) + except tf.errors.OpError: + logging.warning("Cannot read asset directory: %s, OpError %s", + run_dir, e) return run_to_tools @wrappers.Request.application @@ -134,21 +147,72 @@ def tools_route(self, request): run_to_tools = self.index_impl() return http_util.Respond(request, run_to_tools, 'application/json') - def data_impl(self, run, tool): - """Retrieves and processes the tool data for a run. + def host_impl(self, run, tool): + """Returns available hosts for the run and tool in the log directory. + + In the plugin log directory, each directory contains profile data for a + single run (identified by the directory name), and files in the run + directory contains data for different tools and hosts. The file that + contains profile for a specific tool "x" will have a prefix name TOOLS["x"]. + + Example: + log/ + run1/ + plugins/ + profile/ + host1.trace + host2.trace + run2/ + plugins/ + profile/ + host1.trace + host2.trace + + Returns: + A list of host names e.g. + {"host1", "host2", "host3"} for the example. + """ + tool_to_hosts = {} + if not tf.gfile.IsDirectory(self.plugin_logdir): + return tool_to_hosts + run_dir = self._run_dir(run) + if not run_dir: + logging.warning("Cannot find asset directory: %s", run_dir) + return; + tool_pattern = '*' + TOOLS[tool] + try: + files = tf.gfile.Glob(os.path.join(run_dir,tool_pattern)) + tool_to_hosts = [os.path.basename(f).replace(TOOLS[tool],'') for f in files] + except tf.errors.OpError: + logging.warning("Cannot read asset directory: %s, OpError %s", + run_dir, e) + return tool_to_hosts + + + @wrappers.Request.application + def hosts_route(self, request): + run = request.args.get('run') + tool = request.args.get('tag') + tool_to_hosts = self.host_impl(run, tool) + return http_util.Respond(request, tool_to_hosts, 'application/json') + + def data_impl(self, run, tool, host): + """Retrieves and processes the tool data for a run and a host. Args: run: Name of the run. tool: Name of the tool. + host: Name of the host. Returns: - A string that can be served to the frontend tool or None if tool or - run is invalid. + A string that can be served to the frontend tool or None if tool, + run or host is invalid. """ # Path relative to the path of plugin directory. if tool not in TOOLS: return None - rel_data_path = os.path.join(run, TOOLS[tool]) + tool_name = str(host) + TOOLS[tool] + rel_data_path = os.path.join(run, tool_name) asset_path = os.path.join(self.plugin_logdir, rel_data_path) raw_data = None try: @@ -173,9 +237,11 @@ def data_route(self, request): # run: The run name. # tag: The tool name e.g. trace_viewer. The plugin returns different UI # data for different tools of the same run. + # host: The host name. run = request.args.get('run') tool = request.args.get('tag') - data = self.data_impl(run, tool) + host = request.args.get('host') + data = self.data_impl(run, tool, host) if data is None: return http_util.Respond(request, '404 Not Found', 'text/plain', code=404) return http_util.Respond(request, data, 'text/plain') @@ -184,6 +250,7 @@ def get_plugin_apps(self): return { LOGDIR_ROUTE: self.logdir_route, TOOLS_ROUTE: self.tools_route, + HOSTS_ROUTE: self.hosts_route, DATA_ROUTE: self.data_route, } diff --git a/tensorboard/plugins/profile/tf_op_profile/tf-op-profile.html b/tensorboard/plugins/profile/tf_op_profile/tf-op-profile.html index 5a0838d103c..d4ce4ab8571 100644 --- a/tensorboard/plugins/profile/tf_op_profile/tf-op-profile.html +++ b/tensorboard/plugins/profile/tf_op_profile/tf-op-profile.html @@ -34,7 +34,7 @@ tf-op-details { position: fixed; /* don't set top, so it ends up next to tf-op-table */ - padding-top: 6.5em; + margin-top: 10em; left: 16px; width: 330px; } @@ -56,7 +56,7 @@

Overall TPU FLOPS utilization is

Modifying your model's architecture, data dimensions, and improving the efficiency of CPU operations may help reach the TPU's FLOPS potential.

- + @@ -65,15 +65,6 @@

Overall TPU FLOPS utilization is Polymer({ is: 'tf-op-profile', properties: { - _requestManager: { - type: Object, - readOnly: true, - value: () => new tf_backend.RequestManager(), - }, - run: { - type: String, - observer: '_load' - }, _data: { type: Object, notify: true, @@ -89,15 +80,6 @@

Overall TPU FLOPS utilization is notify: true, }, }, - _load: function(run) { - if (!run) return; - this._requestManager.request(tf_backend.addParams( - tf_backend.getRouter().pluginRoute('profile', '/data'), {tag: 'op_profile', run}) - ).catch(error => {} - ).then((data) => { - this._data = data; - }); - }, _getRoot: function(data, breakdown) { return data[breakdown]; }, _utilizationPercent: function(node) { return tf_op_profile.percent(tf_op_profile.utilization(node)); }, _hasFlops: function(node) { return node.metrics.flops > 0; }, diff --git a/tensorboard/plugins/profile/tf_profile_dashboard/tf-profile-dashboard.html b/tensorboard/plugins/profile/tf_profile_dashboard/tf-profile-dashboard.html index 591e3af4f82..8cfcddc3809 100644 --- a/tensorboard/plugins/profile/tf_profile_dashboard/tf-profile-dashboard.html +++ b/tensorboard/plugins/profile/tf_profile_dashboard/tf-profile-dashboard.html @@ -15,6 +15,7 @@ limitations under the License. --> + @@ -69,31 +70,41 @@

No profile data was found.