diff --git a/.github/.wordlist.txt b/.github/.wordlist.txt index 9ac4419af442c9..0a930013cb9509 100644 --- a/.github/.wordlist.txt +++ b/.github/.wordlist.txt @@ -455,6 +455,9 @@ GetDeviceInfo GetDns GetIP getstarted +GH +gh +ghp githubusercontent gitignore glibc @@ -848,6 +851,7 @@ PyEval PyFunction pylint PyObject +pypi PyRun pytest QEMU @@ -958,6 +962,7 @@ SiLabs SiliconLabs SimpleFileExFlags SimpleLink +sizedb sl SLAAC SLTB @@ -1041,6 +1046,7 @@ testws texinfo textboxes TFT +ThIsIsNoTMyReAlGiThUbToKeNSoDoNoTtRy threadOperationalDataset ThreadStackManager ThreadStackManagerImpl @@ -1052,6 +1058,7 @@ TLV tmp tngvndl TODO +toJson tokenized toolchain toolchains diff --git a/scripts/tools/memory/.pylintrc b/scripts/tools/memory/.pylintrc index 6b77df256b40d1..413c27f80e068c 100644 --- a/scripts/tools/memory/.pylintrc +++ b/scripts/tools/memory/.pylintrc @@ -1,7 +1,7 @@ [BASIC] -disable=too-few-public-methods,bad-whitespace +disable=too-few-public-methods,bad-whitespace,broad-except -no-docstring-rgx=main +no-docstring-rgx=main|__init__ docstring-min-length=5 min-public-methods=1 max-args=7 diff --git a/scripts/tools/memory/README-GitHub-CI.md b/scripts/tools/memory/README-GitHub-CI.md new file mode 100644 index 00000000000000..d2171e26eb9050 --- /dev/null +++ b/scripts/tools/memory/README-GitHub-CI.md @@ -0,0 +1,180 @@ +# Scripts for GitHub CI + +A set of `gh_*.py` scripts work together to produce size comparisons for PRs. + +## Reports on Pull Requests + +The scripts' results are presented as comments on PRs. + +**Note** that a comment may be updated by the scripts as CI run results become +available. + +**Note** that the scripts will not create a comment for a commit if there is +already a newer commit in the PR. + +A size report comment consists of a title followed by one to four tables. A +title looks like: + +> PR #12345678: Size comparison from `base-SHA` to `pr-SHA` + +The first table, if present, lists items with a large increase, according to a +configurable threshold. + +The next table, if present, lists all items that have increased in size. + +The next table, if present, lists all items that have decreased in size. + +The final table, always present, lists all items. + +## Usage in CI + +The original intent was to have a tool that would run after a build in CI, add +its sizes to a central database, and immediately report on size changes from the +parent commit in the database. Unfortunately, GitHub provides no practical place +to store and share such a database between workflow actions. Instead, the +process is split; builds in CI record size information in the form of GitHub +[artifacts](https://docs.github.com/en/actions/advanced-guides/storing-workflow-data-as-artifacts), +and a later step reads these artifacts to generate reports. + +### 1. Build workflows + +#### gh_sizes_environment.py + +The `gh_sizes_environment.py` script should be run once in each workflow that +records sizes, _after_ checkout and _before_ any use of `gh_sizes.py` It takes a +single argument, a JSON dictionary of the `github` context. Typically run as: + +``` + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: true + + - name: Set up environment for size reports + if: ${{ !env.ACT }} + env: + GH_CONTEXT: ${{ toJson(github) }} + run: scripts/tools/memory/gh_sizes_environment.py "${GH_CONTEXT}" +``` + +#### gh_sizes.py + +The `gh_sizes.py` script runs on a built binary (executable or library) and +produces a JSON file containing size information. 
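+
+The exact JSON layout is owned by `gh_sizes.py`; as a rough sketch, the fields
+that `memdf/sizedb.py` later reads back out of such a report look like the
+following (all values here are placeholders, and real reports may carry
+additional frames):
+
+```
+{
+  "platform": "linux", "config": "arm64", "target": "thermostat-no-ble",
+  "time": 1637012345, "event": "push", "pr": 0,
+  "hash": "<commit SHA>", "parent": "<parent commit SHA>",
+  "frames": {
+    "section": [{"section": ".text", "size": 1234}],
+    "wr": [{"wr": 0, "size": 1234}]
+  }
+}
+```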
+ +Usage: `gh_sizes.py` _platform_ _config_ _target_ _binary_ [_output_] + +Where _platform_ is the platform name, corresponding to a config file in +`scripts/tools/memory/platform/`. + +Where _config_ is a configuration identification string. This has no fixed +meaning, but is intended to describe a build variation, e.g. a particular target +board or debug vs release. + +Where _target_ is a readable name for the build artifact, identifying it in +reports. + +Where _binary_ is the input build artifact. + +Where _output_ is the name for the output JSON file, or a directory for it, in +which case the name will be +_platform_`-`_config_name_`-`_target_name_`-sizes.json`. + +Example: + +``` + scripts/tools/memory/gh_sizes.py \ + linux arm64 thermostat-no-ble \ + out/linux-arm64-thermostat-no-ble/thermostat-app \ + /tmp/bloat_reports/ +``` + +#### Upload artifacts + +The JSON files generated by `gh_sizes.py` must be uploaded with an artifact name +of a very specific form in order to be processed correctly. + +Example: + +``` +Size,Linux-Examples,${{ env.GH_EVENT_PR }},${{ env.GH_EVENT_HASH }},${{ env.GH_EVENT_PARENT }},${{ github.event_name }} +``` + +Other builds must replace `Linux-Examples` with a label unique to the workflow, +but otherwise use the form exactly. + +### 2. Reporting workflow + +Run a periodic workflow calling `gh_report.py` to generate PR comments. This +script has full `--help`, but normal use is probably best illustrated by an +example: + +``` + scripts/tools/memory/gh_report.py \ + --verbose \ + --report-increases 0.2 \ + --report-pr \ + --github-comment \ + --github-limit-artifact-pages 50 \ + --github-limit-artifacts 500 \ + --github-limit-comments 20 \ + --github-repository project-chip/connectedhomeip \ + --github-api-token "${{ secrets.GITHUB_TOKEN }}" +``` + +Notably, the `--report-increases` flag provides a _percent growth_ threshold for +calling out ‘large’ increases in GitHub comments. + +When this script successfully posts a comment on a GitHub PR, it removes the +corresponding PR artifact(s) so that a future run will not process it again and +post the same comment. Only PR artifacts are removed, not push (trunk) +artifacts, since those may be used as a comparison base by many different PRs. + +## Using a database + +It can be useful to keep a permanent record of build sizes. + +### Updating the database: `gh_db_load.py` + +To update an SQLite file of trunk commit sizes, periodically run: + +``` + gh_db_load.py \ + --repo project-chip/connectedhomeip \ + --token ghp_ThIsIsNoTMyReAlGiThUbToKeNSoDoNoTtRy \ + --db /path/to/database +``` + +Those interested in only a single platform can add the `--github-label` option, +providing the same name as in the size artifact name after `Size,` (e.g. +`Linux-Examples` in the upload example above). + +See `--help` for additional options. + +_Note_: Transient 4xx and 5xx errors from GitHub's API are very common. Run +`gh_db_load.py` frequently enough to give it several attempts before the +relevant artifacts expire. + +### Querying the database: `gh_db_query.py` + +While the database can of course be used directly, the `gh_db_query.py` script +provides a handful of common queries. + +Note that this script (like others that show tables) has an `--output-format` +option offering (among others) CSV, several JSON formats, and any text format +provided by [tabulate](https://pypi.org/project/tabulate/). 
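+
+For example, a hypothetical invocation listing the known platforms as CSV
+(the database path is a placeholder):
+
+```
+    gh_db_query.py --db /path/to/database --query-platforms --to csv
+```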
+ +Two notable options: + +- `--query-build-sizes PLATFORM,CONFIG,TARGET` lists sizes for all builds of + the given kind, with a column for each section. +- `--query-section-changes PLATFORM,CONFIG,TARGET,SECTION` lists changes for + the given section. The `--report-increases PERCENT` option limits this to + changes over a given threshold (as is done for PR comments). + +(To find out what PLATFORM, CONFIG, TARGET, and SECTION exist: +`--query-platforms`, then `--query-platform-targets=PLATFORM` and +`--query-platform-sections=PLATFORM`.) + +See `--help` for additional options. diff --git a/scripts/tools/memory/README.md b/scripts/tools/memory/README.md index 9ccd4a2560813e..3bde9607ce165c 100644 --- a/scripts/tools/memory/README.md +++ b/scripts/tools/memory/README.md @@ -41,14 +41,15 @@ The following options are common to _most_ of the scripts, where applicable: - `--output-format` _FORMAT_, `--to` _FORMAT_, `-t` _FORMAT_ Output format. One of: - `text` — Plain text tables, in a single file. - - `csv` — Comma-separated tables (in several files). - - `tsv` — Tab-separated tables (in several files). + - `csv` — Comma-separated tables (in several files, if not stdout). + - `tsv` — Tab-separated tables (in several files, if not stdout). - `json_split` — JSON - see Pandas documentation for details. - `json_records` — JSON - see Pandas documentation for details. - `json_index` — JSON - see Pandas documentation for details. - `json_columns` — JSON - see Pandas documentation for details. - `json_values` — JSON - see Pandas documentation for details. - `json_table` — JSON - see Pandas documentation for details. + - Any format provided by [tabulate](https://pypi.org/project/tabulate/). - `--report-limit` _BYTES_, `--limit` _BYTES_ Limit display to items above the given size. Suffixes (e.g. `K`) are accepted. - `--report-by` _GROUP_, `--by` _GROUP_ Reporting group. One of: diff --git a/scripts/tools/memory/gh_db_load.py b/scripts/tools/memory/gh_db_load.py new file mode 100755 index 00000000000000..9cb69bea5c2b7e --- /dev/null +++ b/scripts/tools/memory/gh_db_load.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2021 Project CHIP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Fetch data from GitHub size artifacts.""" + +import io +import logging +import sys + +import memdf.sizedb +import memdf.util.config +import memdf.util.markdown +import memdf.util.sqlite +from memdf.util.github import Gh +from memdf import Config, ConfigDescription + +GITHUB_CONFIG: ConfigDescription = { + Config.group_def('github'): { + 'title': 'github options', + }, + 'github.event': { + 'help': 'Download only event type(s) (default ‘push’)', + 'metavar': 'EVENT', + 'default': [], + 'argparse': { + 'alias': ['--event'] + }, + }, + 'github.limit-artifacts': { + 'help': 'Download no more than COUNT artifacts', + 'metavar': 'COUNT', + 'default': 0, + 'argparse': { + 'type': int, + }, + }, + 'github.label': { + 'help': 'Download artifacts for one label only', + 'metavar': 'LABEL', + 'default': '', + }, +} + + +def main(argv): + status = 0 + try: + sqlite_config = memdf.util.sqlite.CONFIG + sqlite_config['database.file']['argparse']['required'] = True + + config = Config().init({ + **memdf.util.config.CONFIG, + **memdf.util.github.CONFIG, + **sqlite_config, + **GITHUB_CONFIG, + }) + config.argparse.add_argument('inputs', metavar='FILE', nargs='*') + config.parse(argv) + + db = memdf.sizedb.SizeDatabase(config['database.file']).open() + + if gh := Gh(config): + + artifact_limit = config['github.limit-artifacts'] + artifacts_added = 0 + events = config['github.event'] + if not events: + events = ['push'] + for a in gh.get_size_artifacts(label=config['github.label']): + if events and a.event not in events: + logging.debug('Skipping %s artifact %d', a.event, a.id) + continue + cur = db.execute('SELECT id FROM build WHERE artifact = ?', + (a.id,)) + if cur.fetchone(): + logging.debug('Skipping known artifact %d', a.id) + continue + blob = gh.download_artifact(a.id) + if blob: + logging.info('Adding artifact %d %s %s %s %s', + a.id, a.commit[:12], a.pr, a.event, a.group) + db.add_sizes_from_zipfile(io.BytesIO(blob), + {'artifact': a.id}) + db.commit() + artifacts_added += 1 + if artifact_limit and artifact_limit <= artifacts_added: + break + + for filename in config['args.inputs']: + db.add_sizes_from_file(filename) + db.commit() + + except Exception as exception: + raise exception + + return status + + +if __name__ == '__main__': + sys.exit(main(sys.argv)) diff --git a/scripts/tools/memory/gh_db_query.py b/scripts/tools/memory/gh_db_query.py new file mode 100755 index 00000000000000..e42262e567ba7f --- /dev/null +++ b/scripts/tools/memory/gh_db_query.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022 Project CHIP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Common queries on a size database.""" + +import datetime +import logging +import sys + +from typing import cast, Dict, List, Mapping, Optional, Tuple + +import pandas as pd # type: ignore + +import memdf.report +import memdf.util.config +import memdf.util.sqlite +from memdf.sizedb import SizeDatabase +from memdf import Config + + +QUERY_CONFIG = { + Config.group_map('query'): { + 'group': 'output' + }, + 'report.increases': { + 'help': 'Highlight large increases', + 'metavar': 'PERCENT', + 'default': 0.0, + 'argparse': { + 'alias': ['--threshold'], + 'type': float, + }, + }, + 'query.where': { + 'help': 'SQL filter', + 'metavar': 'SQL-EXPR', + 'default': '', + 'argparse': { + 'alias': ['--where'], + }, + }, + 'query.order': { + 'help': 'sort order', + 'metavar': 'COLUMN[,COLUMN]*', + 'default': '', + 'argparse': { + 'alias': ['--order'], + }, + }, + 'query.limit': { + 'help': 'limit result size', + 'metavar': 'ROWS', + 'default': 0, + 'argparse': { + 'alias': ['--limit'], + }, + }, +} + + +def argsplit(metavar: str, value: str) -> Tuple[Optional[Tuple], Dict]: + """Given comma-separated metavar and values, match them up.""" + values = tuple(value.split(',')) + names = metavar.split(',') + if len(names) < len(values): + logging.error('Too many values for %s', metavar) + return (None, {}) + if len(names) > len(values): + logging.error('Missing %s for %s', ','.join(names[len(values):]), + metavar) + return (None, {}) + return (values, dict(zip(names, values))) + + +def postprocess_canned_sql_option(config: Config, key: str, + info: Mapping) -> None: + """Record information from simple SQL query options in one place.""" + value = config[key] + if not value: + return + title = info['sql']['title'] + if isinstance(value, str): + metavar = info.get('metavar', 'VALUE') + if ',' in metavar: + values, args = argsplit(metavar, value) + if not values: + return + else: + values = (value,) + args = {metavar: value} + title = title.format(**args) + else: + values = tuple() + + if config['queries'] is None: + config['queries'] = [] + cast(list, config['queries']).append((title, key, values, info)) + + +def make_query(config: Config, info: Mapping) -> str: + """Construct an SQL query string for a simple SQL query option.""" + args = {'where': '', 'order': '', 'limit': ''} + if where := config.get('query.where'): + if kw := info['sql'].get('where'): + args['where'] = f'{kw} {where}' + if order := (config.get('query.order') or info['sql'].get('order')): + args['order'] = f'ORDER BY {order}' + if limit := config.get('query.limit'): + args['limit'] = f'LIMIT {limit}' + return info['sql']['query'].format(**args) + + +def postprocess_df_time(_config: Config, df: pd.DataFrame) -> pd.DataFrame: + """Convert a DataFrame ‘time’ column from Unix timestamp to ISO.""" + df['time'] = df['time'].map(lambda t: datetime.datetime.utcfromtimestamp(t) + .isoformat()) + return df + + +def postprocess_df_changes(config: Config, df: pd.DataFrame) -> pd.DataFrame: + """Given ‘parent_size’and ‘commit_size’ columns, add change columns.""" + df['change'] = df.apply(lambda row: row.commit_size - row.parent_size, + axis=1) + df['% change'] = df.apply(lambda row: SizeDatabase.percent_change( + row.parent_size, row.commit_size), + axis=1) + if threshold := config['report.increases']: + df = df[df['% change'] > threshold] + return df + + +QUERY_CONFIG |= { + 'query.platforms': { + 'help': 'List known platforms', + 'default': False, + 'postprocess': postprocess_canned_sql_option, + 'sql': { + 'title': 'Platforms', + 'query': ''' + 
SELECT DISTINCT platform FROM thing {where} {order} {limit} + ''', + 'where': 'WHERE', + 'order': 'platform', + }, + 'argparse': { + 'alias': ['--platforms'], + }, + }, + 'query.platform-targets': { + 'help': 'List known targets for the given platform', + 'metavar': 'PLATFORM', + 'default': '', + 'postprocess': postprocess_canned_sql_option, + 'sql': { + 'title': 'Platform Targets', + 'query': ''' + SELECT DISTINCT platform, config, target + FROM thing + WHERE platform=? {where} + {order} {limit} + ''', + 'where': 'AND', + 'order': 'platform, config, target', + }, + 'argparse': { + 'alias': ['--platform-targets'], + }, + }, + 'query.platform-sections': { + 'help': 'List known sections for the given platform', + 'metavar': 'PLATFORM', + 'default': '', + 'postprocess': postprocess_canned_sql_option, + 'sql': { + 'title': 'Platform Sections', + 'query': ''' + SELECT DISTINCT platform, s.name AS section + FROM thing t + INNER JOIN build b ON t.id == b.thing_id + INNER JOIN size s ON b.id == s.build_id + WHERE platform=? {where} + {order} {limit} + ''', + 'where': 'AND', + 'order': 'platform, section', + }, + 'argparse': { + 'alias': ['--platform-sections'], + }, + }, + 'query.section-sizes': { + 'help': 'List size data for a given build section', + 'metavar': 'PLATFORM,CONFIG,TARGET,SECTION', + 'default': '', + 'postprocess': postprocess_canned_sql_option, + 'sql': { + 'title': 'Sizes for {PLATFORM} {CONFIG} {TARGET} {SECTION}', + 'query': ''' + SELECT DISTINCT time, hash, pr, size + FROM build b + INNER JOIN size s ON b.id == s.build_id + WHERE b.thing_id == (SELECT id FROM thing + WHERE platform == ? + AND config == ? + AND target == ?) + AND name == ? + {where} + {order} {limit} + ''', + 'where': 'AND', + 'order': 'time', + 'postprocess': [postprocess_df_time], + }, + }, + 'query.section-changes': { + 'help': 'List size changes for a given build section', + 'metavar': 'PLATFORM,CONFIG,TARGET,SECTION', + 'default': '', + 'postprocess': postprocess_canned_sql_option, + 'sql': { + 'title': 'Changes for {PLATFORM} {CONFIG} {TARGET} {SECTION}', + 'query': ''' + WITH builds (bid, pid, time, pr, hash) AS ( + SELECT DISTINCT b.id, p.id, b.time, b.pr, b.hash + FROM build b + INNER JOIN build p + ON p.hash = b.parent AND p.thing_id == b.thing_id + WHERE b.thing_id == (SELECT id FROM thing + WHERE platform == ? + AND config == ? + AND target == ?) + ) + SELECT DISTINCT + time, hash, pr, + ps.size as parent_size, + bs.size as commit_size + FROM builds + INNER JOIN size bs ON builds.bid == bs.build_id + INNER JOIN size ps ON builds.pid == ps.build_id + WHERE bs.name == ? 
AND ps.name == bs.name + {where} + {order} {limit} + ''', + 'where': 'AND', + 'order': 'time', + 'postprocess': [postprocess_df_time, postprocess_df_changes], + }, + }, + 'query.all-changes': { + 'help': 'List all size changes', + 'default': False, + 'postprocess': postprocess_canned_sql_option, + 'sql': { + 'title': 'Size Changes', + 'query': ''' + WITH + builds (bid, pid, time, pr, hash, thing_id) AS ( + SELECT DISTINCT b.id, p.id, b.time, b.pr, b.hash, b.thing_id + FROM build b + INNER JOIN build p + ON p.hash = b.parent AND p.thing_id == b.thing_id + ), + changes (bid, tid, name, parent_size, commit_size, change) AS ( + SELECT DISTINCT + bs.build_id, + thing_id, + bs.name, + ps.size as parent_size, + bs.size as commit_size, + bs.size - ps.size as change + FROM builds + INNER JOIN size bs ON builds.bid == bs.build_id + INNER JOIN size ps ON builds.pid == ps.build_id + WHERE bs.name == ps.name + ) + SELECT + time, hash, + platform, config, target, name, + parent_size, commit_size, change + FROM changes + INNER JOIN build ON bid == build.id + INNER JOIN thing ON tid == thing.id + {where} {order} {limit} + ''', + 'where': 'AND', + 'order': 'time', + 'postprocess': [postprocess_df_time, postprocess_df_changes], + }, + }, + 'query.build-sizes': { + # SQLite doesn't have PIVOT so we have to script this. + 'help': 'List size changes for a given build', + 'metavar': 'PLATFORM,CONFIG,TARGET', + 'default': '', + }, +} + + +def get_build_sections(db: SizeDatabase, build: str) -> Optional[Tuple]: + """Split a build arg and get its thing_id and sections.""" + values, args = argsplit('PLATFORM,CONFIG,TARGET', build) + if not values: + return None + + platform = args['PLATFORM'] + pconfig = args['CONFIG'] + ptarget = args['TARGET'] + thing_id = db.select_thing_id(platform, pconfig, ptarget) + if not thing_id: + logging.error('No match for %s,%s,%s', platform, pconfig, ptarget) + return None + + sections = db.select_sections_for_thing(thing_id) + if not sections: + logging.warning('No sections for %s,%s,%s', platform, pconfig, ptarget) + return None + + return (platform, pconfig, ptarget, thing_id, sections) + + +def make_build_sizes_query(config: Config, thing_id: str, + sections: List[str]) -> Tuple[List[str], str]: + """Construct and SQL query for all section sizes for a given thing.""" + # SQLite doesn't have PIVOT so we need to construct a query with + # a column for each section. 
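+    # For illustration, given thing_id 7 and sections ['.text', '.bss'], the
+    # code below assembles roughly this SQL (names and values hypothetical):
+    #   SELECT time, hash, pr, s0.size AS s0z, s1.size AS s1z
+    #   FROM build b
+    #   INNER JOIN size s0 ON b.id == s0.build_id
+    #   INNER JOIN size s1 ON b.id == s1.build_id
+    #   WHERE b.thing_id == 7 AND s0.name == ".text" AND s1.name == ".bss"
+    #   ORDER BY time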
+ columns = ['time', 'hash', 'pr'] + cols = ', '.join(columns) + joins = '' + where = f' WHERE b.thing_id == {thing_id}' + for i, s in enumerate(sections): + columns.append(s) + cols += f', s{i}.size AS s{i}z' + joins += f' INNER JOIN size s{i} ON b.id == s{i}.build_id' + where += f' AND s{i}.name == "{s}"' + if qw := config['query.where']: + where += f' AND {qw}' + query = f'''SELECT {cols} + FROM build b + {joins} + {where} + ORDER BY {config.get('query.order') or 'time'}''' + if limit := config['query.limit']: + query += f' LIMIT {limit}' + return (columns, query) + + +def query_build_sizes(config: Config, db: SizeDatabase, + build: str) -> Optional[pd.DataFrame]: + """Get all sizes for the given build.""" + t = get_build_sections(db, build) + if not t: + return None + platform, pconfig, ptarget, thing_id, sections = t + + columns, query = make_build_sizes_query(config, thing_id, sections) + logging.debug('Query: %s', query) + + cur = db.execute(query) + rows = cur.fetchall() + if rows: + df = pd.DataFrame(rows, columns=columns) + df.attrs = { + 'name': f'qbs-{build}', + 'title': f'Sizes for {platform} {pconfig} {ptarget}', + } + return postprocess_df_time(config, df) + + return None + + +def main(argv): + status = 0 + try: + cfg = { + **memdf.util.config.CONFIG, + **memdf.util.sqlite.CONFIG, + **memdf.report.OUTPUT_CONFIG, + **QUERY_CONFIG, + } + cfg['database.file']['argparse']['required'] = True + + config = Config().init(cfg) + config.parse(argv) + + db = SizeDatabase(config['database.file'], writable=False) + db.open() + + dfs = {} + + q = 0 + for title, key, values, info in config.get('queries', []): + q += 1 + query = make_query(config, info) + logging.debug('Option: %s', key) + logging.debug('Title: %s', title) + logging.debug('Query: %s', query.strip()) + logging.debug('With: %s', values) + cur = db.execute(query, values) + columns = [i[0] for i in cur.description] + rows = cur.fetchall() + if rows: + df = pd.DataFrame(rows, columns=columns) + df.attrs = {'name': f'query{q}', 'title': title} + for f in info['sql'].get('postprocess', []): + df = f(config, df) + dfs[df.attrs['name']] = df + + if build := config['query.build-sizes']: + q += 1 + if (df := query_build_sizes(config, db, build)) is not None: + dfs[df.attrs['name']] = df + + if q == 0: + config.argparse.print_help() + return 1 + + memdf.report.write_dfs(config, + dfs, + hierify=config['hierify'], + title=True, + floatfmt='5.1f') + + except Exception as exception: + raise exception + + return status + + +if __name__ == '__main__': + sys.exit(main(sys.argv)) diff --git a/scripts/tools/memory/gh_report.py b/scripts/tools/memory/gh_report.py index 7a52052c4287a5..608fe9c5edbf9a 100755 --- a/scripts/tools/memory/gh_report.py +++ b/scripts/tools/memory/gh_report.py @@ -17,48 +17,39 @@ """Generate reports from size artifacts.""" import io -import itertools -import json import logging -import os -import os.path import re -import sqlite3 import sys -import zipfile -from pathlib import Path -from typing import Dict, IO, Iterable, Optional, Union +from typing import Dict -import dateutil # type: ignore import fastcore # type: ignore -import ghapi.all # type: ignore import pandas as pd # type: ignore import memdf.report +import memdf.sizedb import memdf.util.config +import memdf.util.markdown import memdf.util.sqlite +from memdf.util.github import Gh from memdf import Config, ConfigDescription -GITHUB_CONFIG: ConfigDescription = { - Config.group_def('github'): { - 'title': 'github options', +DB_CONFIG: ConfigDescription = { + 
Config.group_def('database'): { + 'title': 'database options', }, - 'github.token': { - 'help': 'Github API token, or "SKIP" to suppress connecting to github', - 'metavar': 'TOKEN', - 'default': '', + 'database.readonly': { + 'help': 'Open database read only', + 'default': False, 'argparse': { - 'alias': ['--github-api-token', '--token'], + 'alias': ['--db-readonly'], }, }, - 'github.repository': { - 'help': 'Github repostiory', - 'metavar': 'OWNER/REPO', - 'default': '', - 'argparse': { - 'alias': ['--repo'], - }, +} + +GITHUB_CONFIG: ConfigDescription = { + Config.group_def('github'): { + 'title': 'github options', }, 'github.comment': { 'help': 'Send output as github PR comments', @@ -67,17 +58,6 @@ 'alias': ['--comment'], }, }, - 'github.keep': { - 'help': 'Leave PR artifacts after commenting', - 'default': False, - 'argparse': { - 'alias': ['--keep'], - }, - }, - 'github.dryrun-comment': { - 'help': 'Dry run for sending output as github PR comments', - 'default': False, - }, 'github.limit-comments': { 'help': 'Send no more than COUNT comments', 'metavar': 'COUNT', @@ -94,14 +74,6 @@ 'type': int, }, }, - 'github.limit-artifact-pages': { - 'help': 'Examine no more than COUNT pages of artifacts', - 'metavar': 'COUNT', - 'default': 0, - 'argparse': { - 'type': int, - }, - }, 'github.limit-pr': { 'help': 'Report only on PR, if present.', 'metavar': 'PR', @@ -110,6 +82,9 @@ 'type': int, }, }, +} + +REPORT_CONFIG: ConfigDescription = { Config.group_map('report'): { 'group': 'output' }, @@ -127,13 +102,6 @@ 'alias': ['--push'] }, }, - 'report.query': { - 'help': 'Run an SQL query', - 'default': [], - 'argparse': { - 'alias': ['--query', '--sql'] - }, - }, 'report.increases': { 'help': 'Highlight large increases', 'metavar': 'PERCENT', @@ -146,151 +114,33 @@ } -class SizeDatabase(memdf.util.sqlite.Database): - """A database for recording and comparing size reports.""" - on_open = ["PRAGMA foreign_keys = ON", "PRAGMA encoding = 'UTF-8'"] - on_writable = [ - """ - -- A ‘thing’ identifies the kind of built object. - -- Builds of the same thing are comparable. - CREATE TABLE IF NOT EXISTS thing ( - id INTEGER PRIMARY KEY, - platform TEXT NOT NULL, -- Build platform - config TEXT NOT NULL, -- Build configuration discriminator - target TEXT NOT NULL, -- Build target - UNIQUE(platform, config, target) - ) - """, """ - -- A ‘build’ identifies a built instance of a thing at some point. - CREATE TABLE IF NOT EXISTS build ( - id INTEGER PRIMARY KEY, - thing_id INTEGER REFERENCES thing(id), - hash TEXT NOT NULL, -- Commit hash - parent TEXT NOT NULL, -- Parent commit hash - pr INTEGER DEFAULT 0, -- Github PR number - time INTEGER NOT NULL, -- Unix-epoch timestamp - artifact INTEGER DEFAULT 0, -- Github artifact ID - commented INTEGER DEFAULT 0, -- 1 if recorded in a GH comment - ref TEXT, -- Target git ref - event TEXT, -- Github build trigger event - UNIQUE(thing_id, hash, parent, pr, time, artifact) - ) - """, """ - -- A ‘size’ entry gives the size of a section for a particular build. 
- CREATE TABLE IF NOT EXISTS size ( - build_id INTEGER REFERENCES build(id), - name TEXT NOT NULL, -- Section name - size INTEGER NOT NULL, -- Section size in bytes - PRIMARY KEY (build_id, name) - ) - """ - ] +class SizeContext: + """Generate reports from size artifacts.""" + + comment_format_re = re.compile(r"^") def __init__(self, config: Config): - super().__init__(config['database.file']) self.config = config - self.gh = gh_open(config) - self.deleted_artifacts: set[int] = set() - - def add_sizes(self, **kwargs): - """ - Add a size report to the database. - - The incoming arguments must contain the required non-ID column names - from ‘thing’ and ‘build’ tables, plus a 'sizes' entry that is a - sequence of mappings containing 'name' and 'size'. - """ - td = {k: kwargs[k] for k in ('platform', 'config', 'target')} - thing = self.store_and_return_id('thing', **td) - bd = {k: kwargs[k] for k in ('hash', 'parent', 'time', 'event')} - if 'ref' in kwargs: - bd['ref'] = kwargs['ref'] - cd = {k: kwargs.get(k, 0) for k in ('pr', 'artifact', 'commented')} - build = self.store_and_return_id('build', thing_id=thing, **bd, **cd) - if build is None: - logging.error('Failed to store %s %s %s', thing, bd, cd) - else: - for d in kwargs['sizes']: - self.store('size', build_id=build, **d) - - def add_sizes_from_json(self, s: Union[bytes, str], origin: Dict): - """Add sizes from a JSON size report.""" - r = origin.copy() - r.update(json.loads(s)) - r['sizes'] = [] - # Add section sizes. - for i in r['frames'].get('section', []): - r['sizes'].append({'name': i['section'], 'size': i['size']}) - # Add segment sizes. - for i in r['frames'].get('wr', []): - r['sizes'].append({ - 'name': ('(read only)', '(read/write)')[int(i['wr'])], - 'size': i['size'] - }) - self.add_sizes(**r) - - def add_sizes_from_zipfile(self, f: Union[IO, Path], origin: Dict): - """Add size reports from a zip.""" - with zipfile.ZipFile(f, 'r') as zip_file: - for i in zip_file.namelist(): - if i.endswith('-sizes.json'): - origin['member'] = i - with zip_file.open(i) as member: - self.add_sizes_from_json(member.read(), origin) - - def add_sizes_from_file(self, filename: str): - """Add size reports from a file.""" - origin = {'file': filename} - path = Path(filename) - if path.suffix == '.json': - logging.info('ASJ: reading JSON %s', path) - with open(path) as f: - self.add_sizes_from_json(f.read(), origin) - elif path.suffix == '.zip': - logging.info('ASZ: reading ZIP %s', path) - self.add_sizes_from_zipfile(path, origin) - else: - logging.warning('Unknown file type "%s" ignored', filename) + self.gh = Gh(config) + db_file = config.get('database.file', ':memory:') + self.db = memdf.sizedb.SizeDatabase(db_file, + not config['database.readonly']) + self.db.open() def add_sizes_from_github(self): """Read size report artifacts from github.""" - if not self.gh: - return - artifact_limit = self.config['github.limit-artifacts'] - artifact_pages = self.config['github.limit-artifact-pages'] - - # Size artifacts have names of the form: - # Size,{group},{pr},{commit_hash},{parent_hash}[,{event}] - # Record them keyed by group and commit_hash to match them up + # Record size artifacts keyed by group and commit_hash to match them up # after we have the entire list. 
- page = 0 size_artifacts: Dict[str, Dict[str, fastcore.basics.AttrDict]] = {} - for i in ghapi.all.paged(self.gh.actions.list_artifacts_for_repo, 100): - if not i.artifacts: - break - for a in i.artifacts: - if a.name.startswith('Size,') and a.name.count(',') >= 4: - _, group, pr, commit, parent, *etc = a.name.split(',') - a.parent = parent - a.pr = pr - a.created_at = dateutil.parser.isoparse(a.created_at) - # Old artifact names don't include the event. - if etc: - event = etc[0] - else: - event = 'push' if pr == '0' else 'pull_request' - a.event = event - if group not in size_artifacts: - size_artifacts[group] = {} - size_artifacts[group][commit] = a - logging.debug('ASG: artifact %d %s', a.id, a.name) - page += 1 - logging.debug('ASP: artifact page %d of %d', page, artifact_pages) - if artifact_pages and page >= artifact_pages: - break + for a in self.gh.get_size_artifacts(): + if a.group not in size_artifacts: + size_artifacts[a.group] = {} + size_artifacts[a.group][a.commit] = a + logging.debug('ASG: artifact %d %s', a.id, a.name) # Determine required size artifacts. + artifact_limit = self.config['github.limit-artifacts'] required_artifact_ids: set[int] = set() for group, group_reports in size_artifacts.items(): logging.debug('ASG: group %s', group) @@ -313,419 +163,223 @@ def add_sizes_from_github(self): # Download and add required artifacts. for i in required_artifact_ids: - logging.debug('ASD: download artifact %d', i) - try: - blob = self.gh.actions.download_artifact(i, 'zip') - except Exception as e: - blob = None - logging.error('Failed to download artifact %d: %s', i, e) + blob = self.gh.download_artifact(i) if blob: - self.add_sizes_from_zipfile(io.BytesIO(blob), {'artifact': i}) + self.db.add_sizes_from_zipfile(io.BytesIO(blob), + {'artifact': i}) def read_inputs(self): """Read size report from github and/or local files.""" - self.add_sizes_from_github() + if self.gh: + self.add_sizes_from_github() for filename in self.config['args.inputs']: - self.add_sizes_from_file(filename) - self.commit() - - def select_matching_commits(self): - """Find matching builds, where one's commit is the other's parent.""" - return self.execute(''' - SELECT DISTINCT - c.event as event, - c.pr AS pr, - c.hash AS hash, - p.hash AS parent - FROM build c - INNER JOIN build p ON p.hash = c.parent - WHERE c.commented = 0 - ORDER BY c.time DESC, c.pr, c.hash, p.hash - ''') - - def set_commented(self, build_ids: Iterable[int]): - """Set the commented flag for the given builds.""" - if not build_ids: - return - for build_id in build_ids: - self.execute('UPDATE build SET commented = 1 WHERE id = ?', - (build_id, )) - self.commit() - - def delete_stale_builds(self, build_ids: Iterable[int]): - """Delete stale builds.""" - if not build_ids: - return - for build_id in build_ids: - logging.info('DSB: deleting obsolete build %d', build_id) - self.execute('DELETE FROM size WHERE build_id = ?', (build_id, )) - self.execute('DELETE FROM build WHERE id = ?', (build_id, )) - self.commit() - - def delete_artifact(self, artifact_id: int): - if (self.gh and artifact_id - and artifact_id not in self.deleted_artifacts): - self.deleted_artifacts.add(artifact_id) - try: - self.gh.actions.delete_artifact(artifact_id) - except Exception: - # During manual testing we sometimes lose the race against CI. 
- logging.error('Failed to delete artifact %d', artifact_id) - - def delete_stale_artifacts(self, stale_artifacts: Iterable[int]): - if not self.config['github.keep']: - for artifact_id in stale_artifacts: - logging.info('DSA: deleting obsolete artifact %d', artifact_id) - self.delete_artifact(artifact_id) - - def should_report(self, event: Optional[str] = None) -> bool: - """Return true if reporting is enabled for the event.""" - if event is None: + self.db.add_sizes_from_file(filename) + self.db.commit() + return self + + def should_report(self, event: str = '') -> bool: + """Return true if reporting is enabled for the action event.""" + if not event: return self.config['report.pr'] or self.config['report.push'] if event == 'pull_request': return self.config['report.pr'] return self.config['report.push'] - -def gh_open(config: Config) -> Optional[ghapi.core.GhApi]: - """Return a GhApi, if so configured.""" - gh: Optional[ghapi.core.GhApi] = None - if config['github.repository']: - owner, repo = config.get('github.repository').split('/', 1) - config.put('github.owner', owner) - config.put('github.repo', repo) - if not config['github.token']: - config['github.token'] = os.environ.get('GITHUB_TOKEN') - if not config['github.token']: - logging.error('Missing --github-token') - return None - token = config['github.token'] - if token != 'SKIP': - gh = ghapi.all.GhApi(owner=owner, - repo=repo, - token=config['github.token']) - return gh - - -def gh_get_comments_for_pr(gh: ghapi.core.GhApi, pr: int): - return itertools.chain.from_iterable( - ghapi.all.paged(gh.issues.list_comments, pr)) - - -def gh_get_commits_for_pr(gh: ghapi.core.GhApi, pr: int): - return itertools.chain.from_iterable( - ghapi.all.paged(gh.pulls.list_commits, pr)) - - -def percent_change(a: int, b: int) -> float: - if a == 0: - return 0.0 if b == 0 else float('inf') - return 100. * (b - a) / a - - -def changes_for_commit(db: SizeDatabase, pr: int, commit: str, - parent: str) -> pd.DataFrame: - """Return a DataFrame with size changes between the given commits.""" - cur = db.execute( - ''' - SELECT DISTINCT - t.id AS thing, - cb.artifact AS artifact, - pb.id AS parent_build, - cb.id AS commit_build, - t.platform, t.config, t.target, - cs.name AS name, - ps.size AS parent_size, - cs.size AS commit_size, - cs.size - ps.size AS change, - cb.time AS time - FROM thing t - INNER JOIN build cb ON cb.thing_id = t.id - INNER JOIN build pb ON pb.thing_id = t.id AND pb.hash = cb.parent - INNER JOIN size cs ON cs.build_id = cb.id - INNER JOIN size ps ON ps.build_id = pb.id AND cs.name = ps.name - WHERE cb.hash = ? AND pb.hash = ? - ORDER BY t.platform, t.config, t.target, - cs.name, cb.time DESC, pb.time DESC - ''', (commit, parent)) - - keep = ('platform', 'target', 'config', 'name', 'parent_size', - 'commit_size', 'change') - things: set[int] = set() - artifacts: set[int] = set() - builds: set[int] = set() - stale_builds: set[int] = set() - stale_artifacts: set[int] = set() - previous: Optional[sqlite3.Row] = None - rows = [] - - for row in cur.fetchall(): - row = sqlite3.Row(cur, row) - things.add(row['thing']) - if (previous is not None and row['thing'] == previous['thing'] - and row['name'] == previous['name']): - # This is duplicate build, older because we sort descending, - # presumably from a partial workflow re-run. 
- if row['parent_build'] != previous['parent_build']: - stale_builds.add(row['parent_build']) - if row['commit_build'] != previous['commit_build']: - stale_builds.add(row['commit_build']) - stale_artifacts.add(row['artifact']) - else: - previous = row - new = [row[k] for k in keep] - new.append(percent_change(row['parent_size'], row['commit_size'])) - rows.append(new) - artifacts.add(row['artifact']) - builds.add(row['commit_build']) - - db.delete_stale_builds(stale_builds) - db.delete_stale_artifacts(stale_artifacts) - - df = pd.DataFrame(rows, - columns=('platform', 'target', 'config', 'section', - parent[:8], commit[:8], 'change', '% change')) - df.attrs = { - 'name': f'{pr},{parent},{commit}', - 'title': (f'PR #{pr}: ' if pr else '') + - f'Size comparison from {parent} to {commit}', - 'things': things, - 'builds': builds, - 'artifacts': artifacts, - 'pr': pr, - 'commit': commit, - 'parent': parent, - } - return df - - -comment_format_re = re.compile(r"^") - - -def gh_send_change_report(db: SizeDatabase, df: pd.DataFrame) -> bool: - """Send a change report as a github comment.""" - - if not db.gh: - return False - - # Look for an existing comment for this change. - pr = df.attrs['pr'] - - # Check the most recent commit on the PR, so that we don't comment on - # builds that are already outdated. - commit = df.attrs['commit'] - commits = sorted( - gh_get_commits_for_pr(db.gh, pr), - key=lambda c: f'{c.commit.committer.date}{c.commit.author.date}', - reverse=True) - if commits and commit != commits[0].sha: - logging.info('SCS: PR #%s: not commenting for stale %s; newest is %s', - pr, commit, commits[0].sha) - # Return True so that the obsolete artifacts get removed. - return True - - # Check for an existing size report comment. If one exists, we'll add - # the new report to it. - title = df.attrs['title'] - existing_comment = None - existing_comment_format = 0 - for comment in gh_get_comments_for_pr(db.gh, pr): - comment_parts = comment.body.partition('\n') - if comment_parts[0].strip() == title: - existing_comment = comment - if m := comment_format_re.match(comment_parts[2]): - existing_comment_format = int(m.group(1)) - break - - if existing_comment_format != 1: + def get_existing_comment(self, pr: int, title: str): + """Check for an existing comment.""" existing_comment = None - text = gh_comment_v1(db, df, existing_comment) + existing_comment_format = 0 + for comment in self.gh.get_comments_for_pr(pr): + comment_parts = comment.body.partition('\n') + if comment_parts[0].strip() == title: + existing_comment = comment + if m := self.comment_format_re.match(comment_parts[2]): + existing_comment_format = int(m.group(1)) + break + return (existing_comment, existing_comment_format) + + def get_newest_commit(self, pr: int) -> str: + """Get the hash of the most recent commit on the PR.""" + commits = sorted( + self.gh.get_commits_for_pr(pr), + key=lambda c: f'{c.commit.committer.date}{c.commit.author.date}', + reverse=True) + return commits[0].sha if commits else '' + + def post_change_report(self, df: pd.DataFrame) -> bool: + """Send a change report as a github comment.""" + if not self.gh: + return False + pr = df.attrs['pr'] + + # Check for an existing size report comment. If one exists, we'll add + # the new information to it. + existing_comment, existing_comment_format = self.get_existing_comment( + pr, df.attrs['title']) + + if not existing_comment: + # Check the most recent commit on the PR, so that we don't comment + # for commits that are already outdated. 
+ commit = df.attrs['commit'] + latest = self.get_newest_commit(pr) + if commit != latest: + logging.info( + 'SCS: PR #%s: not commenting for stale %s; newest is %s', + pr, commit, latest) + # Return True so that the obsolete artifacts get removed. + return True + + if existing_comment_format == 1: + df = V1Comment.merge(df, existing_comment) + else: + existing_comment = None + text = V1Comment.format(self.config, df) - logging.info( - 'SCR: %s %s', df.attrs['title'], - f'updating comment {existing_comment.id}' - if existing_comment else 'as new comment') + if existing_comment: + return self.gh.update_comment(existing_comment.id, text) + return self.gh.create_comment(pr, text) - if db.config['github.dryrun-comment']: - logging.debug('%s', text) - return False + def report_matching_commits(self) -> Dict[str, pd.DataFrame]: + """Report on all new comparable commits.""" + if not self.should_report(): + return {} - try: - if existing_comment: - db.gh.issues.update_comment(existing_comment.id, text) - else: - db.gh.issues.create_comment(pr, text) - return True - except Exception: - return False - - -def gh_comment_v1(db: SizeDatabase, df: pd.DataFrame, existing_comment) -> str: - """Format a github comment.""" - - if existing_comment: - df = v1_comment_merge(df, existing_comment) - - threshold_df = None - increase_df = df[df['change'] > 0] - if increase_df.empty: - increase_df = None - elif threshold := db.config['report.increases']: - threshold_df = df[df['% change'] > threshold] - if threshold_df.empty: - threshold_df = None - decrease_df = df[df['change'] < 0] - if decrease_df.empty: - decrease_df = None - - with io.StringIO() as md: - md.write(df.attrs['title']) - md.write('\n\n\n') - - if threshold_df is not None: - md.write(f'**Increases above {threshold:.2g}%:**\n\n') - md.write('\n\n') - v1_comment_write_df(db, threshold_df, md) - - if increase_df is not None: - summary = v1_comment_summary(increase_df) - md.write('
<details>\n') - md.write(f'<summary>Increases ({summary})</summary>\n') - md.write('\n\n') - v1_comment_write_df(db, increase_df, md) - md.write('</details>
\n\n') + comment_count = 0 + comment_limit = self.config['github.limit-comments'] + comment_enabled = (self.config['github.comment'] + or self.config['github.dryrun-comment']) - if decrease_df is not None: - summary = v1_comment_summary(decrease_df) - md.write('
<details>\n') - md.write(f'<summary>Decreases ({summary})</summary>\n') - md.write('\n\n') - v1_comment_write_df(db, decrease_df, md) - md.write('</details>
\n\n') - - summary = v1_comment_summary(df) - md.write('
<details>\n') - md.write(f'<summary>Full report ({summary})</summary>\n') - md.write('\n\n') - v1_comment_write_df(db, df, md) - md.write('\n</details>
\n') - - return md.getvalue() - - -def v1_comment_merge(df: pd.DataFrame, comment) -> pd.DataFrame: - with io.StringIO(comment.body) as body: - for line in body: - if line.startswith(''): - body.readline() # Blank line before table. - header, rows = read_hierified(body) - break - logging.debug('REC: read %d rows', len(rows)) - df = df.append(pd.DataFrame(data=rows, columns=header).astype(df.dtypes)) - return df.sort_values( - by=['platform', 'target', 'config', 'section']).drop_duplicates() - - -def read_hierified(f): - """Read a markdown table in ‘hierified’ format.""" - - line = f.readline() - header = tuple((s.strip() for s in line.split('|')[1:-1])) - - _ = f.readline() # The line under the header. - - rows = [] - for line in f: - line = line.strip() - if not line: - break - row = [] - columns = line.split('|') - for i in range(0, len(header)): - column = columns[i + 1].strip() - if not column: - column = rows[-1][i] - row.append(column) - rows.append(tuple(row)) - - return (header, rows) - - -def v1_comment_write_df(db: SizeDatabase, df: pd.DataFrame, - out: memdf.report.OutputOption): - memdf.report.write_df(db.config, - df, - out, - 'pipe', - hierify=True, - title=False, - floatfmt='5.1f') - - -def v1_comment_summary(df: pd.DataFrame) -> str: - count = df[['platform', 'target', 'config']].drop_duplicates().shape[0] - platforms = ', '.join(sorted(list(set(df['platform'])))) - return f'{count} build{"" if count == 1 else "s"} for {platforms}' - - -def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]: - """Report on all new comparable commits.""" - if not db.should_report(): - return {} - - comment_count = 0 - comment_limit = db.config['github.limit-comments'] - comment_enabled = (db.config['github.comment'] - or db.config['github.dryrun-comment']) - - only_pr = db.config['github.limit-pr'] - - dfs = {} - for event, pr, commit, parent in db.select_matching_commits().fetchall(): - if not db.should_report(event): - continue - - # Github doesn't have a way to fetch artifacts associated with a - # particular PR. For testing purposes, filter to a single PR here. - if only_pr and pr != only_pr: - continue - - df = changes_for_commit(db, pr, commit, parent) - if df.empty: - # Matching commits had no new matching builds. - continue - dfs[df.attrs['name']] = df - - if (event == 'pull_request' and comment_enabled - and (comment_limit == 0 or comment_limit > comment_count)): - if gh_send_change_report(db, df): - # Mark the originating builds, and remove the originating - # artifacts, so that they don't generate duplicate report - # comments. - db.set_commented(df.attrs['builds']) - if not db.config['github.keep']: - for artifact_id in df.attrs['artifacts']: - logging.info('RMC: deleting artifact %d', artifact_id) - db.delete_artifact(artifact_id) - comment_count += 1 - return dfs - - -def report_queries(db: SizeDatabase) -> Dict[str, pd.DataFrame]: - """Perform any requested SQL queries.""" - dfs = {} - q = 0 - for query in db.config['report.query']: - q += 1 - cur = db.execute(query) - columns = [i[0] for i in cur.description] - rows = cur.fetchall() - if rows: - df = pd.DataFrame(rows, columns=columns) - df.attrs = {'name': f'query{q}', 'title': query} + only_pr = self.config['github.limit-pr'] + + dfs = {} + commits = self.db.select_matching_commits() + for event, pr, commit, parent in commits.fetchall(): + if not self.should_report(event): + continue + + # Github doesn't have a way to fetch artifacts associated with a + # particular PR. 
For testing purposes, filter to a single PR here. + if only_pr and pr != only_pr: + continue + + changes = self.db.select_changes(parent, commit) + + self.db.delete_builds(changes.stale_builds) + self.gh.delete_artifacts(changes.stale_artifacts) + + if not changes.rows: + # Matching commits had no new matching builds. + continue + + df = pd.DataFrame(changes.rows, columns=changes.columns) + df.attrs = { + 'name': f'{pr},{parent},{commit}', + 'title': (f'PR #{pr}: ' if pr else '') + + f'Size comparison from {parent} to {commit}', + 'things': changes.things, + 'builds': changes.builds, + 'artifacts': changes.artifacts, + 'pr': pr, + 'commit': commit, + 'parent': parent, + } dfs[df.attrs['name']] = df - db.commit() - return dfs + + if (event == 'pull_request' and comment_enabled + and (comment_limit == 0 or comment_limit > comment_count)): + if self.post_change_report(df): + # Mark the originating builds, and remove the originating + # artifacts, so that they don't generate duplicate report + # comments. + self.db.set_commented(df.attrs['builds']) + self.gh.delete_artifacts(df.attrs['artifacts']) + comment_count += 1 + return dfs + + +class V1Comment: + """Format of a GitHub comment.""" + + @staticmethod + def format(config: Config, df: pd.DataFrame): + """Format a GitHub comment.""" + + threshold_df = None + increase_df = df[df['change'] > 0] + if increase_df.empty: + increase_df = None + elif threshold := config['report.increases']: + threshold_df = df[df['% change'] > threshold] + if threshold_df.empty: + threshold_df = None + decrease_df = df[df['change'] < 0] + if decrease_df.empty: + decrease_df = None + + with io.StringIO() as md: + md.write(df.attrs['title']) + md.write('\n\n\n') + + if threshold_df is not None: + md.write(f'**Increases above {threshold:.2g}%:**\n\n') + md.write('\n\n') + V1Comment.write_df(config, threshold_df, md) + + if increase_df is not None: + summary = V1Comment.summary(increase_df) + md.write('
<details>\n') + md.write(f'<summary>Increases ({summary})</summary>\n') + md.write('\n\n') + V1Comment.write_df(config, increase_df, md) + md.write('</details>
\n\n') + + if decrease_df is not None: + summary = V1Comment.summary(decrease_df) + md.write('
<details>\n') + md.write(f'<summary>Decreases ({summary})</summary>\n') + md.write('\n\n') + V1Comment.write_df(config, decrease_df, md) + md.write('</details>
\n\n') + + summary = V1Comment.summary(df) + md.write('
<details>\n') + md.write(f'<summary>Full report ({summary})</summary>\n') + md.write('\n\n') + V1Comment.write_df(config, df, md) + md.write('\n</details>
\n') + + return md.getvalue() + + @staticmethod + def summary(df: pd.DataFrame) -> str: + count = df[['platform', 'target', 'config']].drop_duplicates().shape[0] + platforms = ', '.join(sorted(list(set(df['platform'])))) + return f'{count} build{"" if count == 1 else "s"} for {platforms}' + + @staticmethod + def write_df(config: Config, df: pd.DataFrame, + out: memdf.report.OutputOption): + memdf.report.write_df(config, + df, + out, + 'pipe', + hierify=True, + title=False, + floatfmt='5.1f') + + @staticmethod + def merge(df: pd.DataFrame, comment) -> pd.DataFrame: + """Merge an existing comment into the DataFrame.""" + with io.StringIO(comment.body) as body: + for line in body: + if line.startswith(''): + body.readline() # Blank line before table. + cols, rows = memdf.util.markdown.read_hierified(body) + break + logging.debug('REC: read %d rows', len(rows)) + df = df.append(pd.DataFrame(data=rows, columns=cols).astype(df.dtypes)) + return df.sort_values( + by=['platform', 'target', 'config', 'section']).drop_duplicates() def main(argv): @@ -733,18 +387,19 @@ def main(argv): try: config = Config().init({ **memdf.util.config.CONFIG, + **memdf.util.github.CONFIG, **memdf.util.sqlite.CONFIG, **memdf.report.OUTPUT_CONFIG, **GITHUB_CONFIG, + **DB_CONFIG, + **REPORT_CONFIG, }) config.argparse.add_argument('inputs', metavar='FILE', nargs='*') config.parse(argv) - dfs = {} - with SizeDatabase(config) as db: - db.read_inputs() - dfs.update(report_matching_commits(db)) - dfs.update(report_queries(db)) + szc = SizeContext(config) + szc.read_inputs() + dfs = szc.report_matching_commits() memdf.report.write_dfs(config, dfs, diff --git a/scripts/tools/memory/memdf/README.md b/scripts/tools/memory/memdf/README.md index 6741da33162378..83628c7676e25a 100644 --- a/scripts/tools/memory/memdf/README.md +++ b/scripts/tools/memory/memdf/README.md @@ -1,17 +1,25 @@ This package contains routines to to collect, aggregate, and report memory usage, using Pandas `DataFrame` as the primary representation. -- memdf.df — `DataFrame` utilities, in particular definitions of columns and - types for the main uses of data frames. -- memdf.name — Names for synthetic symbols, etc. - memdf.collect — Helpers to read memory information from various sources (e.g. executables) according to command line options. -- memdf.select — Helpers to select relevant subsets of data frames according - to command line or configured options. +- memdf.df — `DataFrame` utilities, in particular definitions of columns and + types for the main uses of data frames. +- memdf.name — Names for synthetic symbols, etc. Individual readers are + located under memdf.collector. - memdf.report — Helpers to write data frames in various formats according to command line or configured options. +- memdf.select — Helpers to select relevant subsets of data frames according + to command line or configured options. +- memdf.sizedb — Helpers for a database of size information. + +Modules under memdf.util are not specifically tied to memory usage. + - memdf.util.config — `Config` utility class for managing command line or other options according to a declarative description. +- memdf.util.github — Utilities for communicating with GitHub. +- memdf.util.markdown — Utilities for manipulating Markdown text. - memdf.util.nd — Nested dictionary utilities, used by `Config`. - memdf.util.pretty — Pretty-printed logging utility functions. +- memdf.util.sqlite - Utilities for connecting to a sqlite3 database. - memdf.util.subprocess — Utilities for executing external commands. 
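+
+All of the scripts above use the declarative `Config` pattern from
+`memdf.util.config`: options are described as a dictionary and read back by
+dotted key after parsing. A minimal sketch (the `example.*` key is
+hypothetical, purely for illustration):
+
+```
+from memdf import Config
+import memdf.util.config
+
+EXAMPLE_CONFIG = {
+    Config.group_def('example'): {'title': 'example options'},
+    'example.limit': {
+        'help': 'Process no more than COUNT items',
+        'metavar': 'COUNT',
+        'default': 0,
+        'argparse': {'type': int},
+    },
+}
+
+def main(argv):
+    config = Config().init({**memdf.util.config.CONFIG, **EXAMPLE_CONFIG})
+    config.parse(argv)
+    return config['example.limit']  # options are read back by dotted key
+```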
diff --git a/scripts/tools/memory/memdf/collect.py b/scripts/tools/memory/memdf/collect.py index b04c7b87d2b63c..0575b967fe4dd7 100644 --- a/scripts/tools/memory/memdf/collect.py +++ b/scripts/tools/memory/memdf/collect.py @@ -62,7 +62,9 @@ **memdf.collector.elftools.CONFIG, **memdf.collector.readelf.CONFIG, 'collect.method': { - 'help': 'Method of input processing', + 'help': + 'Method of input processing: one of' + ' elftools, readelf, bloaty, csv, tsv, su.', 'metavar': 'METHOD', 'choices': ['elftools', 'readelf', 'bloaty', 'csv', 'tsv', 'su'], 'default': 'elftools', diff --git a/scripts/tools/memory/memdf/report.py b/scripts/tools/memory/memdf/report.py index ad5e391492dcc0..bda23163842bd8 100644 --- a/scripts/tools/memory/memdf/report.py +++ b/scripts/tools/memory/memdf/report.py @@ -69,7 +69,7 @@ } -def postprocess_report_by(config: Config, key: str) -> None: +def postprocess_report_by(config: Config, key: str, info: Mapping) -> None: """For --report-by=region, select all sections.""" assert key == 'report.by' if config.get(key) == 'region': @@ -147,7 +147,8 @@ def hierify(df: pd.DataFrame) -> pd.DataFrame: } -def postprocess_output_metadata(config: Config, key: str) -> None: +def postprocess_output_metadata(config: Config, key: str, + info: Mapping) -> None: """For --output-metadata=KEY:VALUE list, convert to dictionary.""" assert key == 'output.metadata' metadata = {} @@ -466,7 +467,7 @@ def __init__(self, 'title': 'output options', }, 'output.format': { - 'help': 'Output format', + 'help': f'Output format: one of {", ".join(WRITERS)}.', 'metavar': 'FORMAT', 'default': 'simple', 'choices': list(WRITERS.keys()), diff --git a/scripts/tools/memory/memdf/select.py b/scripts/tools/memory/memdf/select.py index 77a3d3f05b4d1e..251b1be5642c9a 100644 --- a/scripts/tools/memory/memdf/select.py +++ b/scripts/tools/memory/memdf/select.py @@ -23,7 +23,7 @@ from memdf import Config, ConfigDescription, DF -from typing import Optional +from typing import Mapping, Optional def split_size(config: Config, key: str) -> None: @@ -51,7 +51,7 @@ def get_limit(config: Config, column: str, name: str) -> int: return config.getl([column, 'limit', name], config.get('report.limit', 0)) -def postprocess_selections(config: Config, key: str) -> None: +def postprocess_selections(config: Config, key: str, info: Mapping) -> None: """Resolve select/ignore command options.""" split_size(config, key) choice, select = key.split('.') diff --git a/scripts/tools/memory/memdf/sizedb.py b/scripts/tools/memory/memdf/sizedb.py new file mode 100644 index 00000000000000..7e59caedd042e6 --- /dev/null +++ b/scripts/tools/memory/memdf/sizedb.py @@ -0,0 +1,254 @@ +# +# Copyright (c) 2021 Project CHIP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +"""Sqlite3 database of binary sizes over time.""" + +import collections +import json +import logging +import sqlite3 +import zipfile + +from pathlib import Path +from typing import Dict, IO, Iterable, List, Optional, Union + +import memdf.util.sqlite + +ChangeInfo = collections.namedtuple('ChangeInfo', [ + 'columns', 'rows', 'things', 'builds', 'stale_builds', 'artifacts', + 'stale_artifacts' +]) + + +class SizeDatabase(memdf.util.sqlite.Database): + """A database for recording and comparing size reports.""" + on_open = ["PRAGMA foreign_keys = ON", "PRAGMA encoding = 'UTF-8'"] + on_writable = [ + """ + -- A ‘thing’ identifies the kind of built object. + -- Builds of the same thing are comparable. + CREATE TABLE IF NOT EXISTS thing ( + id INTEGER PRIMARY KEY, + platform TEXT NOT NULL, -- Build platform + config TEXT NOT NULL, -- Build configuration discriminator + target TEXT NOT NULL, -- Build target + UNIQUE(platform, config, target) + ) + """, """ + -- A ‘build’ identifies a built instance of a thing at some point. + CREATE TABLE IF NOT EXISTS build ( + id INTEGER PRIMARY KEY, + thing_id INTEGER REFERENCES thing(id), + hash TEXT NOT NULL, -- Commit hash + parent TEXT NOT NULL, -- Parent commit hash + pr INTEGER DEFAULT 0, -- Github PR number + time INTEGER NOT NULL, -- Unix-epoch timestamp + artifact INTEGER DEFAULT 0, -- Github artifact ID + commented INTEGER DEFAULT 0, -- 1 if recorded in a GH comment + ref TEXT, -- Target git ref + event TEXT, -- Github build trigger event + UNIQUE(thing_id, hash, parent, pr, time, artifact) + ) + """, """ + -- A ‘size’ entry gives the size of a section for a particular build. + CREATE TABLE IF NOT EXISTS size ( + build_id INTEGER REFERENCES build(id), + name TEXT NOT NULL, -- Section name + size INTEGER NOT NULL, -- Section size in bytes + PRIMARY KEY (build_id, name) + ) + """ + ] + + def __init__(self, filename: str, writable: bool = True): + super().__init__(filename, writable) + + def add_sizes(self, **kwargs): + """ + Add a size report to the database. + + The incoming arguments must contain the required non-ID column names + from ‘thing’ and ‘build’ tables, plus a 'sizes' entry that is a + sequence of mappings containing 'name' and 'size'. + """ + td = {k: kwargs[k] for k in ('platform', 'config', 'target')} + thing = self.store_and_return_id('thing', **td) + bd = {k: kwargs[k] for k in ('hash', 'parent', 'time', 'event')} + if 'ref' in kwargs: + bd['ref'] = kwargs['ref'] + cd = {k: kwargs.get(k, 0) for k in ('pr', 'artifact', 'commented')} + build = self.store_and_return_id('build', thing_id=thing, **bd, **cd) + if build is None: + logging.error('Failed to store %s %s %s', thing, bd, cd) + else: + for d in kwargs['sizes']: + self.store('size', build_id=build, **d) + + def add_sizes_from_json(self, s: Union[bytes, str], origin: Dict): + """Add sizes from a JSON size report.""" + r = origin.copy() + r.update(json.loads(s)) + r['sizes'] = [] + # Add section sizes. + for i in r['frames'].get('section', []): + r['sizes'].append({'name': i['section'], 'size': i['size']}) + # Add segment sizes. 
+ for i in r['frames'].get('wr', []): + r['sizes'].append({ + 'name': ('(read only)', '(read/write)')[int(i['wr'])], + 'size': + i['size'] + }) + self.add_sizes(**r) + + def add_sizes_from_zipfile(self, f: Union[IO, Path], origin: Dict): + """Add size reports from a zip.""" + with zipfile.ZipFile(f, 'r') as zip_file: + for i in zip_file.namelist(): + if i.endswith('-sizes.json'): + origin['member'] = i + with zip_file.open(i) as member: + self.add_sizes_from_json(member.read(), origin) + + def add_sizes_from_file(self, filename: str): + """Add size reports from a file.""" + origin = {'file': filename} + path = Path(filename) + if path.suffix == '.json': + logging.info('ASJ: reading JSON %s', path) + with open(path, encoding='utf-8') as f: + self.add_sizes_from_json(f.read(), origin) + elif path.suffix == '.zip': + logging.info('ASZ: reading ZIP %s', path) + self.add_sizes_from_zipfile(path, origin) + else: + logging.warning('Unknown file type "%s" ignored', filename) + + def select_thing_id(self, platform: str, config: str, + target: str) -> Optional[str]: + cur = self.execute( + 'SELECT id FROM thing WHERE platform=? AND config=? AND target=?', + (platform, config, target)) + row = cur.fetchone() + return row[0] if row else None + + def select_sections_for_thing(self, thing: str) -> List[str]: + cur = self.execute( + ''' + SELECT DISTINCT name FROM size WHERE build_id = ( + SELECT DISTINCT id FROM build WHERE thing_id == ?) + ORDER BY name + ''', (thing,)) + return [row[0] for row in cur.fetchall()] + + def select_matching_commits(self): + """Find matching builds, where one's commit is the other's parent.""" + return self.execute(''' + SELECT DISTINCT + c.event as event, + c.pr AS pr, + c.hash AS hash, + p.hash AS parent + FROM build c + INNER JOIN build p ON p.hash = c.parent + WHERE c.commented = 0 + ORDER BY c.time DESC, c.pr, c.hash, p.hash + ''') + + def select_changes(self, parent: str, commit: str) -> ChangeInfo: + """Returns size changes between the given commits.""" + cur = self.execute( + ''' + SELECT DISTINCT + t.id AS thing, + cb.artifact AS artifact, + pb.id AS parent_build, + cb.id AS commit_build, + t.platform, t.config, t.target, + cs.name AS name, + ps.size AS parent_size, + cs.size AS commit_size, + cb.time AS time + FROM thing t + INNER JOIN build cb ON cb.thing_id = t.id + INNER JOIN build pb ON pb.thing_id = t.id AND pb.hash = cb.parent + INNER JOIN size cs ON cs.build_id = cb.id + INNER JOIN size ps ON ps.build_id = pb.id AND cs.name = ps.name + WHERE cb.hash = ? AND pb.hash = ? + ORDER BY t.platform, t.config, t.target, + cs.name, cb.time DESC, pb.time DESC + ''', (commit, parent)) + + keep = ('platform', 'target', 'config', 'name', 'parent_size', + 'commit_size') + things: set[int] = set() + artifacts: set[int] = set() + builds: set[int] = set() + stale_builds: set[int] = set() + stale_artifacts: set[int] = set() + previous: Optional[sqlite3.Row] = None + rows = [] + + for row in cur.fetchall(): + row = sqlite3.Row(cur, row) + things.add(row['thing']) + if (previous is not None and row['thing'] == previous['thing'] + and row['name'] == previous['name']): + # This is duplicate build, older because we sort descending, + # presumably from a partial workflow re-run. 
+ if row['parent_build'] != previous['parent_build']: + stale_builds.add(row['parent_build']) + if row['commit_build'] != previous['commit_build']: + stale_builds.add(row['commit_build']) + stale_artifacts.add(row['artifact']) + else: + previous = row + new = [row[k] for k in keep] + parent_size = row['parent_size'] + commit_size = row['commit_size'] + new.append(commit_size - parent_size) + new.append(self.percent_change(parent_size, commit_size)) + rows.append(new) + artifacts.add(row['artifact']) + builds.add(row['commit_build']) + + return ChangeInfo(('platform', 'target', 'config', 'section', + parent[:8], commit[:8], 'change', '% change'), rows, + things, builds, stale_builds, artifacts, + stale_artifacts) + + def set_commented(self, build_ids: Iterable[int]): + """Set the commented flag for the given builds.""" + if not build_ids: + return + for build_id in build_ids: + self.execute('UPDATE build SET commented = 1 WHERE id = ?', + (build_id, )) + self.commit() + + def delete_builds(self, build_ids: Iterable[int]): + """Delete the given builds.""" + if not build_ids: + return + for build_id in build_ids: + self.execute('DELETE FROM size WHERE build_id = ?', (build_id, )) + self.execute('DELETE FROM build WHERE id = ?', (build_id, )) + self.commit() + + @staticmethod + def percent_change(a: int, b: int) -> float: + if a == 0: + return 0.0 if b == 0 else float('inf') + return 100. * (b - a) / a diff --git a/scripts/tools/memory/memdf/util/config.py b/scripts/tools/memory/memdf/util/config.py index 20f1434a236233..f8294ad8b037a5 100644 --- a/scripts/tools/memory/memdf/util/config.py +++ b/scripts/tools/memory/memdf/util/config.py @@ -41,8 +41,8 @@ # supplied as keyword arguments to `argparse.add_argument()`, # except for: # 'alias': list of alternate command line option names -# 'postprocess': a callable invoked after argument parsing with two -# arguments: the config and the key +# 'postprocess': a callable invoked after argument parsing with three +# arguments: the config, the key, and the description entry. # # Special keys can be used to control argument parser groups. By default any # configuration key containing a ‘.’ belongs to a group determined by the @@ -157,6 +157,7 @@ def init_args(self, desc: ConfigDescription, *args, **kwargs) -> 'Config': arg_info = arg_info.copy() name = arg_info.pop('argument', '--' + key.replace('.', '-')) names = [name] + arg_info.pop('alias', []) + info['names'] = names for k in ['metavar', 'choices']: if k in info: arg_info[k] = info[k] @@ -171,7 +172,7 @@ def init_args(self, desc: ConfigDescription, *args, **kwargs) -> 'Config': elif isinstance(default, int) and 'metavar' not in info: arg_info['action'] = 'count' if postprocess := info.get('postprocess'): - self.postprocess_args[key] = postprocess + self.postprocess_args[key] = (postprocess, info) group: Optional[str] = info.get('group') if group is None and (e := key.find('.')) > 0: @@ -226,10 +227,6 @@ def parse(self, argv: Sequence[str]) -> 'Config': key = 'args.' + dest self.put(key, value) - # Postprocess config. - for key, action in self.postprocess_args.items(): - action(self, key) - # Configure logging. if self.get('log-level') is None: verbose = self.get('verbose', 0) @@ -242,6 +239,11 @@ def parse(self, argv: Sequence[str]) -> 'Config': logging.basicConfig(level=self.get('log-level'), format=self.get('log-format')) + # Postprocess config. 
+ for key, postprocess in self.postprocess_args.items(): + action, info = postprocess + action(self, key, info) + memdf.util.pretty.debug(self.d) return self @@ -292,8 +294,8 @@ def __call__(self, parser, namespace, values, option_string=None): # Config description of options shared by all tools. CONFIG: ConfigDescription = { 'log-level': { - 'help': 'Set logging level', - 'metavar': 'LEVEL', + 'help': + 'Set logging level: one of critical, error, warning, info, debug.', 'default': None, 'choices': ['critical', 'error', 'warning', 'info', 'debug'], }, diff --git a/scripts/tools/memory/memdf/util/github.py b/scripts/tools/memory/memdf/util/github.py new file mode 100644 index 00000000000000..bcd14c597701a2 --- /dev/null +++ b/scripts/tools/memory/memdf/util/github.py @@ -0,0 +1,237 @@ +# +# Copyright (c) 2022 Project CHIP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Utility wrapper for GitHub operations.""" + +import itertools +import logging +import os + +from typing import Iterable, Mapping, Optional + +import dateutil # type: ignore +import dateutil.parser # type: ignore +import ghapi.all # type: ignore + +from memdf import Config, ConfigDescription + + +def postprocess_config(config: Config, _key: str, _info: Mapping) -> None: + """Postprocess --github-repository.""" + if config['github.repository']: + owner, repo = config.get('github.repository').split('/', 1) + config.put('github.owner', owner) + config.put('github.repo', repo) + if not config['github.token']: + config['github.token'] = os.environ.get('GITHUB_TOKEN') + if not config['github.token']: + logging.error('Missing --github-token') + + +CONFIG: ConfigDescription = { + Config.group_def('github'): { + 'title': 'github options', + }, + 'github.token': { + 'help': 'Github API token, or "SKIP" to suppress connecting to github', + 'metavar': 'TOKEN', + 'default': '', + 'argparse': { + 'alias': ['--github-api-token', '--token'], + }, + }, + 'github.repository': { + 'help': 'Github repostiory', + 'metavar': 'OWNER/REPO', + 'default': '', + 'argparse': { + 'alias': ['--repo'], + }, + 'postprocess': postprocess_config, + }, + 'github.dryrun-comment': { + 'help': "Don't actually post comments", + 'default': False, + }, + 'github.keep': { + 'help': "Don't remove PR artifacts", + 'default': False, + 'argparse': { + 'alias': ['--keep'], + }, + }, + 'github.limit-artifact-pages': { + 'help': 'Examine no more than COUNT pages of artifacts', + 'metavar': 'COUNT', + 'default': 0, + 'argparse': { + 'type': int, + }, + }, +} + + +class Gh: + """Utility wrapper for GitHub operations.""" + + def __init__(self, config: Config): + self.config = config + self.ghapi: Optional[ghapi.all.GhApi] = None + self.deleted_artifacts: set[int] = set() + + owner = config['github.owner'] + repo = config['github.repo'] + token = config['github.token'] + if owner and repo and token and token != 'SKIP': + self.ghapi = ghapi.all.GhApi(owner=owner, repo=repo, token=token) + + def __bool__(self): + return self.ghapi is not None + + def 
get_comments_for_pr(self, pr: int): + """Iterate PR comments.""" + assert self.ghapi + try: + return itertools.chain.from_iterable( + ghapi.all.paged(self.ghapi.issues.list_comments, pr)) + except Exception as e: + logging.error('Failed to get comments for PR #%d: %s', pr, e) + return [] + + def get_commits_for_pr(self, pr: int): + """Iterate PR commits.""" + assert self.ghapi + try: + return itertools.chain.from_iterable( + ghapi.all.paged(self.ghapi.pulls.list_commits, pr)) + except Exception as e: + logging.error('Failed to get commits for PR #%d: %s', pr, e) + return [] + + def get_artifacts(self, page_limit: int = -1, per_page: int = -1): + """Iterate artifact descriptions.""" + if page_limit < 0: + page_limit = self.config['github.limit-artifact-pages'] + if per_page < 0: + per_page = self.config['github.artifacts-per-page'] or 100 + + assert self.ghapi + try: + page = 0 + for i in ghapi.all.paged( + self.ghapi.actions.list_artifacts_for_repo, + per_page): + if not i.artifacts: + break + for a in i.artifacts: + yield a + page += 1 + logging.debug('ASP: artifact page %d of %d', page, page_limit) + if page_limit and page >= page_limit: + break + except Exception as e: + logging.error('Failed to get artifact list: %s', e) + + def get_size_artifacts(self, + page_limit: int = -1, + per_page: int = -1, + label: str = ''): + """Iterate size artifact descriptions.""" + for a in self.get_artifacts(page_limit, per_page): + # Size artifacts have names of the form: + # Size,{group},{pr},{commit_hash},{parent_hash}[,{event}] + # This information is added to the attribute record from GitHub. + if a.name.startswith('Size,') and a.name.count(',') >= 4: + _, group, pr, commit, parent, *etc = a.name.split(',') + if label and group != label: + continue + a.group = group + a.commit = commit + a.parent = parent + a.pr = pr + a.created_at = dateutil.parser.isoparse(a.created_at) + # Old artifact names don't include the event. + if etc: + event = etc[0] + else: + event = 'push' if pr == '0' else 'pull_request' + a.event = event + yield a + + def download_artifact(self, artifact_id: int): + """Download a GitHub artifact, returning a binary zip object.""" + logging.debug('Downloading artifact %d', artifact_id) + try: + assert self.ghapi + return self.ghapi.actions.download_artifact(artifact_id, 'zip') + except Exception as e: + logging.error('Failed to download artifact %d: %s', artifact_id, e) + return None + + def delete_artifact(self, artifact_id: int) -> bool: + """Delete a GitHub artifact.""" + if not artifact_id or artifact_id in self.deleted_artifacts: + return True + self.deleted_artifacts.add(artifact_id) + + if self.config['github.keep']: + logging.info('Suppressed deleting artifact %d', artifact_id) + return False + + try: + assert self.ghapi + logging.info('Deleting artifact %d', artifact_id) + self.ghapi.actions.delete_artifact(artifact_id) + return True + except Exception as e: + # During manual testing we sometimes lose the race against CI. 
+ logging.error('Failed to delete artifact %d: %s', artifact_id, e) + return False + + def delete_artifacts(self, artifacts: Iterable[int]): + for artifact_id in artifacts: + self.delete_artifact(artifact_id) + + def create_comment(self, issue_id: int, text: str) -> bool: + """Create a GitHub comment.""" + if self.config['github.dryrun-comment']: + logging.info('Suppressed creating comment on #%d', issue_id) + logging.debug('%s', text) + return False + + assert self.ghapi + logging.info('Creating comment on #%d', issue_id) + try: + self.ghapi.issues.create_comment(issue_id, text) + return True + except Exception as e: + logging.error('Failed to created comment on #%d: %s', issue_id, e) + return False + + def update_comment(self, comment_id: int, text: str) -> bool: + """Update a GitHub comment.""" + if self.config['github.dryrun-comment']: + logging.info('Suppressed updating comment #%d', comment_id) + logging.debug('%s', text) + return False + + logging.info('Updating comment #%d', comment_id) + try: + assert self.ghapi + self.ghapi.issues.update_comment(comment_id, text) + return True + except Exception as e: + logging.error('Failed to update comment %d: %s', comment_id, e) + return False diff --git a/scripts/tools/memory/memdf/util/markdown.py b/scripts/tools/memory/memdf/util/markdown.py new file mode 100644 index 00000000000000..52aea03e86db91 --- /dev/null +++ b/scripts/tools/memory/memdf/util/markdown.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2021 Project CHIP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Markdown utilities.""" + + +def read_hierified(f): + """Read a markdown table in ‘hierified’ format.""" + + line = f.readline() + header = tuple((s.strip() for s in line.split('|')[1:-1])) + + _ = f.readline() # The line under the header. 
+ + rows = [] + for line in f: + line = line.strip() + if not line: + break + row = [] + columns = line.split('|') + for i in range(0, len(header)): + column = columns[i + 1].strip() + if not column: + column = rows[-1][i] + row.append(column) + rows.append(tuple(row)) + + return (header, rows) diff --git a/scripts/tools/memory/memdf/util/sqlite.py b/scripts/tools/memory/memdf/util/sqlite.py index 94b6f9b33a2f81..c0d825ac98f860 100644 --- a/scripts/tools/memory/memdf/util/sqlite.py +++ b/scripts/tools/memory/memdf/util/sqlite.py @@ -20,6 +20,8 @@ from typing import List, Optional +import pandas as pd # type: ignore + from memdf import Config, ConfigDescription CONFIG: ConfigDescription = { @@ -29,7 +31,7 @@ 'database.file': { 'help': 'Sqlite3 file', 'metavar': 'FILENAME', - 'default': ':memory:', + 'default': None, 'argparse': { 'alias': ['--db'], }, @@ -112,3 +114,12 @@ def get_matching_id(self, table: str, **kwargs): def store_and_return_id(self, table: str, **kwargs) -> Optional[int]: self.store(table, **kwargs) return self.get_matching_id(table, **kwargs) + + def data_frame(self, query, parameters=None) -> pd.DataFrame: + """Return the results of a query as a DataFrame.""" + cur = self.execute(query, parameters) + columns = [i[0] for i in cur.description] + df = pd.DataFrame(cur.fetchall(), columns=columns) + self.commit() + df.attrs = {'title': query} + return df
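
The pieces above are easiest to follow end to end. Below is a minimal sketch of driving `memdf/sizedb.py` directly, in the way `gh_db_load.py` and `gh_report.py` do: load a `*-sizes.json` report produced by `gh_sizes.py`, pair up builds where one commit is the other's parent, and print the size changes. The database path is illustrative, and the `open()` call is an assumption about the inherited `memdf.util.sqlite.Database` class, which is not part of this change.

```
from memdf.sizedb import SizeDatabase

db = SizeDatabase('/tmp/sizes.db', writable=True)
db.open()  # Assumed from the base Database class; not shown in this diff.

# A `.zip` of CI artifacts also works, via add_sizes_from_zipfile().
db.add_sizes_from_file(
    '/tmp/bloat_reports/linux-arm64-thermostat-no-ble-sizes.json')

# Pair up builds where one commit is the other's parent, then diff them.
for event, pr, commit, parent in db.select_matching_commits():
    changes = db.select_changes(parent, commit)
    # ChangeInfo.columns is the header; each row holds (platform, target,
    # config, section, parent size, commit size, change, % change).
    for row in changes.rows:
        print(dict(zip(changes.columns, row)))
```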
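
The `Gh` wrapper in `memdf/util/github.py` can feed that database from CI artifacts. This is a rough sketch only: it assumes `config` is a memdf `Config` already populated from `memdf.util.github.CONFIG` and command-line arguments (so `github.owner`, `github.repo`, and `github.token` are set), that the artifact records carry an `id` field, and that `download_artifact()` returns the zip as bytes. The `Linux-Examples` label matches the artifact-name convention described in the CI README.

```
import io

from memdf.util.github import Gh

gh = Gh(config)  # `config` is assumed to be an initialized memdf Config.
if gh:  # Falsy when no usable repository/token was configured.
    for artifact in gh.get_size_artifacts(label='Linux-Examples'):
        # Each record is annotated with group, pr, commit, parent, and
        # event, parsed from the `Size,...` artifact name.
        blob = gh.download_artifact(artifact.id)
        if blob:
            db.add_sizes_from_zipfile(io.BytesIO(blob),
                                      {'artifact': artifact.id})
```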
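
The new `Database.data_frame()` helper is the natural bridge from such a database into the tabular reporting pipeline: it runs a query, names the columns from the cursor description, and stashes the query text in `DataFrame.attrs['title']`. Against the schema above it can be used along these lines (the query itself is only an illustration):

```
df = db.data_frame(
    'SELECT t.platform, t.config, t.target, COUNT(b.id) AS builds'
    '  FROM thing t JOIN build b ON b.thing_id = t.id'
    '  GROUP BY t.id')
print(df.attrs['title'])
print(df.to_string(index=False))
```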
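
Finally, `read_hierified()` in `memdf/util/markdown.py` reads back the ‘hierified’ tables the report writers produce, where blank cells on continuation rows inherit their value from the row above. A self-contained example, with table contents invented for illustration:

```
import io

from memdf.util.markdown import read_hierified

table = io.StringIO(
    '| platform | target     | section | size |\n'
    '| -------- | ---------- | ------- | ---- |\n'
    '| linux    | thermostat | .text   | 100  |\n'
    '|          |            | .bss    | 20   |\n')

header, rows = read_hierified(table)
assert header == ('platform', 'target', 'section', 'size')
# Blank cells were filled in from the row above:
assert rows[1] == ('linux', 'thermostat', '.bss', '20')
```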