From 323571581c11193d98beabe60aa8f3668712dd02 Mon Sep 17 00:00:00 2001
From: Kevin Schoedel <67607049+kpschoedel@users.noreply.github.com>
Date: Tue, 26 Oct 2021 13:36:55 -0400
Subject: [PATCH] Size reports: Improved comments (#10982)

#### Problem

It's hard to pick out actual size changes, beyond those called out in
the large-increase tables.

Report information can be distributed across multiple collapsed tables,
since the generator often runs before all builds are complete.

#### Change overview

Read back any existing comment and merge in new reports.

Comments now contain:

1. An open table highlighting large increases, if any.
2. A collapsed table of all increases, if any.
3. A collapsed table of all decreases, if any.
4. A collapsed table of all data.

#### Testing

Tested with manual runs:
https://github.com/project-chip/connectedhomeip/pull/10979#issuecomment-951981745
---
 scripts/tools/memory/gh_report.py         | 218 ++++++++++++++++------
 scripts/tools/memory/memdf/util/sqlite.py |  16 +-
 2 files changed, 170 insertions(+), 64 deletions(-)

diff --git a/scripts/tools/memory/gh_report.py b/scripts/tools/memory/gh_report.py
index e4ea4eabb3d3b6..44644f9220bf2d 100755
--- a/scripts/tools/memory/gh_report.py
+++ b/scripts/tools/memory/gh_report.py
@@ -22,6 +22,7 @@
 import logging
 import os
 import os.path
+import re
 import sqlite3
 import sys
 import zipfile
@@ -101,6 +102,14 @@
             'type': int,
         },
     },
+    'github.limit-pr': {
+        'help': 'Report only on PR, if present.',
+        'metavar': 'PR',
+        'default': 0,
+        'argparse': {
+            'type': int,
+        },
+    },
     Config.group_map('report'): {
         'group': 'output'
     },
@@ -194,8 +203,11 @@ def add_sizes(self, **kwargs):
         bd = {k: kwargs[k] for k in ('hash', 'parent', 'time')}
         cd = {k: kwargs.get(k, 0) for k in ('pr', 'artifact', 'commented')}
         build = self.store_and_return_id('build', thing_id=thing, **bd, **cd)
-        for d in kwargs['sizes']:
-            self.store('size', build_id=build, **d)
+        if build is None:
+            logging.error('Failed to store %s %s %s', thing, bd, cd)
+        else:
+            for d in kwargs['sizes']:
+                self.store('size', build_id=build, **d)
 
     def add_sizes_from_json(self, s: Union[bytes, str], origin: Dict):
         """Add sizes from a JSON size report."""
@@ -331,9 +343,14 @@ def delete_stale_builds(self, build_ids: Iterable[int]):
         self.commit()
 
     def delete_artifact(self, artifact_id: int):
-        if self.gh and artifact_id and artifact_id not in self.deleted_artifacts:
+        if (self.gh and artifact_id
+                and artifact_id not in self.deleted_artifacts):
             self.deleted_artifacts.add(artifact_id)
-            self.gh.actions.delete_artifact(artifact_id)
+            try:
+                self.gh.actions.delete_artifact(artifact_id)
+            except Exception:
+                # During manual testing we sometimes lose the race against CI.
+                logging.error('Failed to delete artifact %d', artifact_id)
 
     def delete_stale_artifacts(self, stale_artifacts: Iterable[int]):
         if not self.config['github.keep']:
@@ -454,11 +471,16 @@ def changes_for_commit(db: SizeDatabase, pr: int, commit: str,
     return df
 
 
-def gh_send_change_report(db: SizeDatabase, df: pd.DataFrame,
-                          tdf: pd.DataFrame) -> bool:
+comment_format_re = re.compile(r"^<!--ghr-comment-format:(\d+)-->")
+
+
+def gh_send_change_report(db: SizeDatabase, df: pd.DataFrame) -> bool:
     """Send a change report as a github comment."""
+
     if not db.gh:
         return False
+
+    # Look for an existing comment for this change.
     pr = df.attrs['pr']
 
     # Check the most recent commit on the PR, so that we don't comment on
@@ -475,54 +497,32 @@ def gh_send_change_report(db: SizeDatabase, df: pd.DataFrame,
     # Check for an existing size report comment. If one exists, we'll add
     # the new report to it.
     title = df.attrs['title']
-    existing_comment_id = 0
+    existing_comment = None
+    existing_comment_format = 0
     for comment in gh_get_comments_for_pr(db.gh, pr):
-        if comment.body.partition('\n')[0] == df.attrs['title']:
-            existing_comment_id = comment.id
-            title = comment.body
+        comment_parts = comment.body.partition('\n')
+        if comment_parts[0].strip() == title:
+            existing_comment = comment
+            if m := comment_format_re.match(comment_parts[2]):
+                existing_comment_format = int(m.group(1))
             break
 
-    md = io.StringIO()
-    md.write(title)
-    md.write('\n')
-
-    if tdf is not None and not tdf.empty:
-        md.write(f'\n**{tdf.attrs["title"]}:**\n\n')
-        memdf.report.write_df(db.config,
-                              tdf,
-                              md,
-                              'pipe',
-                              hierify=True,
-                              title=False,
-                              tabulate={'floatfmt': '5.1f'})
-
-    count = len(df.attrs['things'])
-    platforms = ', '.join(sorted(list(set(df['platform']))))
-    summary = f'{count} build{"" if count == 1 else "s"} (for {platforms})'
-    md.write(f'\n<details>\n<summary>{summary}</summary>\n\n')
-    memdf.report.write_df(db.config,
-                          df,
-                          md,
-                          'pipe',
-                          hierify=True,
-                          title=False,
-                          tabulate={'floatfmt': '5.1f'})
-    md.write('\n</details>\n')
-    text = md.getvalue()
-    md.close()
+    if existing_comment_format != 1:
+        existing_comment = None
+    text = gh_comment_v1(db, df, existing_comment)
 
-    if existing_comment_id:
-        comment = f'updating comment {existing_comment_id}'
-    else:
-        comment = 'as new comment'
-    logging.info('SCR: %s for %s %s', df.attrs['title'], summary, comment)
+    logging.info(
+        'SCR: %s %s', df.attrs['title'],
+        f'updating comment {existing_comment.id}'
+        if existing_comment else 'as new comment')
 
     if db.config['github.dryrun-comment']:
+        logging.debug('%s', text)
         return False
 
     try:
-        if existing_comment_id:
-            db.gh.issues.update_comment(existing_comment_id, text)
+        if existing_comment:
+            db.gh.issues.update_comment(existing_comment.id, text)
         else:
             db.gh.issues.create_comment(pr, text)
         return True
@@ -530,6 +530,114 @@ def gh_send_change_report(db: SizeDatabase, df: pd.DataFrame,
         return False
 
 
+def gh_comment_v1(db: SizeDatabase, df: pd.DataFrame, existing_comment) -> str:
+    """Format a github comment."""
+
+    if existing_comment:
+        df = v1_comment_merge(df, existing_comment)
+
+    threshold_df = None
+    increase_df = df[df['change'] > 0]
+    if increase_df.empty:
+        increase_df = None
+    elif threshold := db.config['report.increases']:
+        threshold_df = df[df['% change'] > threshold]
+        if threshold_df.empty:
+            threshold_df = None
+    decrease_df = df[df['change'] < 0]
+    if decrease_df.empty:
+        decrease_df = None
+
+    with io.StringIO() as md:
+        md.write(df.attrs['title'])
+        md.write('\n<!--ghr-comment-format:1-->\n\n')
+
+        if threshold_df is not None:
+            md.write(f'**Increases above {threshold:.2g}%:**\n\n')
+            md.write('<!--ghr-report:threshold-->\n\n')
+            v1_comment_write_df(db, threshold_df, md)
+
+        if increase_df is not None:
+            summary = v1_comment_summary(increase_df)
+            md.write('<details>\n')
+            md.write(f'<summary>Increases ({summary})</summary>\n')
+            md.write('<!--ghr-report:increases-->\n\n')
+            v1_comment_write_df(db, increase_df, md)
+            md.write('</details>\n\n')
+
+        if decrease_df is not None:
+            summary = v1_comment_summary(decrease_df)
+            md.write('<details>\n')
+            md.write(f'<summary>Decreases ({summary})</summary>\n')
+            md.write('<!--ghr-report:decreases-->\n\n')
+            v1_comment_write_df(db, decrease_df, md)
+            md.write('</details>\n\n')
+
+        summary = v1_comment_summary(df)
+        md.write('<details>\n')
+        md.write(f'<summary>Full report ({summary})</summary>\n')
+        md.write('<!--ghr-report:full-->\n\n')
+        v1_comment_write_df(db, df, md)
+        md.write('\n</details>\n')
+
+        return md.getvalue()
+
+
+def v1_comment_merge(df: pd.DataFrame, comment) -> pd.DataFrame:
+    with io.StringIO(comment.body) as body:
+        for line in body:
+            if line.startswith('<!--ghr-report:full-->'):
+                body.readline()  # Blank line before table.
+                header, rows = read_hierified(body)
+                break
+    logging.debug('REC: read %d rows', len(rows))
+    df = df.append(pd.DataFrame(data=rows, columns=header).astype(df.dtypes))
+    return df.sort_values(
+        by=['platform', 'target', 'config', 'section']).drop_duplicates()
+
+
+def read_hierified(f):
+    """Read a markdown table in ‘hierified’ format."""
+
+    line = f.readline()
+    header = tuple((s.strip() for s in line.split('|')[1:-1]))
+
+    _ = f.readline()  # The line under the header.
+
+    rows = []
+    for line in f:
+        line = line.strip()
+        if not line:
+            break
+        row = []
+        columns = line.split('|')
+        for i in range(0, len(header)):
+            column = columns[i + 1].strip()
+            if not column:
+                column = rows[-1][i]
+            row.append(column)
+        rows.append(tuple(row))
+
+    return (header, rows)
+
+
+def v1_comment_write_df(db: SizeDatabase, df: pd.DataFrame,
+                        out: memdf.report.OutputOption):
+    memdf.report.write_df(db.config,
+                          df,
+                          out,
+                          'pipe',
+                          hierify=True,
+                          title=False,
+                          tabulate={'floatfmt': '5.1f'})
+
+
+def v1_comment_summary(df: pd.DataFrame) -> str:
+    count = df[['platform', 'target', 'config']].drop_duplicates().shape[0]
+    platforms = ', '.join(sorted(list(set(df['platform']))))
+    return f'{count} build{"" if count == 1 else "s"} for {platforms}'
+
+
 def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]:
     """Report on all new comparable commits."""
     if not (db.config['report.pr'] or db.config['report.push']):
@@ -542,30 +650,24 @@ def report_matching_commits(db: SizeDatabase) -> Dict[str, pd.DataFrame]:
 
     report_push = db.config['report.push']
     report_pr = db.config['report.pr']
+    only_pr = db.config['github.limit-pr']
 
     dfs = {}
     for pr, commit, parent in db.select_matching_commits().fetchall():
         if not (report_pr if pr else report_push):
             continue
 
+        # Github doesn't have a way to fetch artifacts associated with a
+        # particular PR. For testing purposes, filter to a single PR here.
+        if only_pr and pr != only_pr:
+            continue
+
         df = changes_for_commit(db, pr, commit, parent)
         dfs[df.attrs['name']] = df
 
-        if threshold := db.config['report.increases']:
-            tdf = df[df['% change'] > threshold]
-        else:
-            tdf = None
-        if tdf is not None and not tdf.empty:
-            commit = df.attrs['commit']
-            parent = df.attrs['parent']
-            tdf.attrs['name'] = f'L,{commit},{parent}'
-            tdf.attrs['title'] = (
-                f'Increases above {threshold:.1f}% from {parent} to {commit}')
-            dfs[tdf.attrs['name']] = tdf
-
         if (pr and comment_enabled
                 and (comment_limit == 0 or comment_limit > comment_count)):
-            if gh_send_change_report(db, df, tdf):
+            if gh_send_change_report(db, df):
                 # Mark the originating builds, and remove the originating
                 # artifacts, so that they don't generate duplicate report
                 # comments.
diff --git a/scripts/tools/memory/memdf/util/sqlite.py b/scripts/tools/memory/memdf/util/sqlite.py
index eba40f1818de8c..94b6f9b33a2f81 100644
--- a/scripts/tools/memory/memdf/util/sqlite.py
+++ b/scripts/tools/memory/memdf/util/sqlite.py
@@ -97,14 +97,18 @@ def store(self, table: str, **kwargs):
         self.connection().execute(q, v)
 
     def get_matching(self, table: str, columns: List[str], **kwargs):
-        return self.connection().execute(
-            f"SELECT {','.join(columns)} FROM {table}"
-            f"  WHERE {'=? AND '.join(kwargs.keys())}=?",
-            list(kwargs.values()))
+        q = (f"SELECT {','.join(columns)} FROM {table}"
+             f"  WHERE {'=? AND '.join(kwargs.keys())}=?")
+        v = list(kwargs.values())
+        return self.connection().execute(q, v)
 
     def get_matching_id(self, table: str, **kwargs):
-        return self.get_matching(table, ['id'], **kwargs).fetchone()[0]
+        cur = self.get_matching(table, ['id'], **kwargs)
+        row = cur.fetchone()
+        if row:
+            return row[0]
+        return None
 
-    def store_and_return_id(self, table: str, **kwargs) -> int:
+    def store_and_return_id(self, table: str, **kwargs) -> Optional[int]:
         self.store(table, **kwargs)
         return self.get_matching_id(table, **kwargs)