Workflow fail parsing (#28819)

* Update recent_fail_summary.yaml * Update summarize_fail.py * Update recent_fail_summary.yaml * Update summarize_fail.py * Update summarize_fail.py * Update recent_fail_summary.yaml * Update recent_fail_summary.yaml * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update recent_fail_summary.yaml * Update summarize_fail.py * code style * Update summarize_fail.py * Update recent_fail_summary.yaml * Update summarize_fail.py * Update recent_fail_summary.yaml * Update summarize_fail.py * Update summarize_fail.py * styling * Update summarize_fail.py * Update summarize_fail.py * Update recent_fail_summary.yaml * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Update summarize_fail.py * Restyled by autopep8 * Restyled by isort --------- Co-authored-by: Restyled.io <[email protected]>
project-chip · Sep 2, 2023 · edbe8dc · edbe8dc
1 parent 486aea0
commit edbe8dc
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 18 deletions.
diff --git a/.github/workflows/recent_fail_summary.yaml b/.github/workflows/recent_fail_summary.yaml
@@ -26,22 +26,20 @@ jobs:
     name: Summarize Recent Workflow Failures
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: Setup
-        run: |
-          gh run list -R project-chip/connectedhomeip -b master -s failure --json displayTitle,workflowName > run_list.json
-          pip install pandas
-        env:
-          GH_TOKEN: ${{ github.token }}
+      - uses: actions/checkout@v3
+      - run: pip install pandas python-slugify
       - name: Run Summarization Script
         run: python scripts/tools/summarize_fail.py
+        env:
+          GH_TOKEN: ${{ github.token }}
       - name: Upload Logs
         uses: actions/upload-artifact@v3
         with:
           name: workflow-fail-summary
           path: |
+            run_list.json
             recent_fails.csv
             recent_fails_frequency.csv
+            recent_fails_logs
           retention-days: 5
 
diff --git a/scripts/tools/summarize_fail.py b/scripts/tools/summarize_fail.py
@@ -1,12 +1,93 @@
+import logging
+import os
+import subprocess
+
 import pandas as pd
+from slugify import slugify
+
+error_catalog = {
+    "CodeQL": {
+        "No space left on device": {
+            "short": "Ran out of space",
+            "detail": "Exception with signature \"No space left on device\""
+        },
+        "Check that the disk containing the database directory has ample free space.": {
+            "short": "Ran out of space",
+            "detail": "Fatal internal error with message indicating that disk space most likely ran out"
+        }
+    },
+    "Build example": {
+        "Could not find a version that satisfies the requirement": {
+            "short": "Requirements issue",
+            "detail": "Unable to install a requirements in Python requirements.txt"
+        },
+        "No module named": {
+            "short": "Missing module",
+            "detail": "Expected module was missing"
+        }
+    },
+    "Full builds": {
+        "No space left on device": {
+            "short": "Ran out of space",
+            "detail": "Exception with signature \"No space left on device\""
+        }
+    }
+}
+
+
+def process_fail(id, pr, start_time, workflow):
+    logging.info("Processing failure in {pr}, workflow {workflow} that started at {start_time}.")
+
+    logging.info("Building output file structure.")
+    output_path = f"recent_fails_logs/{slugify(pr)}/{slugify(workflow)}/{slugify(start_time)}"
+    os.makedirs(output_path)
+
+    logging.info("Gathering raw fail logs.")
+    subprocess.run(f"gh run view -R project-chip/connectedhomeip {id} --log-failed > {output_path}/fail_log.txt", shell=True)
+
+    # Eventually turn this into a catalog of error messages per workflow
+    logging.info("Collecting info on likely cause of failure.")
+    root_cause = "Unknown cause"
+    with open(f"{output_path}/fail_log.txt") as fail_log_file:
+        fail_log = fail_log_file.read()
+        workflow_category = workflow.split(" - ")[0]
+        if workflow_category in error_catalog:
+            for error_message in error_catalog[workflow_category]:
+                if error_message in fail_log:
+                    root_cause = error_catalog[workflow_category][error_message]["short"]
+                    break
+    return [pr, workflow, root_cause]
+
+
+def main():
+    logging.info("Gathering recent fails information into run_list.json.")
+    subprocess.run("gh run list -R project-chip/connectedhomeip -b master -s failure --json databaseId,displayTitle,startedAt,workflowName > run_list.json", shell=True)
+
+    logging.info("Reading run_list.json into a DataFrame.")
+    df = pd.read_json("run_list.json")
+
+    logging.info("Listing recent fails.")
+    df.columns = ["ID", "Pull Request", "Start Time", "Workflow"]
+    print("Recent Fails:")
+    print(df.to_string(columns=["Pull Request", "Workflow"], index=False))
+    print()
+    df.to_csv("recent_fails.csv", index=False)
+
+    logging.info("Listing frequency of recent fails by workflow.")
+    frequency = df["Workflow"].value_counts(normalize=True).mul(100).astype(
+        str).reset_index(name="Percentage")  # Reformat this from "50.0" to "50%"
+    print("Percentage Frequency of Fails by Workflow:")
+    print(frequency.to_string(index=False))
+    print()
+    frequency.to_csv("recent_workflow_fails_frequency.csv")
+
+    logging.info("Conducting fail information parsing.")
+    root_causes = df.apply(lambda row: process_fail(row["ID"], row["Pull Request"],
+                           row["Start Time"], row["Workflow"]), axis=1, result_type="expand")
+    root_causes.columns = ["Pull Request", "Workflow", "Cause of Failure"]
+    print("Likely Root Cause of Recent Fails:")
+    print(root_causes.to_string(index=False))
+
 
-df = pd.read_json("run_list.json")
-df.columns = ["Pull Request", "Workflow"]
-print("Recent Failures:")
-print(df.to_string(index=False))
-df.to_csv("recent_fails.csv", index=False)
-print()
-print("Percentage Frequency:")
-frequency = df["Workflow"].value_counts(normalize=True).mul(100).astype(str) + "%"
-print(frequency.to_string())
-frequency.to_csv("recent_fails_frequency.csv")
+if __name__ == "__main__":
+    main()