Skip to content

Commit

Permalink
Full day fail summary (#29609)
Browse files Browse the repository at this point in the history
* Update summarize_fail.py

* Update recent_fail_summary.yaml

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update recent_fail_summary.yaml

* Update recent_fail_summary.yaml

* Create build_fail_defs.yaml

* Update build_fail_defs.yaml

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update recent_fail_summary.yaml

* Update summarize_fail.py

* Update summarize_fail.py

* Update recent_fail_summary.yaml

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update summarize_fail.py

* Update recent_fail_summary.yaml

* Update recent_fail_summary.yaml

* Update summarize_fail.py

* Update recent_fail_summary.yaml

* Update summarize_fail.py

* Update summarize_fail.py

* Update recent_fail_summary.yaml

* Restyled by prettier-yaml

* Restyled by autopep8

* Restyled by isort

* Update summarize_fail.py

---------

Co-authored-by: Restyled.io <[email protected]>
  • Loading branch information
2 people authored and pull[bot] committed Jan 12, 2024
1 parent 29f113e commit c1f7cfc
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 55 deletions.
20 changes: 15 additions & 5 deletions .github/workflows/recent_fail_summary.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
name: Recent Fail Summary
on:
schedule:
- cron: "0 0 * * *"
- cron: "10 0 * * *"
workflow_dispatch:

concurrency:
Expand All @@ -25,24 +25,34 @@ jobs:
list_workflows:
name: Summarize Recent Workflow Failures
runs-on: ubuntu-latest
permissions: write-all
steps:
- uses: actions/checkout@v4
- run: pip install pandas python-slugify
- run: pip install pandas python-slugify pyyaml tabulate
- name: Run Summarization Script
run: python scripts/tools/summarize_fail.py
env:
GH_TOKEN: ${{ github.token }}
- name: Update Docs
uses: test-room-7/action-update-file@v1
with:
file-path: docs/daily_pass_percentage.md
commit-msg: Update daily pass percentage
github-token: ${{ secrets.GITHUB_TOKEN }}
branch: daily_pass_percentage
- name: Upload Logs
uses: actions/upload-artifact@v3
with:
name: workflow-fail-summary
path: |
run_list.json
fail_run_list.json
all_run_list.json
recent_fails.csv
recent_fails_frequency.csv
failure_cause_summary.csv
workflow_fail_rate.csv
workflow_pass_rate.csv
workflow_pass_rate.sqlite3
recent_fails_logs
workflow_fail_rate
workflow_pass_rate
retention-days: 5

20 changes: 20 additions & 0 deletions scripts/tools/build_fail_defs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
CodeQL:
No space left on device:
short: Ran out of space
detail: Exception with signature "No space left on device"
Check that the disk containing the database directory has ample free space.:
short: Ran out of space
detail:
Fatal internal error with message indicating that disk space most
likely ran out
Build example:
Could not find a version that satisfies the requirement:
short: Requirements issue
detail: Unable to install a requirements in Python requirements.txt
No module named:
short: Missing module
detail: Expected module was missing
Full builds:
No space left on device:
short: Ran out of space
detail: Exception with signature "No space left on device
102 changes: 52 additions & 50 deletions scripts/tools/summarize_fail.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,31 @@
import datetime
import logging
import os
import sqlite3
import subprocess

import pandas as pd
import yaml
from slugify import slugify

error_catalog = {
"CodeQL": {
"No space left on device": {
"short": "Ran out of space",
"detail": "Exception with signature \"No space left on device\""
},
"Check that the disk containing the database directory has ample free space.": {
"short": "Ran out of space",
"detail": "Fatal internal error with message indicating that disk space most likely ran out"
}
},
"Build example": {
"Could not find a version that satisfies the requirement": {
"short": "Requirements issue",
"detail": "Unable to install a requirements in Python requirements.txt"
},
"No module named": {
"short": "Missing module",
"detail": "Expected module was missing"
}
},
"Full builds": {
"No space left on device": {
"short": "Ran out of space",
"detail": "Exception with signature \"No space left on device\""
}
}
}
yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')

with open("scripts/tools/build_fail_defs.yaml", "r") as fail_defs:
try:
error_catalog = yaml.safe_load(fail_defs)
except Exception:
logging.exception("Could not load fail definition file.")


def pass_fail_rate(workflow):
logging.info(f"Checking recent pass/fail rate of workflow {workflow}.")
workflow_fail_rate_output_path = f"workflow_pass_rate/{slugify(workflow)}"
if not os.path.exists(workflow_fail_rate_output_path):
os.makedirs(workflow_fail_rate_output_path)
subprocess.run(
f"gh run list -R project-chip/connectedhomeip -b master -w '{workflow}' -L 500 --created {yesterday} --json conclusion > {workflow_fail_rate_output_path}/run_list.json", shell=True)
else:
logging.info("This workflow has already been processed.")


def process_fail(id, pr, start_time, workflow):
Expand All @@ -57,28 +50,20 @@ def process_fail(id, pr, start_time, workflow):
root_cause = error_catalog[workflow_category][error_message]["short"]
break

logging.info(f"Checking recent pass/fail rate of workflow {workflow}.")
workflow_fail_rate_output_path = f"workflow_pass_rate/{slugify(workflow)}"
if not os.path.exists(workflow_fail_rate_output_path):
os.makedirs(workflow_fail_rate_output_path)
subprocess.run(
f"gh run list -R project-chip/connectedhomeip -b master -w '{workflow}' --json conclusion > {workflow_fail_rate_output_path}/run_list.json", shell=True)
else:
logging.info("This workflow has already been processed.")

return [pr, workflow, root_cause]


def main():
logging.info("Gathering recent fails information into run_list.json.")
subprocess.run("gh run list -R project-chip/connectedhomeip -b master -s failure --json databaseId,displayTitle,startedAt,workflowName > run_list.json", shell=True)
logging.info("Gathering recent fails information into fail_run_list.json.")
subprocess.run(
f"gh run list -R project-chip/connectedhomeip -b master -s failure -L 500 --created {yesterday} --json databaseId,displayTitle,startedAt,workflowName > fail_run_list.json", shell=True)

logging.info("Reading run_list.json into a DataFrame.")
df = pd.read_json("run_list.json")
logging.info("Reading fail_run_list.json into a DataFrame.")
df = pd.read_json("fail_run_list.json")

logging.info("Listing recent fails.")
logging.info(f"Listing recent fails from {yesterday}.")
df.columns = ["ID", "Pull Request", "Start Time", "Workflow"]
print("Recent Fails:")
print(f"Recent Fails From {yesterday}:")
print(df.to_string(columns=["Pull Request", "Workflow"], index=False))
print()
df.to_csv("recent_fails.csv", index=False)
Expand All @@ -91,6 +76,17 @@ def main():
print()
frequency.to_csv("recent_workflow_fails_frequency.csv")

logging.info("Gathering recent runs information into all_run_list.json.")
subprocess.run(
f"gh run list -R project-chip/connectedhomeip -b master -L 5000 --created {yesterday} --json workflowName > all_run_list.json", shell=True)

logging.info("Reading all_run_list.json into a DataFrame.")
all_df = pd.read_json("all_run_list.json")

logging.info("Gathering pass/fail rate info.")
all_df.columns = ["Workflow"]
all_df.apply(lambda row: pass_fail_rate(row["Workflow"]), axis=1)

logging.info("Conducting fail information parsing.")
root_causes = df.apply(lambda row: process_fail(row["ID"], row["Pull Request"],
row["Start Time"], row["Workflow"]), axis=1, result_type="expand")
Expand All @@ -100,19 +96,25 @@ def main():
print()
root_causes.to_csv("failure_cause_summary.csv")

logging.info("Listing percent fail rate of recent fails by workflow.")
fail_rate = {}
logging.info("Listing percent pass rate of recent fails by workflow.")
pass_rate = {}
for workflow in next(os.walk("workflow_pass_rate"))[1]:
try:
info = pd.read_json(f"workflow_pass_rate/{workflow}/run_list.json")
info = info[info["conclusion"].str.len() > 0]
fail_rate[workflow] = [info.value_counts(normalize=True).mul(100).round()["failure"]]
except Exception:
logging.exception(f"Recent runs info for {workflow} was not collected.")
fail_rate = pd.DataFrame.from_dict(fail_rate, 'index', columns=["Fail Rate"])
print("Recent Fail Rate of Each Workflow:")
print(fail_rate.to_string())
fail_rate.to_csv("workflow_fail_rate.csv")
try:
pass_rate[workflow] = [info.value_counts(normalize=True).mul(100).round()["success"]]
except Exception:
pass_rate[workflow] = [0.0]
pass_rate = pd.DataFrame.from_dict(pass_rate, 'index', columns=["Pass Rate"]).sort_values("Pass Rate")
print("Recent Pass Rate of Each Workflow:")
pass_rate.to_markdown("docs/daily_pass_percentage.md")
pass_rate_sql = sqlite3.connect("workflow_pass_rate.sqlite3")
pass_rate.to_sql("workflow_pass_rate", pass_rate_sql, if_exists="replace")
print(pass_rate.to_string())
pass_rate.to_csv("workflow_pass_rate.csv")


if __name__ == "__main__":
Expand Down

0 comments on commit c1f7cfc

Please sign in to comment.