diff --git a/.github/scripts/collect_github_metrics.py b/.github/scripts/collect_github_metrics.py
index d933fa0f927987..275adefe911cda 100644
--- a/.github/scripts/collect_github_metrics.py
+++ b/.github/scripts/collect_github_metrics.py
@@ -6,6 +6,7 @@
 import logging
 import psycopg2
 import dateutil
+import argparse
 
 def init_logger():
     LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
@@ -13,16 +14,26 @@ def init_logger():
                         format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                         datefmt='%m-%d-%Y %H:%M:%S')
 
+def make_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repository-name', type=str, required=True,
+                        help='Repository name in OWNER/REPOSITORY format')
+    parser.add_argument('--run-id', type=str, required=True,
+                        help='Workflow Run ID')
+
+    return parser
+
 def create_db_tables(conn, cur):
-    cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_runs_test(
-    id SERIAL,
-    run_id BIGINT PRIMARY KEY,
+    cur.execute('''CREATE TABLE IF NOT EXISTS workflow_runs(
+    id SERIAL PRIMARY KEY,
+    run_id BIGINT,
     html_url TEXT,
     name VARCHAR(255),
     run_started_at TIMESTAMP,
+    created_at TIMESTAMP,
+    updated_at TIMESTAMP,
     triggering_actor_login VARCHAR(255),
     conclusion VARCHAR(25),
-    run_number INT,
     event VARCHAR(50),
     run_attempt INT,
     repository_full_name VARCHAR(255),
@@ -30,13 +41,14 @@ def create_db_tables(conn, cur):
     head_branch VARCHAR(255),
     status VARCHAR(25),
     display_title TEXT,
-    path TEXT
+    path TEXT,
+    total_duration_seconds INT
    );
    ''')
-    cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_jobs_test(
-    id SERIAL,
-    job_id BIGINT PRIMARY KEY,
-    parent_run_id BIGINT REFERENCES github_workflow_runs_test(run_id),
+    cur.execute('''CREATE TABLE IF NOT EXISTS workflow_jobs(
+    id SERIAL PRIMARY KEY,
+    job_id BIGINT,
+    parent_run_id BIGINT,
     html_url TEXT,
     name VARCHAR(255),
     created_at TIMESTAMP,
@@ -47,12 +59,14 @@ def create_db_tables(conn, cur):
     runner_name VARCHAR(255),
     status VARCHAR(25),
     conclusion VARCHAR(25),
-    head_branch VARCHAR(255)
+    head_branch VARCHAR(255),
+    run_attempt INT,
+    workflow_name TEXT
    );
    ''')
-    cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_steps_test(
+    cur.execute('''CREATE TABLE IF NOT EXISTS workflow_steps(
     id SERIAL PRIMARY KEY,
-    parent_job_id BIGINT REFERENCES github_workflow_jobs_test(job_id),
+    parent_job_id BIGINT,
     name VARCHAR(255),
     conclusion VARCHAR(25),
     number INT,
@@ -65,20 +79,16 @@ def create_db_tables(conn, cur):
 
 def main():
     init_logger()
-
+    parser = make_parser()
+    args = parser.parse_args()
     logger = logging.getLogger(__name__)
 
     github_token = os.environ.get('GITHUB_TOKEN')
     if not github_token:
         raise ValueError('GITHUB_TOKEN environment variable is not set!')
 
-    run_id = os.environ.get('RUN_ID')
-    if not run_id:
-        raise ValueError('RUN_ID environment variable is not set!')
-
-    repo_name = os.environ.get('GITHUB_REPOSITORY')
-    if not repo_name:
-        raise ValueError('GITHUB_REPOSITORY environment variable is not set!')
+    run_id = args.run_id
+    repo_name = args.repository_name
 
     # this should be specified in runner's env
@@ -102,18 +112,30 @@ def main():
     repo = g.get_repo(repo_name)
     run = repo.get_workflow_run(int(run_id))
-
-    workflow_data_query = f'''INSERT INTO github_workflow_runs_test(
+    if run.status != 'completed':
+        logger.error('Run %s is not completed! Only completed runs should be in the database', run_id)
+        raise SystemExit(1)
+
+    # We rely on the following assumptions:
+    # - The workflow run is completed. When run.status != 'completed' we should not add it to the database
+    # theoretically, the second attempt can be triggered right after the completion of the first one,
+    # or while the runner that executes this script is still deploying
+    #
+    # - Job's queued duration equals "job.started_at - job.created_at" if started_at > created_at.
+    # Otherwise the job should not be added to the database
+    total_duration_seconds = round(run.timing().run_duration_ms / 1000)
+    workflow_data_query = f'''INSERT INTO workflow_runs(
     run_id, html_url, name,
-    run_started_at, triggering_actor_login, conclusion,
-    run_number, event, run_attempt, repository_full_name,
-    head_branch, display_title, path)
+    run_started_at, created_at, updated_at, triggering_actor_login, conclusion,
+    event, run_attempt, repository_full_name,
+    head_branch, display_title, path, total_duration_seconds)
    VALUES(
    '{run_id}', '{run.html_url}', '{run.name}', '{run.run_started_at}',
+    '{run.created_at}', '{run.updated_at}',
    '{run.raw_data['triggering_actor']['login']}',
-    '{run.conclusion}', '{run.run_number}', '{run.event}',
+    '{run.conclusion}', '{run.event}',
    '{run.run_attempt}', '{run.raw_data['repository']['full_name']}',
-    '{run.head_branch}', '{run.display_title}', '{run.path}'
+    '{run.head_branch}', '{run.display_title}', '{run.path}', '{total_duration_seconds}'
    );
    '''
@@ -126,6 +148,10 @@ def main():
         duration_seconds = 0
 
         job_created_at_date = dateutil.parser.parse(job.raw_data['created_at'])
+        if job_created_at_date > job.started_at:
+            logger.warning('Skipping job %s of run %s - most likely a stub '
+                           'job created after workflow restart', job.name, run_id)
+            continue
 
         queued_duration_timedelta = job.started_at - job_created_at_date
         queued_duration_seconds = round(queued_duration_timedelta.total_seconds())
@@ -134,17 +160,19 @@ def main():
         duration_seconds = round(duration_timedelta.total_seconds())
 
         job_data_query = f'''
-        INSERT INTO github_workflow_jobs_test(
+        INSERT INTO workflow_jobs(
         job_id, parent_run_id, html_url, name,
         created_at, started_at, completed_at,
         queued_duration_seconds, duration_seconds,
-        runner_name, status, conclusion, head_branch)
+        runner_name, status, conclusion, head_branch,
+        run_attempt, workflow_name
+        )
        VALUES(
        '{job_id}', '{run_id}', '{job.html_url}', '{job.name}',
        '{job.raw_data['created_at']}', '{job.started_at}', '{job.completed_at}',
        '{queued_duration_seconds}', '{duration_seconds}',
        '{job.raw_data['runner_name']}', '{job.status}', '{job.conclusion}',
-        '{job.raw_data['head_branch']}'
+        '{job.raw_data['head_branch']}', '{job.raw_data['run_attempt']}', '{job.raw_data['workflow_name']}'
        );
        '''
         logger.debug('Job query: %s', job_data_query)
@@ -154,7 +182,7 @@ def main():
             duration_seconds = round(duration_seconds_timedelta.total_seconds())
 
             step_data_query = f'''
-            INSERT INTO github_workflow_steps_test(
+            INSERT INTO workflow_steps(
             parent_job_id, name, conclusion,
             number, started_at, completed_at,
             duration_seconds)
diff --git a/.github/workflows/send_workflows_to_opentelemetry.yml b/.github/workflows/send_workflows_to_opentelemetry.yml
index 59146bda517182..fe8a3778c34f73 100644
--- a/.github/workflows/send_workflows_to_opentelemetry.yml
+++ b/.github/workflows/send_workflows_to_opentelemetry.yml
@@ -49,6 +49,8 @@ jobs:
       - name: Install deps
         run: |
           pip3 install -r .github/scripts/requirements.txt
+          # the dependency review action has these as an exception,
+          # yet it still complains, so install them here
           pip3 install PyGithub==2.2.0 psycopg2-binary==2.9.9
 
       - name: Send metrics to SQL database
@@ -58,6 +60,9 @@ jobs:
           PGHOST: ${{ secrets.METRICS_DATABASE_HOST }}
           PGUSER: ${{ secrets.METRICS_DATABASE_USERNAME }}
           PGPASSWORD: ${{ secrets.METRICS_DATABASE_PASSWORD }}
+          PGDATABASE: ${{ secrets.METRICS_DATABASE_NAME }}
           PGPORT: 5432
         run: |
-          python3 .github/scripts/collect_github_metrics.py
+          python3 .github/scripts/collect_github_metrics.py \
+          --run-id ${{ github.event.workflow_run.id }} \
+          --repository-name ${GITHUB_REPOSITORY}
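Reviewer note: with the switch from RUN_ID/GITHUB_REPOSITORY environment variables
to argparse flags, the script can also be exercised by hand against any completed
run. A hypothetical invocation (OWNER/REPOSITORY and 1234567890 are placeholders;
GITHUB_TOKEN and the PG* variables must already be set in the environment):

    python3 .github/scripts/collect_github_metrics.py \
        --repository-name OWNER/REPOSITORY --run-id 1234567890

A possible follow-up, outside the scope of this change: the INSERT statements
interpolate values straight into f-strings, so a display_title, job name, or
branch name containing a single quote would break the generated SQL. psycopg2's
parameter binding avoids that. A minimal sketch of the workflow_runs INSERT in
that style, assuming the cur, run, run_id, and total_duration_seconds already
computed in main():

    # Each %s placeholder is filled in by psycopg2 with proper quoting,
    # so values containing quotes cannot break or alter the statement.
    workflow_data_query = '''INSERT INTO workflow_runs(
        run_id, html_url, name, run_started_at, created_at, updated_at,
        triggering_actor_login, conclusion, event, run_attempt,
        repository_full_name, head_branch, display_title, path,
        total_duration_seconds)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    cur.execute(workflow_data_query, (
        run_id, run.html_url, run.name, run.run_started_at,
        run.created_at, run.updated_at,
        run.raw_data['triggering_actor']['login'],
        run.conclusion, run.event, run.run_attempt,
        run.raw_data['repository']['full_name'],
        run.head_branch, run.display_title, run.path,
        total_duration_seconds))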