Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New DB schema for GitHub metrics script #23606

Merged
merged 3 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 59 additions & 31 deletions .github/scripts/collect_github_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,49 @@
import logging
import psycopg2
import dateutil
import argparse

def init_logger():
LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
logging.basicConfig(level=LOGLEVEL,
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%m-%d-%Y %H:%M:%S')

def make_parser():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--repository-name', type=str, required=True,
help='Repository name in OWNER/REPOSITORY format')
parser.add_argument('--run-id', type=str, required=True,
help='Workflow Run ID')

return parser

def create_db_tables(conn, cur):
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_runs_test(
id SERIAL,
run_id BIGINT PRIMARY KEY,
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_runs(
id SERIAL PRIMARY KEY,
run_id BIGINT,
html_url TEXT,
name VARCHAR(255),
run_started_at TIMESTAMP,
created_at TIMESTAMP,
updated_at TIMESTAMP,
triggering_actor_login VARCHAR(255),
conclusion VARCHAR(25),
run_number INT,
event VARCHAR(50),
run_attempt INT,
repository_full_name VARCHAR(255),
head_repository_full_name VARCHAR(255),
head_branch VARCHAR(255),
status VARCHAR(25),
display_title TEXT,
path TEXT
path TEXT,
total_duration_seconds INT
ababushk marked this conversation as resolved.
Show resolved Hide resolved
);
''')
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_jobs_test(
id SERIAL,
job_id BIGINT PRIMARY KEY,
parent_run_id BIGINT REFERENCES github_workflow_runs_test(run_id),
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_jobs(
id SERIAL PRIMARY KEY,
job_id BIGINT,
parent_run_id BIGINT,
html_url TEXT,
name VARCHAR(255),
created_at TIMESTAMP,
Expand All @@ -47,12 +59,14 @@ def create_db_tables(conn, cur):
runner_name VARCHAR(255),
status VARCHAR(25),
conclusion VARCHAR(25),
head_branch VARCHAR(255)
head_branch VARCHAR(255),
run_attempt INT,
workflow_name TEXT
);
''')
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_steps_test(
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_steps(
id SERIAL PRIMARY KEY,
parent_job_id BIGINT REFERENCES github_workflow_jobs_test(job_id),
parent_job_id BIGINT,
name VARCHAR(255),
conclusion VARCHAR(25),
number INT,
Expand All @@ -65,20 +79,16 @@ def create_db_tables(conn, cur):

def main():
init_logger()

parser = make_parser()
args = parser.parse_args()
logger = logging.getLogger(__name__)

github_token = os.environ.get('GITHUB_TOKEN')
if not github_token:
raise ValueError('GITHUB_TOKEN environment variable is not set!')

run_id = os.environ.get('RUN_ID')
if not run_id:
raise ValueError('RUN_ID environment variable is not set!')

repo_name = os.environ.get('GITHUB_REPOSITORY')
if not repo_name:
raise ValueError('GITHUB_REPOSITORY environment variable is not set!')
run_id = args.run_id
repo_name = args.repository_name


# this should be specified in runner's env
Expand All @@ -102,18 +112,30 @@ def main():
repo = g.get_repo(repo_name)

run = repo.get_workflow_run(int(run_id))

workflow_data_query = f'''INSERT INTO github_workflow_runs_test(
if run.status != 'completed':
logger.error('Run %s is not completed! Only completed runs should be in the database', run_id)
raise SystemExit(1)

# We rely on the following assumptions:
# - The workflow run is completed. When run.status != 'completed' we should not add it to the database
# theoretically the second attempt can be triggerred right after the completion of the first one
# or while the runner which executes this script is deploying
#
# - Job's queued duration equals "job.started_at - job.created_at" if started_at > created_at.
# Otherwise the job should not be added to the database
total_duration_seconds = round(run.timing().run_duration_ms / 1000)
workflow_data_query = f'''INSERT INTO workflow_runs(
run_id, html_url, name,
run_started_at, triggering_actor_login, conclusion,
run_number, event, run_attempt, repository_full_name,
head_branch, display_title, path)
run_started_at, created_at, updated_at, triggering_actor_login, conclusion,
event, run_attempt, repository_full_name,
head_branch, display_title, path, total_duration_seconds)
VALUES(
'{run_id}', '{run.html_url}', '{run.name}', '{run.run_started_at}',
'{run.created_at}', '{run.updated_at}',
'{run.raw_data['triggering_actor']['login']}',
'{run.conclusion}', '{run.run_number}', '{run.event}',
'{run.conclusion}', '{run.event}',
'{run.run_attempt}', '{run.raw_data['repository']['full_name']}',
'{run.head_branch}', '{run.display_title}', '{run.path}'
'{run.head_branch}', '{run.display_title}', '{run.path}', '{total_duration_seconds}'
);
'''

Expand All @@ -126,6 +148,10 @@ def main():
duration_seconds = 0

job_created_at_date = dateutil.parser.parse(job.raw_data['created_at'])
if job_created_at_date > job.started_at:
logger.warning('Skipping job %s of run %s - most likely a stub \
job created after workflow restart', job.name, run_id)
continue

queued_duration_timedelta = job.started_at - job_created_at_date
queued_duration_seconds = round(queued_duration_timedelta.total_seconds())
Expand All @@ -134,17 +160,19 @@ def main():
duration_seconds = round(duration_timedelta.total_seconds())

job_data_query = f'''
INSERT INTO github_workflow_jobs_test(
INSERT INTO workflow_jobs(
job_id, parent_run_id, html_url, name,
created_at, started_at, completed_at,
queued_duration_seconds, duration_seconds,
runner_name, status, conclusion, head_branch)
runner_name, status, conclusion, head_branch,
run_attempt, workflow_name
)
VALUES(
'{job_id}', '{run_id}', '{job.html_url}', '{job.name}',
'{job.raw_data['created_at']}', '{job.started_at}', '{job.completed_at}',
'{queued_duration_seconds}', '{duration_seconds}',
'{job.raw_data['runner_name']}', '{job.status}', '{job.conclusion}',
'{job.raw_data['head_branch']}'
'{job.raw_data['head_branch']}', '{job.raw_data['run_attempt']}', '{job.raw_data['workflow_name']}'
);
'''
logger.debug('Job query: %s', job_data_query)
Expand All @@ -154,7 +182,7 @@ def main():
duration_seconds = round(duration_seconds_timedelta.total_seconds())

step_data_query = f'''
INSERT INTO github_workflow_steps_test(
INSERT INTO workflow_steps(
parent_job_id, name, conclusion,
number, started_at, completed_at,
duration_seconds)
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/send_workflows_to_opentelemetry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ jobs:
- name: Install deps
run: |
pip3 install -r .github/scripts/requirements.txt
# dependency review action has these as an exception
# yet it still complains, so install them here
pip3 install PyGithub==2.2.0 psycopg2-binary==2.9.9

- name: Send metrics to SQL database
Expand All @@ -58,6 +60,9 @@ jobs:
PGHOST: ${{ secrets.METRICS_DATABASE_HOST }}
PGUSER: ${{ secrets.METRICS_DATABASE_USERNAME }}
PGPASSWORD: ${{ secrets.METRICS_DATABASE_PASSWORD }}
PGDATABASE: ${{ secrets.METRICS_DATABASE_NAME }}
PGPORT: 5432
run: |
python3 .github/scripts/collect_github_metrics.py
python3 .github/scripts/collect_github_metrics.py \
--run-id ${{ github.event.workflow_run.id }} \
--repository-name ${GITHUB_REPOSITORY}
Loading