Skip to content

Commit

Permalink
Add submission code for interactive analyses to write research code
Browse files Browse the repository at this point in the history
  • Loading branch information
ghickman committed Feb 10, 2023
1 parent 7ec0743 commit ffd975f
Show file tree
Hide file tree
Showing 16 changed files with 835 additions and 70 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ jobs:
- name: Run unit tests on docker dev image
env:
GITHUB_TOKEN_TESTING: ${{ secrets.OPENSAFELY_GITHUB_TESTING_ORG_PAT }}
GITHUB_WRITEABLE_TOKEN:
INTERACTIVE_TEMPLATE_REPO:
run: |
# build docker and run test
just docker-test
Expand Down
1 change: 1 addition & 0 deletions docker/dependencies.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# list ubuntu packages needed in production, one per line
git
postgresql-client
python3.11
python3.11-distutils
Expand Down
4 changes: 3 additions & 1 deletion docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ services:
depends_on: [db]
environment:
- GITHUB_TOKEN=
- GITHUB_WRITEABLE_TOKEN=
- INTERACTIVE_TEMPLATE_REPO=
- SECRET_KEY=12345
- SOCIAL_AUTH_GITHUB_KEY=test
- SOCIAL_AUTH_GITHUB_SECRET=test
Expand Down Expand Up @@ -85,7 +87,7 @@ services:
environment:
# override db hostname, so we can reach it within the container
- GITHUB_TOKEN_TESTING
command: bash -c "coverage run --branch --source=applications,jobserver,services,staff,tests --module pytest && coverage report || coverage html"
command: bash -c "coverage run --branch --source=applications,interactive,jobserver,services,staff,tests --module pytest && coverage report || coverage html"

volumes:
postgres_data:
Expand Down
2 changes: 1 addition & 1 deletion docker/justfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ build env="dev":

# run tests in dev container
test *args="": build
docker-compose run --rm test bash -c "coverage run --branch --source=applications,jobserver,services,staff,tests --module pytest && coverage report || coverage html"
docker-compose run --rm test bash -c "coverage run --branch --source=applications,interactive,jobserver,services,staff,tests --module pytest && coverage report || coverage html"


# run server in dev|prod container
Expand Down
6 changes: 6 additions & 0 deletions dotenv-sample
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ SECRET_KEY=12345
# A GitHub API token, we use a PAT for this
GITHUB_TOKEN=dummy

# A GitHub API token, we use a PAT from our writeable user for this
GITHUB_WRITEABLE_TOKEN=dummy

# The template repo used to seed interactive analytical code repos
INTERACTIVE_TEMPLATE_REPO=dummy

# Get these from a GitHub Org admin, you want the Dev application credentials
SOCIAL_AUTH_GITHUB_KEY=dummy
SOCIAL_AUTH_GITHUB_SECRET=dummy
Expand Down
35 changes: 35 additions & 0 deletions interactive/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from attrs import define


@define
class Codelist:
    """A codelist chosen for an analysis, identified on OpenCodelists by slug."""

    label: str  # human-readable name for display
    slug: str  # OpenCodelists identifier, used to download the codelist CSV
    type: str  # noqa: A003

    # TODO: what are these again?
    path: str | None = None
    description: str | None = None


@define
class Analysis:
    """
    Encapsulate all the data for an analysis to pass between layers

    We expect to need a different object of this sort for each different analysis,
    to capture all the details for a given analysis.
    """

    codelist_1: Codelist
    codelist_2: Codelist | None  # a second codelist is optional
    # NOTE(review): used as both name and email of the git commit author
    # ("created_by <created_by>"), so presumably the user's email — confirm
    created_by: str
    demographics: str
    filter_population: str
    frequency: str
    repo: str  # URL of the git repo the generated analysis code is pushed to
    time_event: str
    time_scale: str
    time_value: str
    title: str
    # filled in after construction, once the AnalysisRequest PK is known;
    # also used as the git tag name for the generated commit
    identifier: str | None = None
7 changes: 7 additions & 0 deletions interactive/opencodelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ def _iter_codelists(self, codelists):
"organisation": codelist["organisation"],
}

def get_codelist(self, slug):
url = furl("https://www.opencodelists.org") / "codelist" / slug / "download.csv"
r = requests.get(url)
r.raise_for_status()

return r.text

def get_codelists(self, coding_system):
path_segments = [
"codelist",
Expand Down
240 changes: 240 additions & 0 deletions interactive/submit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

from attrs import asdict
from django.conf import settings
from django.db import transaction

from .models import AnalysisRequest
from .opencodelists import _get_opencodelists_api


# from jobserver.github import create_issue
# from .emails import send_analysis_request_confirmation_email
# from .slacks import notify_analysis_request_submitted


@transaction.atomic()
def submit_analysis(
    *,
    analysis,
    backend,
    creator,
    project,
    get_opencodelists_api=_get_opencodelists_api,
    force=False,
):
    """
    Create all the parts needed for an analysis

    This will stay in job-server while create_commit() is intended to move to
    an external service in the future.

    analysis: an Analysis instance with the form data for this request
    backend: the Backend the resulting JobRequest should run on
    creator: the User submitting the analysis
    project: the Project whose interactive workspace the JobRequest is created in
    get_opencodelists_api: factory for the OpenCodelists API client (injectable
        for testing)
    force: passed through to create_commit() to skip the duplicate-commit check

    Returns the saved AnalysisRequest.  Runs inside a transaction so a failure
    in any step rolls back all the database writes.
    """
    # TODO: wrap form kwargs up in a dataclass?

    # create an AnalysisRequest instance so we have a PK to use in various
    # places, but we don't save it until we've written the commit and pushed
    # it, so we can create the JobRequest this object needs
    # NOTE(review): this relies on the PK being available before save()
    # (e.g. a client-generated UUID/ULID default) — confirm on the model
    analysis_request = AnalysisRequest(
        project=project,
        created_by=creator,
        title=analysis.title,
        template_data=asdict(analysis),
    )

    # update the Analysis structure so we can pass around a single object
    # if/when we pull the create_commit function out into another service
    # this structure would be the JSON we send over
    analysis.identifier = analysis_request.pk

    sha, project_yaml = create_commit(
        analysis,
        force=force,
        get_opencodelists_api=get_opencodelists_api,
        template_repo=settings.INTERACTIVE_TEMPLATE_REPO,
    )

    job_request = project.interactive_workspace.job_requests.create(
        backend=backend,
        created_by=creator,
        sha=sha,
        project_definition=project_yaml,
        force_run_dependencies=True,
        requested_actions=["run_all"],
    )

    analysis_request.job_request = job_request
    analysis_request.save()

    # TODO: notify someone about output checking?
    # waiting to find out what this should be
    # issue_url = create_issue(analysis_request.pk, job_server_url=url)
    # notify_analysis_request_submitted(analysis_request, issue_url)

    # TODO: notify the user?
    # send_analysis_request_confirmation_email(
    #     analysis_request.user.email, analysis_request
    # )
    return analysis_request


# MOVE TO EXTERNAL SERVICE


def clean_working_tree(path):
    """
    Remove all files and directories under *path*, except the .git directory.

    Iterates only the top-level entries: removing a whole directory tree with
    shutil.rmtree makes a recursive glob unnecessary, and avoids the fragility
    of deleting directories while a lazy glob may still be descending into
    them.  (Also drops a leftover debug print from the original.)
    """
    for entry in path.iterdir():
        # preserve the repository metadata so we can commit the new contents
        if entry.name == ".git":
            continue

        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()


def commit_and_push(working_dir, analysis, force=False):
    """
    Commit the contents of *working_dir*, tag it with the analysis identifier,
    and push both the branch and the tag to origin.

    Returns the SHA of the new commit.
    """
    maybe_force = ["--force"] if force else []

    git("add", ".", cwd=working_dir)

    second = ""
    if analysis.codelist_2:
        second = f" and codelist {analysis.codelist_2.slug}"
    msg = f"Codelist {analysis.codelist_1.slug}{second} ({analysis.identifier})"

    git(
        # passing identity via -c avoids having to maintain stateful git config
        "-c",
        "[email protected]",
        "-c",
        "user.name=OpenSAFELY Interactive",
        "commit",
        "--author",
        f"{analysis.created_by} <{analysis.created_by}>",
        "-m",
        msg,
        cwd=working_dir,
    )

    rev_parse = git("rev-parse", "HEAD", capture_output=True, cwd=working_dir)
    commit_sha = rev_parse.stdout.strip()

    # a super important step: tagging each analysis makes its commit much
    # easier to track down later
    git("tag", analysis.identifier, *maybe_force, cwd=working_dir)

    # push to main. Note: we technically wouldn't need this from a pure git
    # pov, as a tag would be enough, but job-runner explicitly checks that
    # a commit is on the branch history, for security reasons
    git("push", "origin", "main", "--force-with-lease", cwd=working_dir)

    # push the tag once we know the main push has succeeded
    git("push", "origin", analysis.identifier, *maybe_force, cwd=working_dir)

    return commit_sha


def create_commit(
    analysis,
    template_repo,
    force=False,
    get_opencodelists_api=_get_opencodelists_api,
):
    """
    Build, commit, and push the generated code for *analysis*.

    Clones *template_repo*, downloads the analysis codelist(s) into it, then
    replaces the contents of the analysis repo with the result and commits,
    tags (with analysis.identifier), and pushes it.

    force: skip the duplicate-commit check and allow overwriting an existing
        tag for this identifier.

    Returns (sha, project_yaml): the new commit SHA and the contents of the
    repo's project.yaml.
    """
    if not force:
        # check this commit does not already exist
        raise_if_commit_exists(analysis.repo, analysis.identifier)

    # 1. create tempdir with the analysis identifier as suffix
    suffix = f"template-{analysis.identifier}"
    with tempfile.TemporaryDirectory(suffix=suffix) as working_dir:
        working_dir = Path(working_dir)

        # 2. clone the given analysis code template repo to tempdir
        git("clone", "--depth", "1", template_repo, working_dir)

        # 3. remove the git directory so these files can join another repo
        shutil.rmtree(working_dir / ".git")

        # 4. download codelist 1
        download_codelist(
            analysis.codelist_1.slug,
            working_dir / "codelist_1.csv",
            get_opencodelists_api=get_opencodelists_api,
        )

        # 5. optionally download codelist 2
        if analysis.codelist_2:
            download_codelist(
                analysis.codelist_2.slug,
                working_dir / "codelist_2.csv",
                get_opencodelists_api=get_opencodelists_api,
            )

        # 6. interpolate form data into the template files on disk
        # render_interactive_report_code(working_dir, analysis)

        suffix = f"repo-{analysis.identifier}"
        with tempfile.TemporaryDirectory(suffix=suffix) as repo_dir:
            repo_dir = Path(repo_dir)

            # 7. clone the given interactive repo
            git("clone", "--depth", "1", analysis.repo, repo_dir)

            # 8. clear working directory because each analysis is a fresh set
            # of files
            clean_working_tree(repo_dir)

            # 9. move templated files into the repo dir
            for path in working_dir.iterdir():
                shutil.move(path, repo_dir)

            # 10. write a commit to the given interactive repo
            # (fix: forward force= so a forced re-run can also overwrite the
            # existing tag, instead of failing in commit_and_push)
            sha = commit_and_push(repo_dir, analysis, force=force)

            # 11. return contents of project.yaml (from disk) and sha
            project_yaml = (repo_dir / "project.yaml").read_text()

    return sha, project_yaml


def download_codelist(slug, path, get_opencodelists_api=_get_opencodelists_api):
    """Download the contents of the codelist *slug* and write them to *path*."""
    api = get_opencodelists_api()
    path.write_text(api.get_codelist(slug))


def git(*args, check=True, text=True, **kwargs):
    """
    Wrapper around subprocess.run for git commands.

    Changes the defaults: check=True and text=True, and prints the command run
    (with the auth token redacted) to stderr for logging.
    """
    cmd = ["git"] + [str(arg) for arg in args]

    cwd = kwargs.get("cwd", os.getcwd())

    # fix: only redact when a token is actually configured.  With an empty
    # token, str.replace("", "*****") interleaves the mask between every
    # character of every argument, garbling the logged command line.
    token = settings.GITHUB_WRITEABLE_TOKEN
    cleaned = [arg.replace(token, "*****") if token else arg for arg in cmd]
    sys.stderr.write(f"{' '.join(cleaned)} (in {cwd})\n")

    # disable reading the user's gitconfig, to give us a more expected environment
    # when developing and testing locally.
    # NOTE(review): "1" only works because git silently ignores a missing
    # config file; pointing at os.devnull would state the intent explicitly —
    # confirm before changing.  Also note env= here drops the inherited
    # environment (PATH, HOME, SSH agent vars) entirely — verify push auth
    # does not depend on it.
    env = {"GIT_CONFIG_GLOBAL": "1"}

    return subprocess.run(cmd, check=check, text=text, env=env, **kwargs)


def raise_if_commit_exists(repo, tag):
    """Raise if *repo* already has a tag named *tag* (i.e. a previous commit)."""
    ps = git(
        "ls-remote",
        "--tags",
        repo,
        f"refs/tags/{tag}",
        capture_output=True,
    )

    # ls-remote prints nothing when the ref does not exist
    if ps.stdout:
        raise Exception(f"Commit for {tag} already exists in {repo}")
Loading

0 comments on commit ffd975f

Please sign in to comment.