Make nextstrain build scripts consistent #868
Changes from all commits: 7e89d6a, ec570a5, b0b4699, c385152, a4fc312, 708d8b5
---
@@ -39,6 +39,8 @@ RUN mkdir /ncov && \
    git remote add origin https://github.com/nextstrain/ncov.git && \
    git fetch origin master && \
    git reset --hard FETCH_HEAD
+RUN mkdir -p /ncov/auspice
+RUN mkdir -p /ncov/logs

Comment: Make sure these directories always exist when we do a phylo run, so we don't get failures when trying to copy debug logs to S3.

ADD aspen/workflows/nextstrain_run/patches/crowding_penalty.patch /tmp/
RUN patch /ncov/workflow/snakemake_rules/main_workflow.smk < /tmp/crowding_penalty.patch
---
@@ -14,13 +14,20 @@ df 1>&2
cat /proc/meminfo 1>&2

start_time=$(date +%s)
build_date=$(date +%Y%m%d)

aws configure set region $AWS_REGION

+if [ ! -z "${BOTO_ENDPOINT_URL}" ]; then
+    export aws="aws --endpoint-url ${BOTO_ENDPOINT_URL}"
+else
+    export aws="aws"
+fi

Comment: This makes it so we can run this script in local-dev (which needs the --endpoint-url flag) more easily.
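As an illustration (not part of the diff; the endpoint URL below is a made-up example), this is what the $aws indirection does when the script is pointed at a local S3-compatible endpoint:

    # Hypothetical local-dev invocation; the URL is illustrative only.
    export BOTO_ENDPOINT_URL="http://localhost:4566"
    # After the if/else above, $aws expands to: aws --endpoint-url http://localhost:4566
    # so a call like
    $aws s3 ls "s3://genepi-db-data/"
    # actually runs: aws --endpoint-url http://localhost:4566 s3 ls s3://genepi-db-data/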
# fetch aspen config
-genepi_config="$(aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
+genepi_config="$($aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"

Comment: We're using the $aws variable we set above, with the optional --endpoint-url flag, to run aws cli commands.

aspen_s3_db_bucket="$(jq -r .S3_db_bucket <<< "$genepi_config")"
+key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}"
+s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}"

Comment: Set our prefixes once so we can't accidentally get them wrong later in this script.
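For illustration only (the bucket, filestem, and workflow id are placeholders), this prefix means every artifact of a run lands under a single per-run folder:

    s3://<aspen_s3_db_bucket>/phylo_run/<S3_FILESTEM>/<WORKFLOW_ID>/builds.yaml
    s3://<aspen_s3_db_bucket>/phylo_run/<S3_FILESTEM>/<WORKFLOW_ID>/ncov_aspen.json
    s3://<aspen_s3_db_bucket>/phylo_run/<S3_FILESTEM>/<WORKFLOW_ID>/logs/snakemake/...
    s3://<aspen_s3_db_bucket>/phylo_run/<S3_FILESTEM>/<WORKFLOW_ID>/logs/ncov/...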
# set up ncov
mkdir -p /ncov/my_profiles/aspen /ncov/results
@@ -39,8 +46,9 @@ aligned_gisaid_location=$(
    --builds-file /ncov/my_profiles/aspen/builds.yaml \
)

# Persist the build config we generated.
-aws s3 cp /ncov/my_profiles/aspen/builds.yaml "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/builds.yaml"
+$aws s3 cp /ncov/my_profiles/aspen/builds.yaml "${s3_prefix}/builds.yaml"

# If we don't have any county samples, copy the reference genomes to our county file
if [ ! -e /ncov/data/sequences_aspen.fasta ]; then
@@ -52,16 +60,17 @@ aligned_gisaid_s3_bucket=$(echo "${aligned_gisaid_location}" | jq -r .bucket)
aligned_gisaid_sequences_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .sequences_key)
aligned_gisaid_metadata_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .metadata_key)

# fetch the gisaid dataset
-aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
-aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/
+$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
+$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/

# run snakemake, if run fails export the logs from snakemake and ncov to s3
-(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { aws s3 cp /ncov/.snakemake/log/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/logs/snakemake/" --recursive ; aws s3 cp /ncov/logs/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/logs/ncov/" --recursive ; }
+(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { $aws s3 cp /ncov/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /ncov/logs/ "${s3_prefix}/logs/ncov/" --recursive ; }

# upload the tree to S3
-key="phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/ncov.json"
-aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"
+key="${key_prefix}/ncov_aspen.json"

Comment: This key is used in the save.py code below; not sure what it's for, just checking.
Reply: It tells save.py which path to write to the DB, so we can find the tree and make it available to auspice.

+$aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"

Comment: Sometimes we get more than one JSON in one folder, so in the older code the JSONs are renamed in this step (the builds.yaml needs it too). With the current setup, will each tree run have its own folder so this won't happen anymore?
Reply: The new path will create a folder prefix for every phylo tree, so the files will be like:
Reply: I'll add this to my tree wiki!
Reply: We can also change the way we format these paths -- if there's anything that you think is easier to understand, let me know!
Reply: It's great as you put it here! This is going to make tree debugging a lot easier; I spend a lot of time clicking through the 1234 folders every time TnT

# update aspen
aspen_workflow_rev=WHATEVER
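As a side note (a sketch, not from the PR): the aligned_gisaid_location JSON consumed by the jq calls above presumably has a shape like the following, since the script reads .bucket, .sequences_key, and .metadata_key from it; the values shown are made up:

    {
      "bucket": "genepi-gisaid-data",
      "sequences_key": "aligned_sequences-2021-08-01T12:00:00",
      "metadata_key": "aligned_metadata-2021-08-01T12:00:00"
    }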
---
@@ -6,7 +6,6 @@
# REMOTE_DEV_PREFIX (if set)
# S3_FILESTEM
# GROUP_NAME
-# TEMPLATE_FILENAME
# TEMPLATE_ARGS_FILE
# TREE_TYPE
@@ -17,23 +16,32 @@ df 1>&2
cat /proc/meminfo 1>&2

start_time=$(date +%s)
build_id=$(date +%Y%m%d-%H%M)

aws configure set region $AWS_REGION

+if [ ! -z "${BOTO_ENDPOINT_URL}" ]; then
+    export aws="aws --endpoint-url ${BOTO_ENDPOINT_URL}"
+else
+    export aws="aws"
+fi

Comment: The changes to this file are ~identical to the changes to the ondemand script; they just make the two scripts more similar.

# fetch aspen config
-genepi_config="$(aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
+genepi_config="$($aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
aspen_s3_db_bucket="$(jq -r .S3_db_bucket <<< "$genepi_config")"

# Recover template args
TEMPLATE_ARGS=$(jq -c . < "${TEMPLATE_ARGS_FILE}")

-workflow_id=$(aspen-cli db create-phylo-run \
+# Create a workflow run
+WORKFLOW_ID=$(aspen-cli db create-phylo-run \
    --group-name "${GROUP_NAME}" \
    --builds-template-args "${TEMPLATE_ARGS}" \
    --tree-type "${TREE_TYPE}"
)
-echo "${workflow_id}" >| "/tmp/workflow_id"
+echo "${WORKFLOW_ID}" >| "/tmp/workflow_id"

+key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}"
+s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}"

# set up ncov
mkdir -p /ncov/my_profiles/aspen /ncov/results
@@ -42,35 +50,41 @@ echo "${ncov_git_rev}" >| "/tmp/ncov_git_rev"

cp /usr/src/app/aspen/workflows/nextstrain_run/nextstrain_profile/* /ncov/my_profiles/aspen/

# dump the sequences, metadata, and builds.yaml for a run out to disk.
aligned_gisaid_location=$(
    python3 /usr/src/app/aspen/workflows/nextstrain_run/export.py \
-       --phylo-run-id "${workflow_id}" \
-       --sequences /ncov/data/sequences_aspen.fasta \
-       --metadata /ncov/data/metadata_aspen.tsv \
+       --phylo-run-id "${WORKFLOW_ID}" \
+       --sequences /ncov/data/sequences_aspen.fasta \
+       --metadata /ncov/data/metadata_aspen.tsv \
        --selected /ncov/data/include.txt \
        --builds-file /ncov/my_profiles/aspen/builds.yaml \
)

# Persist the build config we generated.
-aws s3 cp /ncov/my_profiles/aspen/builds.yaml "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/builds.yaml"
+$aws s3 cp /ncov/my_profiles/aspen/builds.yaml "${s3_prefix}/builds.yaml"

# If we don't have any county samples, copy the reference genomes to our county file
if [ ! -e /ncov/data/sequences_aspen.fasta ]; then
    cp /ncov/data/references_sequences.fasta /ncov/data/sequences_aspen.fasta;
    cp /ncov/data/references_metadata.tsv /ncov/data/metadata_aspen.tsv;
fi;

Comment: 👍

aligned_gisaid_s3_bucket=$(echo "${aligned_gisaid_location}" | jq -r .bucket)
aligned_gisaid_sequences_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .sequences_key)
aligned_gisaid_metadata_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .metadata_key)

# fetch the gisaid dataset
-aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
-aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/
+$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
+$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/

# run snakemake, if run fails export the logs from snakemake and ncov to s3
-(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { aws s3 cp /ncov/.snakemake/log/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_id}/logs/snakemake/" --recursive ; aws s3 cp /ncov/logs/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_id}/logs/ncov/" --recursive ; }
+(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { $aws s3 cp /ncov/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /ncov/logs/ "${s3_prefix}/logs/ncov/" --recursive ; }

# upload the tree to S3
-key="phylo_run/${build_id}/${S3_FILESTEM}.json"
-aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"
+key="${key_prefix}/ncov_aspen.json"
+$aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"

# update aspen
aspen_workflow_rev=WHATEVER
@@ -85,7 +99,7 @@ python3 /usr/src/app/aspen/workflows/nextstrain_run/save.py \
    --ncov-rev "${ncov_git_rev}" \
    --aspen-docker-image-version "" \
    --end-time "${end_time}" \
-   --phylo-run-id "${workflow_id}" \
+   --phylo-run-id "${WORKFLOW_ID}" \
    --bucket "${aspen_s3_db_bucket}" \
    --key "${key}" \
    --tree-path /ncov/auspice/ncov_aspen.json \
---
@@ -0,0 +1,10 @@
+#!/bin/bash
+export GENEPI_CONFIG_SECRET_NAME="genepi-config"
+export TEMPLATE_ARGS_FILE="args.json"
+export GROUP_NAME="CZI"
+export S3_FILESTEM="CZI_SCHEDULED_RUN"
+export TREE_TYPE="targeted"
+echo "{}" > $TEMPLATE_ARGS_FILE
+
+./run_nextstrain_scheduled.sh
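If you want to drive a run with different parameters in local dev, a variant of this wrapper along the following lines should work (a sketch only: the group name, filestem, and template-args key below are made up, and the accepted template-args keys depend on the builds template that export.py renders):

    #!/bin/bash
    # Hypothetical local-dev variant of the wrapper above; all values are illustrative.
    export GENEPI_CONFIG_SECRET_NAME="genepi-config"
    export TEMPLATE_ARGS_FILE="args.json"
    export GROUP_NAME="Some Other Group"
    export S3_FILESTEM="LOCAL_TEST_RUN"
    export TREE_TYPE="targeted"
    echo '{"division": "California"}' > $TEMPLATE_ARGS_FILE  # made-up template args
    ./run_nextstrain_scheduled.sh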
---
@@ -34,7 +34,8 @@ ${local_aws} secretsmanager update-secret --secret-id genepi-config --secret-str
    "DB_rw_username": "user_rw",
    "DB_rw_password": "password_rw",
    "DB_address": "database.genepinet.localdev",
-   "S3_external_auspice_bucket": "genepi-external-auspice-data"
+   "S3_external_auspice_bucket": "genepi-external-auspice-data",
+   "S3_db_bucket": "genepi-db-data"
}' || true

Comment: Add some config info in local dev that we can use in the nextstrain run scripts.

echo "Creating IAM role"
@@ -94,6 +95,8 @@ ${local_aws} ssm put-parameter --name /genepi/local/localstack/pangolin-ondemand

echo "Creating s3 buckets"
${local_aws} s3api head-bucket --bucket genepi-external-auspice-data || ${local_aws} s3 mb s3://genepi-external-auspice-data
+${local_aws} s3api head-bucket --bucket genepi-db-data || ${local_aws} s3 mb s3://genepi-db-data
+${local_aws} s3api head-bucket --bucket genepi-gisaid-data || ${local_aws} s3 mb s3://genepi-gisaid-data

Comment: Create the buckets we'll need so we can run builds in local dev.

${local_aws} s3api head-bucket --bucket genepi-batch || ${local_aws} s3 mb s3://genepi-batch
echo
echo "Dev env is up and running!"
---
@@ -1,5 +1,8 @@
+import os
from datetime import datetime

+import boto3

from aspen.config.docker_compose import DockerComposeConfig
from aspen.database.connection import get_db_uri, init_db
from aspen.database.models import (
@@ -110,20 +113,35 @@ def create_gisaid(session):
        print("Aligned Gisaid Dump already exists")
        return
    # Add raw gisaid dump
-   gisaid_s3_bucket = "gisaid_bucket"
+   gisaid_s3_bucket = "genepi-gisaid-data"
+   s3_resource = boto3.resource(
+       "s3",
+       endpoint_url=os.getenv("BOTO_ENDPOINT_URL") or None,
+       config=boto3.session.Config(signature_version="s3v4"),
+   )
    suffix = datetime.now().isoformat()
+   raw_s3_key = f"raw_gisaid_dump-{suffix}"
+   processed_sequences_s3_key = f"processed_sequences-{suffix}"
+   processed_metadata_s3_key = f"processed_metadata-{suffix}"
+   aligned_sequences_s3_key = f"aligned_sequences-{suffix}"
+   aligned_metadata_s3_key = f"aligned_metadata-{suffix}"
    raw_gisaid_dump = RawGisaidDump(
        download_date=datetime.now(),
        s3_bucket=gisaid_s3_bucket,
-       s3_key=f"raw_gisaid_dump-{suffix}",
+       s3_key=raw_s3_key,
    )
    session.add(raw_gisaid_dump)
+   s3_resource.Bucket(gisaid_s3_bucket).Object(raw_s3_key).put(Body="")
+   s3_resource.Bucket(gisaid_s3_bucket).Object(processed_sequences_s3_key).put(Body="")
+   s3_resource.Bucket(gisaid_s3_bucket).Object(processed_metadata_s3_key).put(Body="")
+   s3_resource.Bucket(gisaid_s3_bucket).Object(aligned_sequences_s3_key).put(Body="")
+   s3_resource.Bucket(gisaid_s3_bucket).Object(aligned_metadata_s3_key).put(Body="")

Comment: Actually write some gisaid files to S3 so our nextstrain scripts can download them in local dev.

    # add transformed gisaid dump
    processed_gisaid_dump = ProcessedGisaidDump(
        s3_bucket=gisaid_s3_bucket,
-       sequences_s3_key=f"processed_sequences-{suffix}",
-       metadata_s3_key=f"processed_metadata-{suffix}",
+       sequences_s3_key=processed_sequences_s3_key,
+       metadata_s3_key=processed_metadata_s3_key,
    )
    processed_workflow = GisaidDumpWorkflow(
        start_datetime=datetime.now(),
@@ -139,8 +157,8 @@ def create_gisaid(session):
    # Add an aligned dump
    aligned_gisaid_dump = AlignedGisaidDump(
        s3_bucket=gisaid_s3_bucket,
-       sequences_s3_key=f"aligned_sequences-{suffix}",
-       metadata_s3_key=f"aligned_metadata-{suffix}",
+       sequences_s3_key=aligned_sequences_s3_key,
+       metadata_s3_key=aligned_metadata_s3_key,
    )

    # attach a workflow
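To confirm the seeding actually wrote the objects (a sketch; the endpoint URL is an assumption, and the ISO timestamps in the key names will differ on every run):

    aws --endpoint-url http://localhost:4566 s3 ls s3://genepi-gisaid-data/
    # expected keys look like:
    #   raw_gisaid_dump-2021-08-01T12:00:00
    #   processed_sequences-2021-08-01T12:00:00
    #   processed_metadata-2021-08-01T12:00:00
    #   aligned_sequences-2021-08-01T12:00:00
    #   aligned_metadata-2021-08-01T12:00:00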
Comment: This means we can run a nextstrain container locally with our local code changes in it.