From a820acd1a1fed29263130b9d179897035ca3bea4 Mon Sep 17 00:00:00 2001
From: Jessica Gadling
Date: Wed, 15 Dec 2021 11:35:11 -0800
Subject: [PATCH] Make nextstrain build scripts consistent (#868)

Standardize s3 paths for nextstrain builds
---
 docker-compose.yml                                 |  2 +
 src/backend/Dockerfile.nextstrain                  |  2 +
 .../nextstrain_run/run_nextstrain_ondemand.sh      | 25 ++++++----
 .../run_nextstrain_scheduled.sh                    | 46 ++++++++++++-------
 .../workflows/nextstrain_run/run_test.sh           | 10 ++++
 src/backend/scripts/setup_dev_data.sh              |  5 +-
 src/backend/scripts/setup_localdata.py             | 30 +++++++++---
 7 files changed, 89 insertions(+), 31 deletions(-)
 create mode 100755 src/backend/aspen/workflows/nextstrain_run/run_test.sh

diff --git a/docker-compose.yml b/docker-compose.yml
index 5e4ae9d8cd..ad2fc9d614 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -217,6 +217,8 @@ services:
       genepinet:
         aliases:
           - nextstrain.genepinet.localdev
+    volumes:
+      - ./src/backend:/usr/src/app
 networks:
   genepinet:
 volumes:
diff --git a/src/backend/Dockerfile.nextstrain b/src/backend/Dockerfile.nextstrain
index 89c93815d2..40db58e17d 100644
--- a/src/backend/Dockerfile.nextstrain
+++ b/src/backend/Dockerfile.nextstrain
@@ -39,6 +39,8 @@ RUN mkdir /ncov && \
     git remote add origin https://github.com/nextstrain/ncov.git && \
     git fetch origin master && \
     git reset --hard FETCH_HEAD
+RUN mkdir -p /ncov/auspice
+RUN mkdir -p /ncov/logs

 ADD aspen/workflows/nextstrain_run/patches/crowding_penalty.patch /tmp/
 RUN patch /ncov/workflow/snakemake_rules/main_workflow.smk < /tmp/crowding_penalty.patch
diff --git a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_ondemand.sh b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_ondemand.sh
index 1e777b51ff..483490f125 100755
--- a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_ondemand.sh
+++ b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_ondemand.sh
@@ -14,13 +14,20 @@ df 1>&2
 cat /proc/meminfo 1>&2

 start_time=$(date +%s)
-build_date=$(date +%Y%m%d)

 aws configure set region $AWS_REGION

+if [ ! -z "${BOTO_ENDPOINT_URL}" ]; then
+    export aws="aws --endpoint-url ${BOTO_ENDPOINT_URL}"
+else
+    export aws="aws"
+fi
+
 # fetch aspen config
-genepi_config="$(aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
+genepi_config="$($aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
 aspen_s3_db_bucket="$(jq -r .S3_db_bucket <<< "$genepi_config")"
+key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}"
+s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}"

 # set up ncov
 mkdir -p /ncov/my_profiles/aspen /ncov/results
@@ -39,8 +46,9 @@ aligned_gisaid_location=$(
         --builds-file /ncov/my_profiles/aspen/builds.yaml \
 )

+
 # Persist the build config we generated.
-aws s3 cp /ncov/my_profiles/aspen/builds.yaml "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/builds.yaml"
+$aws s3 cp /ncov/my_profiles/aspen/builds.yaml "${s3_prefix}/builds.yaml"

 # If we don't have any county samples, copy the reference genomes to to our county file
 if [ ! -e /ncov/data/sequences_aspen.fasta ]; then
@@ -52,16 +60,17 @@ aligned_gisaid_s3_bucket=$(echo "${aligned_gisaid_location}" | jq -r .bucket)
 aligned_gisaid_sequences_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .sequences_key)
 aligned_gisaid_metadata_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .metadata_key)

+
 # fetch the gisaid dataset
-aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
-aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/
+$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
+$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/

 # run snakemake, if run fails export the logs from snakemake and ncov to s3
-(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { aws s3 cp /ncov/.snakemake/log/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/logs/snakemake/" --recursive ; aws s3 cp /ncov/logs/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/logs/ncov/" --recursive ; }
+(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { $aws s3 cp /ncov/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /ncov/logs/ "${s3_prefix}/logs/ncov/" --recursive ; }

 # upload the tree to S3
-key="phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/ncov.json"
-aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"
+key="${key_prefix}/ncov_aspen.json"
+$aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"

 # update aspen
 aspen_workflow_rev=WHATEVER
diff --git a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_scheduled.sh b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_scheduled.sh
index c228cfeb2b..005b6d5104 100755
--- a/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_scheduled.sh
+++ b/src/backend/aspen/workflows/nextstrain_run/run_nextstrain_scheduled.sh
@@ -6,7 +6,6 @@
 # REMOTE_DEV_PREFIX (if set)
 # S3_FILESTEM
 # GROUP_NAME
-# TEMPLATE_FILENAME
 # TEMPLATE_ARGS_FILE
 # TREE_TYPE

@@ -17,23 +16,32 @@ df 1>&2
 cat /proc/meminfo 1>&2

 start_time=$(date +%s)
-build_id=$(date +%Y%m%d-%H%M)

 aws configure set region $AWS_REGION

+if [ ! -z "${BOTO_ENDPOINT_URL}" ]; then
+    export aws="aws --endpoint-url ${BOTO_ENDPOINT_URL}"
+else
+    export aws="aws"
+fi
+
 # fetch aspen config
-genepi_config="$(aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
+genepi_config="$($aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
 aspen_s3_db_bucket="$(jq -r .S3_db_bucket <<< "$genepi_config")"

 # Recover template args
 TEMPLATE_ARGS=$(jq -c . < "${TEMPLATE_ARGS_FILE}")
< "${TEMPLATE_ARGS_FILE}") -workflow_id=$(aspen-cli db create-phylo-run \ +# Create a workflow run +WORKFLOW_ID=$(aspen-cli db create-phylo-run \ --group-name "${GROUP_NAME}" \ --builds-template-args "${TEMPLATE_ARGS}" \ --tree-type "${TREE_TYPE}" ) -echo "${workflow_id}" >| "/tmp/workflow_id" +echo "${WORKFLOW_ID}" >| "/tmp/workflow_id" + +key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}" +s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}" # set up ncov mkdir -p /ncov/my_profiles/aspen /ncov/results @@ -42,19 +50,25 @@ echo "${ncov_git_rev}" >| "/tmp/ncov_git_rev" cp /usr/src/app/aspen/workflows/nextstrain_run/nextstrain_profile/* /ncov/my_profiles/aspen/ - # dump the sequences, metadata, and builds.yaml for a run out to disk. aligned_gisaid_location=$( python3 /usr/src/app/aspen/workflows/nextstrain_run/export.py \ - --phylo-run-id "${workflow_id}" \ - --sequences /ncov/data/sequences_aspen.fasta \ - --metadata /ncov/data/metadata_aspen.tsv \ + --phylo-run-id "${WORKFLOW_ID}" \ + --sequences /ncov/data/sequences_aspen.fasta \ + --metadata /ncov/data/metadata_aspen.tsv \ --selected /ncov/data/include.txt \ --builds-file /ncov/my_profiles/aspen/builds.yaml \ ) + # Persist the build config we generated. -aws s3 cp /ncov/my_profiles/aspen/builds.yaml "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/builds.yaml" +$aws s3 cp /ncov/my_profiles/aspen/builds.yaml "${s3_prefix}/builds.yaml" + +# If we don't have any county samples, copy the reference genomes to to our county file +if [ ! -e /ncov/data/sequences_aspen.fasta ]; then + cp /ncov/data/references_sequences.fasta /ncov/data/sequences_aspen.fasta; + cp /ncov/data/references_metadata.tsv /ncov/data/metadata_aspen.tsv; +fi; aligned_gisaid_s3_bucket=$(echo "${aligned_gisaid_location}" | jq -r .bucket) aligned_gisaid_sequences_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .sequences_key) @@ -62,15 +76,15 @@ aligned_gisaid_metadata_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .meta # fetch the gisaid dataset -aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/ -aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/ +$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/ +$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/ # run snakemake, if run fails export the logs from snakemake and ncov to s3 -(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { aws s3 cp /ncov/.snakemake/log/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_id}/logs/snakemake/" --recursive ; aws s3 cp /ncov/logs/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_id}/logs/ncov/" --recursive ; } +(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { $aws s3 cp /ncov/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /ncov/logs/ "${s3_prefix}/logs/ncov/" --recursive ; } # upload the tree to S3 -key="phylo_run/${build_id}/${S3_FILESTEM}.json" -aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}" +key="${key_prefix}/ncov_aspen.json" +$aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}" # update aspen aspen_workflow_rev=WHATEVER @@ -85,7 +99,7 @@ python3 /usr/src/app/aspen/workflows/nextstrain_run/save.py \ 
--ncov-rev "${ncov_git_rev}" \ --aspen-docker-image-version "" \ --end-time "${end_time}" \ - --phylo-run-id "${workflow_id}" \ + --phylo-run-id "${WORKFLOW_ID}" \ --bucket "${aspen_s3_db_bucket}" \ --key "${key}" \ --tree-path /ncov/auspice/ncov_aspen.json \ diff --git a/src/backend/aspen/workflows/nextstrain_run/run_test.sh b/src/backend/aspen/workflows/nextstrain_run/run_test.sh new file mode 100755 index 0000000000..9c54bae6af --- /dev/null +++ b/src/backend/aspen/workflows/nextstrain_run/run_test.sh @@ -0,0 +1,10 @@ +#!/bin/bash +export GENEPI_CONFIG_SECRET_NAME="genepi-config" +export TEMPLATE_ARGS_FILE="args.json" +export GROUP_NAME="CZI" +export S3_FILESTEM="CZI_SCHEDULED_RUN" +export TREE_TYPE="targeted" +echo "{}" > $TEMPLATE_ARGS_FILE + +./run_nextstrain_scheduled.sh + diff --git a/src/backend/scripts/setup_dev_data.sh b/src/backend/scripts/setup_dev_data.sh index c9e28d9d89..459fdf554c 100755 --- a/src/backend/scripts/setup_dev_data.sh +++ b/src/backend/scripts/setup_dev_data.sh @@ -34,7 +34,8 @@ ${local_aws} secretsmanager update-secret --secret-id genepi-config --secret-str "DB_rw_username": "user_rw", "DB_rw_password": "password_rw", "DB_address": "database.genepinet.localdev", - "S3_external_auspice_bucket": "genepi-external-auspice-data" + "S3_external_auspice_bucket": "genepi-external-auspice-data", + "S3_db_bucket": "genepi-db-data" }' || true echo "Creating IAM role" @@ -94,6 +95,8 @@ ${local_aws} ssm put-parameter --name /genepi/local/localstack/pangolin-ondemand echo "Creating s3 buckets" ${local_aws} s3api head-bucket --bucket genepi-external-auspice-data || ${local_aws} s3 mb s3://genepi-external-auspice-data +${local_aws} s3api head-bucket --bucket genepi-db-data || ${local_aws} s3 mb s3://genepi-db-data +${local_aws} s3api head-bucket --bucket genepi-gisaid-data || ${local_aws} s3 mb s3://genepi-gisaid-data ${local_aws} s3api head-bucket --bucket genepi-batch || ${local_aws} s3 mb s3://genepi-batch echo echo "Dev env is up and running!" 
diff --git a/src/backend/scripts/setup_localdata.py b/src/backend/scripts/setup_localdata.py
index 141a0e1af8..6102816119 100644
--- a/src/backend/scripts/setup_localdata.py
+++ b/src/backend/scripts/setup_localdata.py
@@ -1,5 +1,8 @@
+import os
 from datetime import datetime

+import boto3
+
 from aspen.config.docker_compose import DockerComposeConfig
 from aspen.database.connection import get_db_uri, init_db
 from aspen.database.models import (
@@ -110,20 +113,35 @@ def create_gisaid(session):
         print("Aligned Gisaid Dump already exists")
         return
     # Add raw gisaid dump
-    gisaid_s3_bucket = "gisaid_bucket"
+    gisaid_s3_bucket = "genepi-gisaid-data"
+    s3_resource = boto3.resource(
+        "s3",
+        endpoint_url=os.getenv("BOTO_ENDPOINT_URL") or None,
+        config=boto3.session.Config(signature_version="s3v4"),
+    )
     suffix = datetime.now().isoformat()
+    raw_s3_key = f"raw_gisaid_dump-{suffix}"
+    processed_sequences_s3_key = f"processed_sequences-{suffix}"
+    processed_metadata_s3_key = f"processed_metadata-{suffix}"
+    aligned_sequences_s3_key = f"aligned_sequences-{suffix}"
+    aligned_metadata_s3_key = f"aligned_metadata-{suffix}"
     raw_gisaid_dump = RawGisaidDump(
         download_date=datetime.now(),
         s3_bucket=gisaid_s3_bucket,
-        s3_key=f"raw_gisaid_dump-{suffix}",
+        s3_key=raw_s3_key,
     )
     session.add(raw_gisaid_dump)
+    s3_resource.Bucket(gisaid_s3_bucket).Object(raw_s3_key).put(Body="")
+    s3_resource.Bucket(gisaid_s3_bucket).Object(processed_sequences_s3_key).put(Body="")
+    s3_resource.Bucket(gisaid_s3_bucket).Object(processed_metadata_s3_key).put(Body="")
+    s3_resource.Bucket(gisaid_s3_bucket).Object(aligned_sequences_s3_key).put(Body="")
+    s3_resource.Bucket(gisaid_s3_bucket).Object(aligned_metadata_s3_key).put(Body="")

     # add transformed gisaid dump
     processed_gisaid_dump = ProcessedGisaidDump(
         s3_bucket=gisaid_s3_bucket,
-        sequences_s3_key=f"processed_sequences-{suffix}",
-        metadata_s3_key=f"processed_metadata-{suffix}",
+        sequences_s3_key=processed_sequences_s3_key,
+        metadata_s3_key=processed_metadata_s3_key,
     )
     processed_workflow = GisaidDumpWorkflow(
         start_datetime=datetime.now(),
@@ -139,8 +157,8 @@ def create_gisaid(session):
     # Add an aligned dump
     aligned_gisaid_dump = AlignedGisaidDump(
         s3_bucket=gisaid_s3_bucket,
-        sequences_s3_key=f"aligned_sequences-{suffix}",
-        metadata_s3_key=f"aligned_metadata-{suffix}",
+        sequences_s3_key=aligned_sequences_s3_key,
+        metadata_s3_key=aligned_metadata_s3_key,
     )
     # attach a workflow
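With these changes every phylo run keeps all of its artifacts under one per-run prefix, s3://<S3_db_bucket>/phylo_run/<S3_FILESTEM>/<WORKFLOW_ID>/, instead of the old date-stamped paths. A minimal sketch of what a run kicked off by run_test.sh leaves behind, assuming the genepi-db-data dev bucket and a hypothetical workflow id of 17:

    s3://genepi-db-data/phylo_run/CZI_SCHEDULED_RUN/17/builds.yaml
    s3://genepi-db-data/phylo_run/CZI_SCHEDULED_RUN/17/ncov_aspen.json
    s3://genepi-db-data/phylo_run/CZI_SCHEDULED_RUN/17/logs/snakemake/    (only uploaded when snakemake fails)
    s3://genepi-db-data/phylo_run/CZI_SCHEDULED_RUN/17/logs/ncov/         (only uploaded when snakemake fails)

To exercise the scripts against localstack rather than real AWS, set BOTO_ENDPOINT_URL before invoking run_test.sh; the hostname below is an assumption about the local dev setup, not something this patch defines:

    export AWS_REGION="us-west-2"    # any valid region; the scripts only pass it to `aws configure set region`
    export BOTO_ENDPOINT_URL="http://localstack.genepinet.localdev:4566"    # assumed localstack URL
    ./run_test.sh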