
Make nextstrain build scripts consistent (#868)
Standardize s3 paths for nextstrain builds
jgadling authored Dec 15, 2021
1 parent dfdae7c commit a820acd
Showing 7 changed files with 89 additions and 31 deletions.
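
The substance of the change: the two nextstrain build scripts previously wrote run artifacts to inconsistent, date-stamped S3 prefixes; after this commit both derive a single prefix per run and put everything under it. A sketch of the standardized layout, using only names that appear in the diffs below:

    # Shared prefix computed by both build scripts:
    #   key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}"
    #   s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}"
    #
    # Artifacts written beneath it:
    #   ${s3_prefix}/builds.yaml         # generated build config
    #   ${s3_prefix}/logs/snakemake/     # uploaded only if snakemake fails
    #   ${s3_prefix}/logs/ncov/          # uploaded only if snakemake fails
    #   ${s3_prefix}/ncov_aspen.json     # final auspice tree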
2 changes: 2 additions & 0 deletions docker-compose.yml
@@ -217,6 +217,8 @@ services:
genepinet:
aliases:
- nextstrain.genepinet.localdev
volumes:
- ./src/backend:/usr/src/app
networks:
genepinet:
volumes:
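The added volumes stanza bind-mounts the backend source tree into the nextstrain container at /usr/src/app, so edits to the build scripts (including the run_test.sh harness added below) are picked up without rebuilding the image. A hedged usage sketch; the service name `nextstrain` is an assumption based on the network alias above:

    # edit a script on the host, then confirm the container sees it immediately
    docker-compose exec nextstrain \
        ls -l /usr/src/app/aspen/workflows/nextstrain_run/run_test.sh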
2 changes: 2 additions & 0 deletions src/backend/Dockerfile.nextstrain
@@ -39,6 +39,8 @@ RUN mkdir /ncov && \
git remote add origin https://github.com/nextstrain/ncov.git && \
git fetch origin master && \
git reset --hard FETCH_HEAD
RUN mkdir -p /ncov/auspice
RUN mkdir -p /ncov/logs

ADD aspen/workflows/nextstrain_run/patches/crowding_penalty.patch /tmp/
RUN patch /ncov/workflow/snakemake_rules/main_workflow.smk < /tmp/crowding_penalty.patch
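The two new layers pre-create /ncov/auspice and /ncov/logs, presumably so the build scripts' failure-path `s3 cp --recursive` log upload and the tree upload always have directories to work with, even when a run dies early. A hedged local check that the crowding-penalty patch still applies against upstream master (repo-relative patch path assumed from the ADD line above):

    git clone --depth 1 https://github.com/nextstrain/ncov.git /tmp/ncov
    patch --dry-run /tmp/ncov/workflow/snakemake_rules/main_workflow.smk \
        < src/backend/aspen/workflows/nextstrain_run/patches/crowding_penalty.patch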
@@ -14,13 +14,20 @@ df 1>&2
cat /proc/meminfo 1>&2

start_time=$(date +%s)
build_date=$(date +%Y%m%d)

aws configure set region $AWS_REGION

if [ ! -z "${BOTO_ENDPOINT_URL}" ]; then
export aws="aws --endpoint-url ${BOTO_ENDPOINT_URL}"
else
export aws="aws"
fi

# fetch aspen config
genepi_config="$(aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
genepi_config="$($aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
aspen_s3_db_bucket="$(jq -r .S3_db_bucket <<< "$genepi_config")"
key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}"
s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}"

# set up ncov
mkdir -p /ncov/my_profiles/aspen /ncov/results
@@ -39,8 +46,9 @@ aligned_gisaid_location=$(
--builds-file /ncov/my_profiles/aspen/builds.yaml \
)


# Persist the build config we generated.
aws s3 cp /ncov/my_profiles/aspen/builds.yaml "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/builds.yaml"
$aws s3 cp /ncov/my_profiles/aspen/builds.yaml "${s3_prefix}/builds.yaml"

# If we don't have any county samples, copy the reference genomes to our county file
if [ ! -e /ncov/data/sequences_aspen.fasta ]; then
@@ -52,16 +60,17 @@ aligned_gisaid_s3_bucket=$(echo "${aligned_gisaid_location}" | jq -r .bucket)
aligned_gisaid_sequences_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .sequences_key)
aligned_gisaid_metadata_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .metadata_key)


# fetch the gisaid dataset
aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/
$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/

# run snakemake, if run fails export the logs from snakemake and ncov to s3
(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { aws s3 cp /ncov/.snakemake/log/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/logs/snakemake/" --recursive ; aws s3 cp /ncov/logs/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/logs/ncov/" --recursive ; }
(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { $aws s3 cp /ncov/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /ncov/logs/ "${s3_prefix}/logs/ncov/" --recursive ; }

# upload the tree to S3
key="phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/ncov.json"
aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"
key="${key_prefix}/ncov_aspen.json"
$aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"

# update aspen
aspen_workflow_rev=WHATEVER
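The `$aws` indirection introduced here is the localstack hook: with BOTO_ENDPOINT_URL set (as in the dev environment), every CLI call is redirected to that endpoint; unset, `$aws` degrades to plain `aws`. It works because the unquoted expansion word-splits into separate arguments. A shell function is an equivalent formulation without that quoting caveat — a sketch, not what the commit does:

    # same effect as `export aws="aws --endpoint-url ${BOTO_ENDPOINT_URL}"`,
    # but safe to call with arguments containing spaces
    aws() {
        command aws ${BOTO_ENDPOINT_URL:+--endpoint-url "${BOTO_ENDPOINT_URL}"} "$@"
    }
    aws s3 ls "s3://${aspen_s3_db_bucket}/"   # hits the override endpoint iff set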
@@ -6,7 +6,6 @@
# REMOTE_DEV_PREFIX (if set)
# S3_FILESTEM
# GROUP_NAME
# TEMPLATE_FILENAME
# TEMPLATE_ARGS_FILE
# TREE_TYPE

@@ -17,23 +16,32 @@ df 1>&2
cat /proc/meminfo 1>&2

start_time=$(date +%s)
build_id=$(date +%Y%m%d-%H%M)

aws configure set region $AWS_REGION

if [ ! -z "${BOTO_ENDPOINT_URL}" ]; then
export aws="aws --endpoint-url ${BOTO_ENDPOINT_URL}"
else
export aws="aws"
fi

# fetch aspen config
genepi_config="$(aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
genepi_config="$($aws secretsmanager get-secret-value --secret-id $GENEPI_CONFIG_SECRET_NAME --query SecretString --output text)"
aspen_s3_db_bucket="$(jq -r .S3_db_bucket <<< "$genepi_config")"

# Recover template args
TEMPLATE_ARGS=$(jq -c . < "${TEMPLATE_ARGS_FILE}")

workflow_id=$(aspen-cli db create-phylo-run \
# Create a workflow run
WORKFLOW_ID=$(aspen-cli db create-phylo-run \
--group-name "${GROUP_NAME}" \
--builds-template-args "${TEMPLATE_ARGS}" \
--tree-type "${TREE_TYPE}"
)
echo "${workflow_id}" >| "/tmp/workflow_id"
echo "${WORKFLOW_ID}" >| "/tmp/workflow_id"

key_prefix="phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}"
s3_prefix="s3://${aspen_s3_db_bucket}/${key_prefix}"

# set up ncov
mkdir -p /ncov/my_profiles/aspen /ncov/results
@@ -42,35 +50,41 @@ echo "${ncov_git_rev}" >| "/tmp/ncov_git_rev"

cp /usr/src/app/aspen/workflows/nextstrain_run/nextstrain_profile/* /ncov/my_profiles/aspen/


# dump the sequences, metadata, and builds.yaml for a run out to disk.
aligned_gisaid_location=$(
python3 /usr/src/app/aspen/workflows/nextstrain_run/export.py \
--phylo-run-id "${workflow_id}" \
--sequences /ncov/data/sequences_aspen.fasta \
--metadata /ncov/data/metadata_aspen.tsv \
--phylo-run-id "${WORKFLOW_ID}" \
--sequences /ncov/data/sequences_aspen.fasta \
--metadata /ncov/data/metadata_aspen.tsv \
--selected /ncov/data/include.txt \
--builds-file /ncov/my_profiles/aspen/builds.yaml \
)


# Persist the build config we generated.
aws s3 cp /ncov/my_profiles/aspen/builds.yaml "s3://${aspen_s3_db_bucket}/phylo_run/${build_date}/${S3_FILESTEM}/${WORKFLOW_ID}/builds.yaml"
$aws s3 cp /ncov/my_profiles/aspen/builds.yaml "${s3_prefix}/builds.yaml"

# If we don't have any county samples, copy the reference genomes to our county file
if [ ! -e /ncov/data/sequences_aspen.fasta ]; then
cp /ncov/data/references_sequences.fasta /ncov/data/sequences_aspen.fasta;
cp /ncov/data/references_metadata.tsv /ncov/data/metadata_aspen.tsv;
fi;

aligned_gisaid_s3_bucket=$(echo "${aligned_gisaid_location}" | jq -r .bucket)
aligned_gisaid_sequences_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .sequences_key)
aligned_gisaid_metadata_s3_key=$(echo "${aligned_gisaid_location}" | jq -r .metadata_key)


# fetch the gisaid dataset
aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/
$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_sequences_s3_key}" /ncov/results/
$aws s3 cp --no-progress "s3://${aligned_gisaid_s3_bucket}/${aligned_gisaid_metadata_s3_key}" /ncov/results/

# run snakemake, if run fails export the logs from snakemake and ncov to s3
(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { aws s3 cp /ncov/.snakemake/log/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_id}/logs/snakemake/" --recursive ; aws s3 cp /ncov/logs/ "s3://${aspen_s3_db_bucket}/phylo_run/${build_id}/logs/ncov/" --recursive ; }
(cd /ncov && snakemake --printshellcmds auspice/ncov_aspen.json --profile my_profiles/aspen/ --resources=mem_mb=312320) || { $aws s3 cp /ncov/.snakemake/log/ "${s3_prefix}/logs/snakemake/" --recursive ; $aws s3 cp /ncov/logs/ "${s3_prefix}/logs/ncov/" --recursive ; }

# upload the tree to S3
key="phylo_run/${build_id}/${S3_FILESTEM}.json"
aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"
key="${key_prefix}/ncov_aspen.json"
$aws s3 cp /ncov/auspice/ncov_aspen.json "s3://${aspen_s3_db_bucket}/${key}"

# update aspen
aspen_workflow_rev=WHATEVER
@@ -85,7 +99,7 @@ python3 /usr/src/app/aspen/workflows/nextstrain_run/save.py \
--ncov-rev "${ncov_git_rev}" \
--aspen-docker-image-version "" \
--end-time "${end_time}" \
--phylo-run-id "${workflow_id}" \
--phylo-run-id "${WORKFLOW_ID}" \
--bucket "${aspen_s3_db_bucket}" \
--key "${key}" \
--tree-path /ncov/auspice/ncov_aspen.json \
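
With both scripts now writing to the same per-run prefix, every artifact a run produced can be inspected in one place. A hypothetical check, reusing the variables computed above:

    $aws s3 ls --recursive "s3://${aspen_s3_db_bucket}/phylo_run/${S3_FILESTEM}/${WORKFLOW_ID}/"
    # expect builds.yaml and ncov_aspen.json; logs/ appears only after a failed run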
10 changes: 10 additions & 0 deletions src/backend/aspen/workflows/nextstrain_run/run_test.sh
@@ -0,0 +1,10 @@
#!/bin/bash
export GENEPI_CONFIG_SECRET_NAME="genepi-config"
export TEMPLATE_ARGS_FILE="args.json"
export GROUP_NAME="CZI"
export S3_FILESTEM="CZI_SCHEDULED_RUN"
export TREE_TYPE="targeted"
echo "{}" > $TEMPLATE_ARGS_FILE

./run_nextstrain_scheduled.sh
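
The harness writes args.json to the current directory and calls run_nextstrain_scheduled.sh by relative path, so it presumably has to be run from its own directory inside the nextstrain container, with AWS_REGION and BOTO_ENDPOINT_URL already exported (the script sets neither). A hedged invocation:

    cd /usr/src/app/aspen/workflows/nextstrain_run   # bind-mounted from src/backend
    ./run_test.sh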

5 changes: 4 additions & 1 deletion src/backend/scripts/setup_dev_data.sh
@@ -34,7 +34,8 @@ ${local_aws} secretsmanager update-secret --secret-id genepi-config --secret-str
"DB_rw_username": "user_rw",
"DB_rw_password": "password_rw",
"DB_address": "database.genepinet.localdev",
"S3_external_auspice_bucket": "genepi-external-auspice-data"
"S3_external_auspice_bucket": "genepi-external-auspice-data",
"S3_db_bucket": "genepi-db-data"
}' || true

echo "Creating IAM role"
@@ -94,6 +95,8 @@ ${local_aws} ssm put-parameter --name /genepi/local/localstack/pangolin-ondemand

echo "Creating s3 buckets"
${local_aws} s3api head-bucket --bucket genepi-external-auspice-data || ${local_aws} s3 mb s3://genepi-external-auspice-data
${local_aws} s3api head-bucket --bucket genepi-db-data || ${local_aws} s3 mb s3://genepi-db-data
${local_aws} s3api head-bucket --bucket genepi-gisaid-data || ${local_aws} s3 mb s3://genepi-gisaid-data
${local_aws} s3api head-bucket --bucket genepi-batch || ${local_aws} s3 mb s3://genepi-batch
echo
echo "Dev env is up and running!"
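A quick way to confirm the new dev wiring: the build scripts read S3_db_bucket out of the genepi-config secret with exactly this jq filter, and the bucket it names must exist. A sketch, assuming `local_aws` is the localstack-pointed CLI wrapper defined earlier in this script:

    ${local_aws} secretsmanager get-secret-value --secret-id genepi-config \
        --query SecretString --output text | jq -r .S3_db_bucket   # -> genepi-db-data
    ${local_aws} s3api head-bucket --bucket genepi-db-data && echo "bucket exists"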
30 changes: 24 additions & 6 deletions src/backend/scripts/setup_localdata.py
@@ -1,5 +1,8 @@
import os
from datetime import datetime

import boto3

from aspen.config.docker_compose import DockerComposeConfig
from aspen.database.connection import get_db_uri, init_db
from aspen.database.models import (
@@ -110,20 +113,35 @@ def create_gisaid(session):
print("Aligned Gisaid Dump already exists")
return
# Add raw gisaid dump
gisaid_s3_bucket = "gisaid_bucket"
gisaid_s3_bucket = "genepi-gisaid-data"
s3_resource = boto3.resource(
"s3",
endpoint_url=os.getenv("BOTO_ENDPOINT_URL") or None,
config=boto3.session.Config(signature_version="s3v4"),
)
suffix = datetime.now().isoformat()
raw_s3_key = f"raw_gisaid_dump-{suffix}"
processed_sequences_s3_key = f"processed_sequences-{suffix}"
processed_metadata_s3_key = f"processed_metadata-{suffix}"
aligned_sequences_s3_key = f"aligned_sequences-{suffix}"
aligned_metadata_s3_key = f"aligned_metadata-{suffix}"
raw_gisaid_dump = RawGisaidDump(
download_date=datetime.now(),
s3_bucket=gisaid_s3_bucket,
s3_key=f"raw_gisaid_dump-{suffix}",
s3_key=raw_s3_key,
)
session.add(raw_gisaid_dump)
s3_resource.Bucket(gisaid_s3_bucket).Object(raw_s3_key).put(Body="")
s3_resource.Bucket(gisaid_s3_bucket).Object(processed_sequences_s3_key).put(Body="")
s3_resource.Bucket(gisaid_s3_bucket).Object(processed_metadata_s3_key).put(Body="")
s3_resource.Bucket(gisaid_s3_bucket).Object(aligned_sequences_s3_key).put(Body="")
s3_resource.Bucket(gisaid_s3_bucket).Object(aligned_metadata_s3_key).put(Body="")

# add transformed gisaid dump
processed_gisaid_dump = ProcessedGisaidDump(
s3_bucket=gisaid_s3_bucket,
sequences_s3_key=f"processed_sequences-{suffix}",
metadata_s3_key=f"processed_metadata-{suffix}",
sequences_s3_key=processed_sequences_s3_key,
metadata_s3_key=processed_metadata_s3_key,
)
processed_workflow = GisaidDumpWorkflow(
start_datetime=datetime.now(),
@@ -139,8 +157,8 @@ def create_gisaid(session):
# Add an aligned dump
aligned_gisaid_dump = AlignedGisaidDump(
s3_bucket=gisaid_s3_bucket,
sequences_s3_key=f"aligned_sequences-{suffix}",
metadata_s3_key=f"aligned_metadata-{suffix}",
sequences_s3_key=aligned_sequences_s3_key,
metadata_s3_key=aligned_metadata_s3_key,
)

# attach a workflow
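The new boto3 puts exist, presumably, because database rows alone no longer suffice: the build scripts now `$aws s3 cp` the aligned gisaid keys out of S3, so the dev objects must really exist in localstack, even if empty. A rough CLI equivalent of what the Python above seeds (key shapes taken from the code; `local_aws` as in setup_dev_data.sh):

    suffix=$(date -Iseconds)   # stands in for datetime.now().isoformat()
    for stem in raw_gisaid_dump processed_sequences processed_metadata \
                aligned_sequences aligned_metadata; do
        ${local_aws} s3api put-object --bucket genepi-gisaid-data --key "${stem}-${suffix}"
    done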

