edit: Use a permalink for each script
Use a permalink for each script so that the software used in this workflow is
pinned to a known version and is unaffected by upstream changes until we
choose to bump it. This design adds some maintenance overhead to the
workflow, but it protects users against unexpected breakage that is outside
of their control.
Discussed in nextstrain/ebola#6 (comment)
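
As an aside (not part of this commit), bumping the pinned version later amounts to replacing the 40-character commit SHA in every rule. A hypothetical helper, assuming GNU grep/sed and that every pin should move to the same tip of master (this commit actually pins one script to a different SHA):

    # Hypothetical repin helper; the paths and the single-SHA assumption are illustrative.
    NEW_SHA=$(git ls-remote https://github.com/nextstrain/monkeypox.git refs/heads/master | cut -f1)
    grep -rl 'nextstrain/monkeypox/[0-9a-f]\{40\}' ingest/workflow/snakemake_rules \
        | xargs sed -i "s|nextstrain/monkeypox/[0-9a-f]\{40\}|nextstrain/monkeypox/${NEW_SHA}|g"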

Pick curl instead of wget as discussed in:
nextstrain/ebola#6 (comment)
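
For orientation, the selection pattern repeated in each rule below looks like this in isolation (a sketch, not additional code in the commit). curl's -f fails on HTTP errors instead of saving an error page, -sS stays quiet but still prints errors, -L follows redirects, and --output names the destination file; wget -O is the closest equivalent, so both variants are invoked as $download_cmd <destination> <url>:

    # Sketch of the shared download-command selection; script_url is a placeholder.
    if which curl > /dev/null; then
        download_cmd="curl -fsSL --output"   # fail fast, quiet, follow redirects, write to file
    elif which wget > /dev/null; then
        download_cmd="wget -O"               # -O <file>: write the response to <file>
    else
        echo "ERROR: Neither curl nor wget found. Please install one of them."
        exit 1
    fi
    $download_cmd bin/some-script "$script_url"   # destination first, then URL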
j23414 authored and committed Apr 17, 2023
1 parent 8f44c12 · commit 7f96553
Showing 5 changed files with 126 additions and 71 deletions.
ingest/workflow/snakemake_rules/fetch_sequences.smk (25 changes: 16 additions & 9 deletions)

@@ -28,18 +28,25 @@ rule fetch_from_genbank:
         genbank_ndjson="data/genbank_{serotype}.ndjson",
     params:
         serotype_tax_id=download_serotype,
-        csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/csv-to-ndjson",
+        csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/csv-to-ndjson",
     shell:
         """
-        if [[ ! -d bin ]]; then
-            mkdir bin
-        fi
-        if [[ ! -f bin/csv-to-ndjson ]]; then
-            cd bin
-            wget {params.csv_to_ndjson_url}
-            chmod 755 *
-            cd ..
+        # (1) Pick curl or wget based on availability
+        if which curl > /dev/null; then
+            download_cmd="curl -fsSL --output"
+        elif which wget > /dev/null; then
+            download_cmd="wget -O"
+        else
+            echo "ERROR: Neither curl nor wget found. Please install one of them."
+            exit 1
         fi
+        # (2) Download the required scripts if not already present
+        [[ -d bin ]] || mkdir bin
+        [[ -f bin/csv-to-ndjson ]] || $download_cmd bin/csv-to-ndjson {params.csv_to_ndjson_url}
+        chmod +x bin/*
+        # (3) Fetch sequences from GenBank
         ./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson}
         """

ingest/workflow/snakemake_rules/slack_notifications.smk (49 changes: 31 additions & 18 deletions)

@@ -28,19 +28,25 @@ rule notify_on_genbank_record_change:
         touch("data/notify/genbank-record-change.done"),
     params:
         s3_src=S3_SRC,
-        notify_on_record_change_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/notify-on-record-change",
+        notify_on_record_change_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/notify-on-record-change",
     shell:
         """
-        if [[ ! -d bin ]]; then
-            mkdir bin
-        fi
-        if [[ ! -f bin/notify-on-record-change ]]; then
-            cd bin
-            wget {params.notify_on_record_change_url}
-            chmod 755
-            cd ..
+        # (1) Pick curl or wget based on availability
+        if which curl > /dev/null; then
+            download_cmd="curl -fsSL --output"
+        elif which wget > /dev/null; then
+            download_cmd="wget -O"
+        else
+            echo "ERROR: Neither curl nor wget found. Please install one of them."
+            exit 1
         fi
+        # (2) Download the required scripts if not already present
+        [[ -d bin ]] || mkdir bin
+        [[ -f bin/notify-on-record-change ]] || $download_cmd bin/notify-on-record-change {params.notify_on_record_change_url}
+        chmod +x bin/*
+        # (3) Run the script
         ./bin/notify-on-record-change {input.genbank_ndjson} {params.s3_src:q}/genbank.ndjson.xz Genbank
         """

@@ -52,18 +58,25 @@ rule notify_on_metadata_diff:
         touch("data/notify/metadata-diff.done"),
     params:
         s3_src=S3_SRC,
-        notify_on_diff_url = "https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/notify-on-diff",
+        notify_on_diff_url = "https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/notify-on-diff",
     shell:
         """
-        if [[ ! -d bin ]]; then
-            mkdir bin
-        fi
-        if [[ ! -f bin/notify-on-diff ]]; then
-            cd bin
-            wget {params.notify_on_diff_url}
-            chmod 755
-            cd ..
+        # (1) Pick curl or wget based on availability
+        if which curl > /dev/null; then
+            download_cmd="curl -fsSL --output"
+        elif which wget > /dev/null; then
+            download_cmd="wget -O"
+        else
+            echo "ERROR: Neither curl nor wget found. Please install one of them."
+            exit 1
         fi
+        # (2) Download the required scripts if not already present
+        [[ -d bin ]] || mkdir bin
+        [[ -f bin/notify-on-diff ]] || $download_cmd bin/notify-on-diff {params.notify_on_diff_url}
+        chmod +x bin/*
+        # (3) Run the script
         ./bin/notify-on-diff {input.metadata} {params.s3_src:q}/metadata.tsv.gz
         """

ingest/workflow/snakemake_rules/transform.smk (67 changes: 43 additions & 24 deletions)

@@ -20,7 +20,18 @@ rule fetch_general_geolocation_rules:
         geolocation_rules_url=config["transform"]["geolocation_rules_url"],
     shell:
         """
-        curl {params.geolocation_rules_url} > {output.general_geolocation_rules}
+        # (1) Pick curl or wget based on availability
+        if which curl > /dev/null; then
+            download_cmd="curl -fsSL --output"
+        elif which wget > /dev/null; then
+            download_cmd="wget -O"
+        else
+            echo "ERROR: Neither curl nor wget found. Please install one of them."
+            exit 1
+        fi
+        # (2) Fetch general geolocation rules
+        $download_cmd {output.general_geolocation_rules} {params.geolocation_rules_url}
         """


@@ -62,33 +73,41 @@ rule transform:
         metadata_columns=config["transform"]["metadata_columns"],
         id_field=config["transform"]["id_field"],
         sequence_field=config["transform"]["sequence_field"],
-        transform_field_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-field-names",
-        transform_string_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-string-fields",
-        transform_strain_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-strain-names",
-        transform_date_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-date-fields",
-        transform_genbank_location_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-genbank-location",
-        transform_authors_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-authors",
-        apply_geolocation_rules_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/apply-geolocation-rules",
-        merge_user_metadata_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/merge-user-metadata",
-        ndjson_to_tsv_and_fasta_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/ndjson-to-tsv-and-fasta",
+        transform_field_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-field-names",
+        transform_string_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-string-fields",
+        transform_strain_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/b54768ec17872eb0d898e29527785642f6b98c0d/ingest/bin/transform-strain-names",
+        transform_date_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-date-fields",
+        transform_genbank_location_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-genbank-location",
+        transform_authors_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-authors",
+        apply_geolocation_rules_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/apply-geolocation-rules",
+        merge_user_metadata_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/merge-user-metadata",
+        ndjson_to_tsv_and_fasta_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/ndjson-to-tsv-and-fasta",
     shell:
         """
-        if [[ ! -d bin ]]; then
-            mkdir bin
+        # (1) Pick curl or wget based on availability
+        if which curl > /dev/null; then
+            download_cmd="curl -fsSL --output"
+        elif which wget > /dev/null; then
+            download_cmd="wget -O"
+        else
+            echo "ERROR: Neither curl nor wget found. Please install one of them."
+            exit 1
         fi
-        cd bin
-        [[ -f transform-field-names ]] || wget {params.transform_field_names_url}
-        [[ -f transform-string-fields ]] || wget {params.transform_string_fields_url}
-        [[ -f transform-strain-names ]] || wget {params.transform_strain_names_url}
-        [[ -f transform-date-fields ]] || wget {params.transform_date_fields_url}
-        [[ -f transform-genbank-location ]] || wget {params.transform_genbank_location_url}
-        [[ -f transform-authors ]] || wget {params.transform_authors_url}
-        [[ -f apply-geolocation-rules ]] || wget {params.apply_geolocation_rules_url}
-        [[ -f merge-user-metadata ]] || wget {params.merge_user_metadata_url}
-        [[ -f ndjson-to-tsv-and-fasta ]] || wget {params.ndjson_to_tsv_and_fasta_url}
-        chmod 755 *
-        cd ..
+        # (2) Download the required scripts if not already present
+        [[ -d bin ]] || mkdir bin
+        [[ -f bin/transform-field-names ]] || $download_cmd bin/transform-field-names {params.transform_field_names_url}
+        [[ -f bin/transform-string-fields ]] || $download_cmd bin/transform-string-fields {params.transform_string_fields_url}
+        [[ -f bin/transform-strain-names ]] || $download_cmd bin/transform-strain-names {params.transform_strain_names_url}
+        [[ -f bin/transform-date-fields ]] || $download_cmd bin/transform-date-fields {params.transform_date_fields_url}
+        [[ -f bin/transform-genbank-location ]] || $download_cmd bin/transform-genbank-location {params.transform_genbank_location_url}
+        [[ -f bin/transform-authors ]] || $download_cmd bin/transform-authors {params.transform_authors_url}
+        [[ -f bin/apply-geolocation-rules ]] || $download_cmd bin/apply-geolocation-rules {params.apply_geolocation_rules_url}
+        [[ -f bin/merge-user-metadata ]] || $download_cmd bin/merge-user-metadata {params.merge_user_metadata_url}
+        [[ -f bin/ndjson-to-tsv-and-fasta ]] || $download_cmd bin/ndjson-to-tsv-and-fasta {params.ndjson_to_tsv_and_fasta_url}
+        chmod +x bin/*
+        # (3) Transform the sequences
         (cat {input.sequences_ndjson} \
             | ./bin/transform-field-names \
                 --field-map {params.field_map} \

ingest/workflow/snakemake_rules/trigger_rebuild.smk (25 changes: 16 additions & 9 deletions)

@@ -14,17 +14,24 @@ rule trigger_build:
     output:
         touch("data/trigger/rebuild.done")
     params:
-        trigger_on_new_data_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/trigger-on-new-data"
+        trigger_on_new_data_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/trigger-on-new-data"
     shell:
         """
-        if [[ ! -d bin ]]; then
-            mkdir bin
-        fi
-        if [[ ! -f bin/trigger-on-new-data ]]; then
-            cd bin
-            wget {params.trigger_on_new_data_url}
-            chmod 755 *
-            cd ..
+        # (1) Pick curl or wget based on availability
+        if which curl > /dev/null; then
+            download_cmd="curl -fsSL --output"
+        elif which wget > /dev/null; then
+            download_cmd="wget -O"
+        else
+            echo "ERROR: Neither curl nor wget found. Please install one of them."
+            exit 1
         fi
+        # (2) Download the required scripts if not already present
+        [[ -d bin ]] || mkdir bin
+        [[ -f bin/trigger-on-new-data ]] || $download_cmd bin/trigger-on-new-data {params.trigger_on_new_data_url}
+        chmod +x bin/*
+        # (3) Trigger the build
         ./bin/trigger-on-new-data {input.metadata_upload} {input.fasta_upload}
         """

ingest/workflow/snakemake_rules/upload.smk (31 changes: 20 additions & 11 deletions)

@@ -54,20 +54,29 @@ rule upload_to_s3:
         quiet="" if send_notifications else "--quiet",
         s3_dst=config["upload"].get("s3", {}).get("dst", ""),
         cloudfront_domain=config["upload"].get("s3", {}).get("cloudfront_domain", ""),
-        upload_to_s3_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/upload-to-s3",
-        sha256sum_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/sha256sum",
-        cloudfront_invalidate_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/cloudfront-invalidate"
+        upload_to_s3_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/upload-to-s3",
+        sha256sum_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/sha256sum",
+        cloudfront_invalidate_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/cloudfront-invalidate"
     shell:
         """
-        if [[ ! -d bin ]]; then
-            mkdir bin
+        # (1) Pick curl or wget based on availability
+        if which curl > /dev/null; then
+            download_cmd="curl -fsSL --output"
+        elif which wget > /dev/null; then
+            download_cmd="wget -O"
+        else
+            echo "ERROR: Neither curl nor wget found. Please install one of them."
+            exit 1
         fi
-        cd bin
-        [[ -f upload-to-s3 ]] || wget {params.upload_to_s3_url}
-        [[ -f sha256sum ]] || wget {params.sha256sum_url}
-        [[ -f cloudfront-invalidate ]] || wget {params.cloudfront_invalidate_url}
-        chmod 755 *
-        cd ..
+        # (2) Download the required scripts if not already present
+        [[ -d bin ]] || mkdir bin
+        [[ -f bin/upload-to-s3 ]] || $download_cmd bin/upload-to-s3 {params.upload_to_s3_url}
+        [[ -f bin/sha256sum ]] || $download_cmd bin/sha256sum {params.sha256sum_url}
+        [[ -f bin/cloudfront-invalidate ]] || $download_cmd bin/cloudfront-invalidate {params.cloudfront_invalidate_url}
+        chmod +x bin/*
+        # (3) Run the upload script
         ./bin/upload-to-s3 \
             {params.quiet} \
             {input.file_to_upload:q} \
