Merge CLIMB and grinch changes #42

Open · wants to merge 4 commits into base: grinch
1 change: 1 addition & 0 deletions config/base.config
@@ -19,6 +19,7 @@ params {
uk_aligned_fasta = "test/matched3.fa" // null param so exists
uk_mutations = "test/matched2.variants" // null param so exists
uk_constellations = "resources/empty_constellations.csv" // null so exists
uk_pag = "test/uk_pag.tsv" //null param

// if carrying forward from previous
previous_metadata = ""
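The new uk_pag default follows the same "null param so exists" convention as the lines above: it points at a placeholder TSV so the channel always resolves. It feeds the uk_pag input added to the preprocess_cog_uk workflow later in this diff; the main.nf wiring is not part of the diff, so the sketch below is an assumption, with invented variable names:

// Hypothetical main.nf wiring, not shown in this PR
uk_pag = file(params.uk_pag)
preprocess_cog_uk(uk_fasta, uk_metadata, uk_accessions, uk_pag)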
5 changes: 3 additions & 2 deletions environment.yml
@@ -17,15 +17,16 @@ dependencies:
- nextflow
- s3cmd
- smart_open
- datafunk
- fastafunk
- pip:
- ftfy
- geopandas
- git+https://github.com/cov-lineages/pangolin.git
- git+https://github.com/cov-lineages/pangoLEARN.git
- git+https://github.com/cov-ert/datafunk.git
- git+https://github.com/cov-ert/fastafunk.git
- git+https://github.com/cov-lineages/constellations.git
- git+https://github.com/cov-lineages/scorpio.git
- git+https://github.com/cov-lineages/pango-designation.git
- git+https://github.com/cov-lineages/pangolin-assignment.git


2 changes: 1 addition & 1 deletion modules/align_and_variant_call.nf
@@ -25,7 +25,7 @@ process minimap2_to_reference {

script:
"""
minimap2 -t ${task.cpus} -a --secondary=no -x asm20 --score-N=0 ${reference_fasta} ${fasta} > alignment.sam
minimap2 -t ${task.cpus} -a --secondary=no --score-N=0 -x asm20 ${reference_fasta} ${fasta} > alignment.sam
"""
}

23 changes: 22 additions & 1 deletion modules/clean_geography.nf
@@ -203,6 +203,26 @@ process make_delta_by_utla_summary {
"""
}


process drop_anon_id {
/**
* Drops the anonymous_sample_id column from the master metadata CSV
* @input metadata
* @output metadata
*/

input:
path metadata

output:
path "${metadata.baseName}_anon.csv"

script:
"""
fastafunk drop_columns --in-metadata ${metadata} --columns anonymous_sample_id --out-metadata ${metadata.baseName}_anon.csv
"""
}
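Because the output name is derived with baseName, a master.csv input is published as master_anon.csv. A minimal sketch of exercising the process on its own (workflow name and input file invented):

workflow test_drop_anon_id {
    main:
    drop_anon_id(Channel.fromPath("master.csv"))   // hypothetical input
    drop_anon_id.out.view()                        // emits master_anon.csv
}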

process publish_master_metadata {
/**
* Publishes master metadata csv for this category
@@ -238,7 +258,8 @@ workflow clean_geography_cog_uk {
uk_geography(uk_fasta, uk_metadata)
add_uk_geography_to_metadata(uk_metadata,uk_geography.out.geography)
make_delta_by_utla_summary(add_uk_geography_to_metadata.out.metadata)
publish_master_metadata(add_uk_geography_to_metadata.out.metadata, "cog")
drop_anon_id(add_uk_geography_to_metadata.out.metadata)
publish_master_metadata(drop_anon_id.out, "cog")
emit:
metadata = add_uk_geography_to_metadata.out.metadata
}
18 changes: 12 additions & 6 deletions modules/pangolin.nf
@@ -45,6 +45,7 @@ process extract_sequences_for_pangolin {
* @output pangolin_fasta, metadata_with_previous
* @params previous_metadata, update_all_lineage_assignments
*/
memory {task.attempt * 6.GB}

input:
path fasta
Expand Down Expand Up @@ -91,13 +92,15 @@ process run_pangolin {
* @input fasta
* @output pangolin_fasta
*/
cpus 4
memory { task.attempt * 8.GB }

input:
path fasta

output:
path "pangolin/lineage_report.csv", emit: report
path "pangolin/sequences.aln.fasta", emit: alignment
//path "pangolin/sequences.aln.fasta", emit: alignment

script:
if (params.skip_designation_hash)
@@ -106,14 +109,18 @@ process run_pangolin {
--outdir pangolin \
--tempdir pangolin_tmp \
--alignment \
--skip-designation-hash
--analysis-mode fast \
--skip-designation-hash \
-t ${task.cpus}
"""
else
"""
pangolin "${fasta}" \
--outdir pangolin \
--tempdir pangolin_tmp \
--alignment
--alignment \
--analysis-mode fast \
-t ${task.cpus}
"""
}

@@ -124,7 +131,7 @@ process run_pangolin_usher {
* @output pangolin_fasta
*/

cpus 4
cpus 16

input:
path fasta
@@ -149,8 +156,7 @@
--outdir pangolin \
--tempdir pangolin_tmp \
--outfile usher_lineage_report.csv \
--usher \
-t ${task.cpus}
--usher -t ${task.cpus}
"""
}

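The memory { task.attempt * 8.GB } directives added above are closures, re-evaluated on every attempt, so the request only grows if the task is actually retried, i.e. when paired with a retry errorStrategy such as the retry_increasing_mem label configured in nextflow.config at the end of this diff. A self-contained sketch of the pattern, with a hypothetical process name:

process demo_retry_memory {
    // illustrative only, not part of this PR
    memory { task.attempt * 8.GB }   // attempt 1 -> 8 GB, attempt 2 -> 16 GB
    errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
    maxRetries 2

    output:
    stdout

    script:
    """
    echo "attempt ${task.attempt} allocated ${task.memory}"
    """
}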
72 changes: 70 additions & 2 deletions modules/preprocess_cog_uk.nf
@@ -39,6 +39,70 @@ process uk_strip_header_digits_and_unalign {
"""
}

process uk_add_published_date_to_metadata {
/**
* Takes the MAJORA TSV of metadata and adds the published_date column from
* majora.pag_lookup.tsv
* @input uk_metadata, uk_pag_metadata
* @output uk_metadata_updated_date
*/

input:
path uk_updated_metadata
path uk_metadata_pag

output:
path "${uk_updated_metadata.baseName}.pag.csv"

script:
"""
fastafunk add_columns \
--in-metadata ${uk_updated_metadata} \
--in-data ${uk_metadata_pag} \
--index-column central_sample_id \
--join-on central_sample_id \
--force-overwrite \
--new-columns published_date \
--out-metadata "${uk_updated_metadata.baseName}.pag.csv"
"""
}

process uk_anonymise_ids {
/**
For samples published on or after 30th June 2023, replace the
central sample ID with the anonymous sample ID, where present.
@input uk_metadata
@output uk_metadata_anon
*/

input:
path uk_metadata

output:
path "${uk_metadata.baseName}.anon.tsv"

script:
"""
#!/usr/bin/env python3
import datetime
import csv

anon_samp_id_date = datetime.datetime(2023, 6, 30).date()

with open("${uk_metadata}", 'r', newline = '') as csv_in, open("${uk_metadata.baseName}.anon.tsv", 'w', newline = '') as csv_out:
reader = csv.DictReader(csv_in, delimiter=",", quotechar='\"', dialect = "unix")
writer = csv.DictWriter(csv_out, fieldnames = reader.fieldnames, quotechar='\"', quoting=csv.QUOTE_MINIMAL, dialect = "unix", delimiter="\t")
writer.writeheader()

for row in reader:
if datetime.datetime.strptime(row["published_date"], "%Y-%m-%d").date() >= anon_samp_id_date:
if row["anonymous_sample_id"]:
row["central_sample_id"] = row["anonymous_sample_id"]
writer.writerow(row)
"""
}
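The cutoff comparison is inclusive: a sample published exactly on 2023-06-30 is anonymised, and rows with an empty anonymous_sample_id are left untouched. A Groovy restatement of the same rule, with invented sample values, for illustration only:

def cutoff = java.time.LocalDate.of(2023, 6, 30)
def anonymise = { Map row ->
    def published = java.time.LocalDate.parse(row.published_date)  // expects YYYY-MM-DD
    if (!published.isBefore(cutoff) && row.anonymous_sample_id)
        row.central_sample_id = row.anonymous_sample_id
    return row
}
assert anonymise([central_sample_id: 'CAMB-000001', anonymous_sample_id: 'ANON-000001',
                  published_date: '2023-06-30']).central_sample_id == 'ANON-000001'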


process uk_add_columns_to_metadata {
/**
* Takes the MAJORA TSV of metadata and adds/updates columns for sample_date, pillar_2,
Expand Down Expand Up @@ -66,6 +130,7 @@ process uk_add_columns_to_metadata {
"""
}


process uk_filter_omitted_sequences {
/**
* Takes a FASTA and METADATA and excludes samples specified in an exclusion file
@@ -204,7 +269,7 @@ process add_previous_uk_lineage_to_metadata {
* @output metadata
*/

memory { 1.GB * task.attempt + metadata.size() * 2.B }
memory { 2.GB * task.attempt + metadata.size() * 2.B }

input:
path metadata
@@ -280,9 +345,12 @@ workflow preprocess_cog_uk {
uk_fasta
uk_metadata
uk_accessions
uk_pag
main:
uk_strip_header_digits_and_unalign(uk_fasta)
uk_add_columns_to_metadata(uk_metadata, uk_accessions, uk_updated_dates)
uk_add_published_date_to_metadata(uk_metadata, uk_pag)
uk_anonymise_ids(uk_add_published_date_to_metadata.out)
uk_add_columns_to_metadata(uk_anonymise_ids.out, uk_accessions, uk_updated_dates)
uk_filter_omitted_sequences(uk_strip_header_digits_and_unalign.out, uk_add_columns_to_metadata.out, uk_omissions)
uk_filter_on_sample_date(uk_filter_omitted_sequences.out.fasta, uk_filter_omitted_sequences.out.metadata)
add_previous_uk_lineage_to_metadata(uk_filter_omitted_sequences.out.metadata)
4 changes: 2 additions & 2 deletions nextflow.config
@@ -20,6 +20,6 @@ process {
withLabel: retry_increasing_mem {
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
memory = {4.GB * task.attempt}
maxRetries = 2
maxRetries = 5
}
}
}
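Exit codes 137-140 are the range typically produced when a job is killed for exceeding its memory request, so with maxRetries raised to 5 a labelled task now walks up through 4, 8, 12, 16, 20 and 24 GB before failing terminally (assuming Nextflow's usual attempt numbering, where maxRetries = 5 permits six attempts in total). Processes opt in via the label; a hypothetical example:

process example_heavy_step {
    // hypothetical process, illustrative only; inherits the
    // errorStrategy, memory and maxRetries settings above
    label 'retry_increasing_mem'

    input:
    path metadata

    output:
    path "sorted.csv"

    script:
    """
    sort ${metadata} > sorted.csv
    """
}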