Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

separate reference parameters to allow non-genbank file formats #6

Merged
merged 9 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions bin/fix_aa_muts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
import os, json
import argparse


def init_parser():
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--aa_json', required=True, help='aa_muts.json produced by the Augur translate step')
parser.add_argument('-n', '--nt_json', required=True, help='nt_muts.json produced by the Augur ancestral step. Needed to calculate the length of the full nucleotide sequence.')
parser.add_argument('-o', '--outpath', required=True, help='Fixed amino acid mutation node data in JSON format.')

return parser


def main(args):

with open(args.aa_json, 'r') as infile:
aa_data = json.load(infile)

with open(args.nt_json, 'r') as infile:
reference_length = len(json.load(infile)['reference']['nuc'])

if 'nuc' not in aa_data['annotations']:
key = list(aa_data['annotations'].keys())[0]
name = aa_data['annotations'][key]['seqid']

aa_data['annotations']['nuc'] = {
"end": reference_length,
"seqid": name,
"start": 1,
"strand": "+",
"type": "source"
}

with open(args.outpath, 'w') as outfile:
json.dump(aa_data, outfile, indent=4)

if __name__ == "__main__":
args = init_parser().parse_args()
main(args)
96 changes: 67 additions & 29 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ process refine {
publishDir "${params.work_dir}/results/", mode: 'copy'

input:
tuple file(tree), file(msa)
tuple file(tree), file(msa), file(metadata)

output:
tuple path("tree.nwk"), path("branch_lengths.json")
Expand All @@ -135,7 +135,7 @@ process refine {
augur refine \
--tree ${tree} \
--alignment ${msa} \
--metadata ${params.meta} \
--metadata ${metadata} \
--timetree \
--keep-root \
--divergence-units ${params.divergence_units} \
Expand All @@ -154,14 +154,14 @@ process ancestral {
output:
path("nt_muts.json")

"""
augur ancestral \
--tree ${refine_tree} \
--alignment ${msa} \
--output-node-data nt_muts.json \
--keep-overhangs \
--keep-ambiguous
"""
"""
augur ancestral \
--tree ${refine_tree} \
--alignment ${msa} \
--output-node-data nt_muts.json \
--keep-overhangs \
--keep-ambiguous
"""
}

process translate {
Expand All @@ -183,30 +183,46 @@ process translate {
"""
}

process fix_aa_json {
tag "Fixing aa_muts.json when using a GFF3 file for augur translate."
publishDir "${params.work_dir}/results/", mode: 'copy'

input:
tuple file(aa_muts_json), file(nt_muts_json)

output:
file("aa_muts_fix.json")

"""
fix_aa_muts.py --aa_json ${aa_muts_json} --nt_json ${nt_muts_json} --outpath aa_muts_fix.json
"""
}

process export {
tag "Exporting data files for auspice"
publishDir "${params.work_dir}/auspice", mode: 'copy'

input:
tuple file(refine_tree), file(branch_len), file(nt_muts),\
file(aa_muts)
tuple file(refine_tree), file(branch_len), file(nt_muts), file(aa_muts)
file(metadata)
tuple file(auspice_config), file(colors), file(lat_long)

output:
file("flu_na.json")
file("flu.json")

"""
export AUGUR_RECURSION_LIMIT=${params.recursion_limit}

augur export v2 \
--tree ${refine_tree} \
--metadata ${params.meta} \
--node-data ${branch_len} ${nt_muts} ${aa_muts} \
--colors ${params.colors} \
--lat-longs ${params.lat_long} \
--minify-json \
--auspice-config ${params.auspice} \
--output flu_na.json
"""
"""
export AUGUR_RECURSION_LIMIT=${params.recursion_limit}

augur export v2 \
--tree ${refine_tree} \
--metadata ${metadata} \
--node-data ${branch_len} ${nt_muts} ${aa_muts} \
--colors ${colors} \
--lat-longs ${lat_long} \
--minify-json \
--auspice-config ${auspice_config} \
--output flu.json
"""
}

/**
Expand All @@ -218,13 +234,35 @@ workflow
workflow {
seq_ch = Channel.fromPath(params.seqs, checkIfExists:true)
ref_ch = Channel.fromPath(params.ref, checkIfExists:true)
meta_ch = Channel.fromPath(params.meta, checkIfExists:true)
config_ch = Channel.fromPath([params.auspice, params.colors, params.lat_long], checkIfExists:true).collect()


// Catch invalid reference input combinations
if (!(params.ref =~ /.+\.[Gg]b$/) && params.ref_anno == 'NO_FILE' ){ // Cannot have an empty --ref_anno parameter if reference is in non-GenBank format
error "ERROR: Extra parameter --ref_anno (.gff3 or .gb format) must be specified for augur translate if non-GenBank formatted reference is provided."
}else if (params.ref_anno != 'NO_FILE' && !(params.ref_anno =~ /.+\.gff.?|.+\.[Gg]b/) ){ // Can only have .gff or .gb formats in the --ref_anno parameter
error "ERROR: Extra parameter --ref_anno must be in either .gff or .gb (GenBank) format."
}

// Load the ref_anno_ch channel appropriately
if (params.ref_anno == 'NO_FILE'){ // Copy the ref_ch channel if in GenBank format (ref_ch can be reused as ref_anno_ch)
ref_anno_ch = ref_ch
}else{ // Load new channel from scratch if different reference annotation format specified
ref_anno_ch = Channel.fromPath(params.ref_anno, checkIfExists:true)
}

align(seq_ch.combine(ref_ch)) | tree
msa_ch = align.out
refine(tree.out.combine(align.out))
refine(tree.out.combine(msa_ch).combine(meta_ch))
ancestral(refine.out.combine(msa_ch))
translate(ancestral.out.combine(refine.out.combine(ref_ch)))
export(refine.out.combine(ancestral.out.combine(translate.out)))
translate(ancestral.out.combine(refine.out).combine(ref_anno_ch))

ch_aa_muts = translate.out
if (params.ref_anno != 'NO_FILE' && params.ref_anno =~ /.+\.gff.?/ ) { // If gff annotation format used, augur translate outputs need to be fixed (causes downstream schema error)
ch_aa_muts = fix_aa_json(ch_aa_muts.combine(ancestral.out))
}
export(refine.out.combine(ancestral.out).combine(ch_aa_muts), meta_ch, config_ch)
}

/**
Expand Down
71 changes: 37 additions & 34 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,57 +11,60 @@ manifest {

params {

//help message
help = null
//help message
help = null

//version number
version = null
//version number
version = null

//conda env local cache
conda_cache = null
//conda env local cache
conda_cache = null

//directory containing config/ & data/ folders
work_dir = "/path/to/data/"
//directory containing config/ & data/ folders
work_dir = "/path/to/data/"

//reference for alignment
ref = "${params.work_dir}config/Ref.gb"
//reference for alignment
ref = "${params.work_dir}/config/Ref.gb"

//input sequences
seqs = "${params.work_dir}data/sequences.fasta"
//reference annotation (needed for concatenated workflow only)
ref_anno = "NO_FILE"

//metadata of input sequences
meta = "${params.work_dir}data/metadata.csv"
//input sequences
seqs = "${params.work_dir}/data/sequences.fasta"

//strains that are excluded
drop_strains = "${params.work_dir}config/dropped_strains.txt"
//metadata of input sequences
meta = "${params.work_dir}/data/metadata.csv"

//colors used in final auspice visualization
colors = "${params.work_dir}config/colors.csv"
//strains that are excluded
drop_strains = "${params.work_dir}/config/dropped_strains.txt"

//latitude and longitudes
lat_long = "${params.work_dir}config/lat_longs.csv"
//colors used in final auspice visualization
colors = "${params.work_dir}/config/colors.csv"

//details for auspice visualization
auspice = "${params.work_dir}config/auspice_config.json"
//latitude and longitudes
lat_long = "${params.work_dir}/config/lat_longs.csv"

//refining phylogeny
divergence_units = "mutations"
//details for auspice visualization
auspice = "${params.work_dir}/config/auspice_config.json"

//env variable AUGUR_RECURSION_LIMIT
recursion_limit = 10000
//refining phylogeny
divergence_units = "mutations"

//env variable AUGUR_RECURSION_LIMIT
recursion_limit = 10000

}

//seamlessly run pipeline on different execution systems by modifying
//the process section of the config file. ex. AWS, SLURM, sun grid engine:

process {
withName: align {
cpus = 28
withName: align {
cpus = 28
}
withName: tree {
cpus = 28
}
withName: tree {
cpus = 28
}
// penv='smp'
// executor='sge'
// memory='30 GB'
Expand All @@ -83,17 +86,17 @@ profiles {
//html displaying breakdown of time taken to execute workflow
timeline {
enabled = true
file = "${params.work_dir}reports/fluflo_timeline.html"
file = "${params.work_dir}/reports/fluflo_timeline.html"
}

//html of cpu/mem usage
report {
enabled = true
file = "${params.work_dir}reports/fluflo_usage.html"
file = "${params.work_dir}/reports/fluflo_usage.html"
}

//dag of beast-flow workflow
dag {
enabled = true
file = "${params.work_dir}reports/fluflo_dag.html"
file = "${params.work_dir}/reports/fluflo_dag.html"
}