Skip to content

Commit

Permalink
feat(ancestral): make --genes optional when reconstructing translatio…
Browse files Browse the repository at this point in the history
…ns, if ommitted, translate all genes in annotation
  • Loading branch information
corneliusroemer committed Nov 10, 2024
1 parent ab4565f commit 8311506
Show file tree
Hide file tree
Showing 5 changed files with 820 additions and 10 deletions.
16 changes: 9 additions & 7 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def register_parser(parent_subparsers):
)
amino_acid_options_group.add_argument('--annotation',
help='GenBank or GFF file containing the annotation')
amino_acid_options_group.add_argument('--genes', nargs='+', action='extend', help="genes to translate (list or file containing list)")
amino_acid_options_group.add_argument('--genes', nargs='+', action='extend', help="genes to translate (list or file containing list), if not provided, all genes will be translated")
amino_acid_options_group.add_argument('--translations', type=str, help="translated alignments for each CDS/Gene. "
"Currently only supported for FASTA-input. Specify the file name via a "
"template like 'aa_sequences_%%GENE.fasta' where %%GENE will be replaced "
Expand Down Expand Up @@ -349,10 +349,10 @@ def validate_arguments(args, is_vcf):
This checking shouldn't be used by downstream code to assume arguments exist, however by checking for
invalid combinations up-front we can exit quickly.
"""
mandatory_aa_arguments = (args.annotation, args.genes, args.translations)
all_aa_arguments = (*mandatory_aa_arguments, args.use_nextclade_gff_parsing)
mandatory_aa_arguments = (args.annotation, args.translations)
all_aa_arguments = (*mandatory_aa_arguments, args.use_nextclade_gff_parsing, args.genes)
if any(all_aa_arguments) and not all(mandatory_aa_arguments):
raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.")
raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file and a template path to amino acid sequences.")

if args.output_sequences and args.output_vcf:
raise AugurError("Both sequence (fasta) and VCF output have been requested, but these are incompatible.")
Expand Down Expand Up @@ -431,8 +431,8 @@ def run(args):
anc_seqs['annotations'] = {'nuc': {'start': 1, 'end': len(anc_seqs['reference']['nuc']),
'strand': '+', 'type': 'source'}}

if not is_vcf and args.genes:
genes = parse_genes_argument(args.genes)
if not is_vcf and args.annotation:
genes = parse_genes_argument(args.genes) if args.genes else None

from .utils import load_features
## load features; only requested features if genes given
Expand All @@ -444,7 +444,9 @@ def run(args):
f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")

print("Read in {} features from reference sequence file".format(len(features)))
for gene in genes:
for gene in features:
if gene == 'nuc':
continue
print(f"Processing gene: {gene}")
fname = args.translations.replace("%GENE", gene)
feat = features[gene]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Setup

$ source "$TESTDIR"/_setup.sh

Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotations.

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/ebola/tree.nwk \
> --alignment $TESTDIR/../data/ebola/masked.fasta \
> --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
> --use-nextclade-gff-parsing \
> --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
> --infer-ambiguous \
> --inference joint \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> > /dev/null

Check that output is as expected

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
> --exclude-regex-paths "\['seqid'\]" -- \
> "$TESTDIR/../data/ebola/nt_muts.json" \
> "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
{}
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ Setup
$ source "$TESTDIR"/_setup.sh

Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotations.
Only include the specified genes, not all that are in the GFF.

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/ebola/tree.nwk \
> --alignment $TESTDIR/../data/ebola/masked.fasta \
> --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
> --genes GP L NP sGP ssGP VP24 VP30 VP35 VP40 \
> --genes GP sGP ssGP \
> --use-nextclade-gff-parsing \
> --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
> --infer-ambiguous \
Expand All @@ -20,6 +21,6 @@ Check that output is as expected

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
> --exclude-regex-paths "\['seqid'\]" -- \
> "$TESTDIR/../data/ebola/nt_muts.json" \
> "$TESTDIR/../data/ebola/nt_muts_gene_subset.json" \
> "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
{}
2 changes: 1 addition & 1 deletion tests/functional/ancestral/cram/invalid-args.t
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ This should fail.
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --output-node-data "ancestral_mutations.json" > /dev/null
ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
ERROR: For amino acid sequence reconstruction, you must provide an annotation file and a template path to amino acid sequences.
[2]

Missing tree file
Expand Down
Loading

0 comments on commit 8311506

Please sign in to comment.