feat(ancestral): make --genes optional when reconstructing translatio…

…ns, if ommitted, translate all genes in annotation
nextstrain · Nov 10, 2024 · 8311506 · 8311506
1 parent ab4565f
commit 8311506
Show file tree

Hide file tree

Showing 5 changed files with 820 additions and 10 deletions.
diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -314,7 +314,7 @@ def register_parser(parent_subparsers):
     )
     amino_acid_options_group.add_argument('--annotation',
                         help='GenBank or GFF file containing the annotation')
-    amino_acid_options_group.add_argument('--genes', nargs='+', action='extend', help="genes to translate (list or file containing list)")
+    amino_acid_options_group.add_argument('--genes', nargs='+', action='extend', help="genes to translate (list or file containing list), if not provided, all genes will be translated")
     amino_acid_options_group.add_argument('--translations', type=str, help="translated alignments for each CDS/Gene. "
                            "Currently only supported for FASTA-input. Specify the file name via a "
                            "template like 'aa_sequences_%%GENE.fasta' where %%GENE will be replaced "
@@ -349,10 +349,10 @@ def validate_arguments(args, is_vcf):
     This checking shouldn't be used by downstream code to assume arguments exist, however by checking for
     invalid combinations up-front we can exit quickly.
     """
-    mandatory_aa_arguments = (args.annotation, args.genes, args.translations)
-    all_aa_arguments = (*mandatory_aa_arguments, args.use_nextclade_gff_parsing)
+    mandatory_aa_arguments = (args.annotation, args.translations)
+    all_aa_arguments = (*mandatory_aa_arguments, args.use_nextclade_gff_parsing, args.genes)
     if any(all_aa_arguments) and not all(mandatory_aa_arguments):
-        raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.")
+        raise AugurError("For amino acid sequence reconstruction, you must provide an annotation file and a template path to amino acid sequences.")
 
     if args.output_sequences and args.output_vcf:
         raise AugurError("Both sequence (fasta) and VCF output have been requested, but these are incompatible.")
@@ -431,8 +431,8 @@ def run(args):
     anc_seqs['annotations'] = {'nuc': {'start': 1, 'end': len(anc_seqs['reference']['nuc']),
                                        'strand': '+', 'type': 'source'}}
 
-    if not is_vcf and args.genes:
-        genes = parse_genes_argument(args.genes)
+    if not is_vcf and args.annotation:
+        genes = parse_genes_argument(args.genes) if args.genes else None
 
         from .utils import load_features
         ## load features; only requested features if genes given
@@ -444,7 +444,9 @@ def run(args):
                 f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")
 
         print("Read in {} features from reference sequence file".format(len(features)))
-        for gene in genes:
+        for gene in features:
+            if gene == 'nuc':
+                continue
             print(f"Processing gene: {gene}")
             fname = args.translations.replace("%GENE", gene)
             feat = features[gene]

diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-nextclade-gff-all-genes.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-nextclade-gff-all-genes.t
@@ -0,0 +1,24 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotations.
+
+  $ ${AUGUR} ancestral \
+  >  --tree $TESTDIR/../data/ebola/tree.nwk \
+  >  --alignment $TESTDIR/../data/ebola/masked.fasta \
+  >  --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
+  >  --use-nextclade-gff-parsing \
+  >  --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
+  >  --infer-ambiguous \
+  >  --inference joint \
+  >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
+  >  > /dev/null
+
+Check that output is as expected
+
+  $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
+  >   --exclude-regex-paths "\['seqid'\]" -- \
+  >   "$TESTDIR/../data/ebola/nt_muts.json" \
+  >   "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
+  {}
diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-nextclade-gff.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-nextclade-gff.t
@@ -3,12 +3,13 @@ Setup
   $ source "$TESTDIR"/_setup.sh
 
 Infer ancestral nucleotide and amino acid sequences using Nextclade GFF annotations.
+Only include the specified genes, not all that are in the GFF.
 
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/ebola/tree.nwk \
   >  --alignment $TESTDIR/../data/ebola/masked.fasta \
   >  --annotation $TESTDIR/../data/ebola/genome_annotation.gff3 \
-  >  --genes GP L NP sGP ssGP VP24 VP30 VP35 VP40 \
+  >  --genes GP sGP ssGP \
   >  --use-nextclade-gff-parsing \
   >  --translations $TESTDIR/../data/ebola/translations/%GENE.fasta \
   >  --infer-ambiguous \
@@ -20,6 +21,6 @@ Check that output is as expected
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   --exclude-regex-paths "\['seqid'\]" -- \
-  >   "$TESTDIR/../data/ebola/nt_muts.json" \
+  >   "$TESTDIR/../data/ebola/nt_muts_gene_subset.json" \
   >   "$CRAMTMP/$TESTFILE/ancestral_mutations.json"
   {}
diff --git a/tests/functional/ancestral/cram/invalid-args.t b/tests/functional/ancestral/cram/invalid-args.t
@@ -40,7 +40,7 @@ This should fail.
   >  --annotation $TESTDIR/../data/zika_outgroup.gb \
   >  --genes ENV PRO \
   >  --output-node-data "ancestral_mutations.json" > /dev/null
-  ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
+  ERROR: For amino acid sequence reconstruction, you must provide an annotation file and a template path to amino acid sequences.
   [2]
 
 Missing tree file