[translate] Fix reference sequence translation

For JSON inputs we were previously incorrectly exporting the root-sequence translations as the "reference". Instead, we now translate the provided reference (nuc) sequence. (There is some subtlety here because the provided nuc reference sequence may in fact be the root-sequence rather than an actual reference, but this is a problem with `augur ancestral`. See <#1362> for more details.) This allows us to compare the reference translation to the root-sequence translation and thus detail any AA mutations at the root node. A side-effect of this is that we now always export an array of mutations for each gene/CDS at the root node, although this may often be empty. This brings the behaviour of JSON inputs in-line with that of VCF inputs.
nextstrain · Jan 22, 2024 · e744c86 · e744c86
1 parent 2a9f585
commit e744c86
Show file tree

Hide file tree

Showing 5 changed files with 69 additions and 20 deletions.
diff --git a/augur/translate.py b/augur/translate.py
@@ -270,12 +270,15 @@ def assign_aa_vcf(tree, translations):
 
     return aa_muts
 
-def assign_aa_fasta(tree, translations):
+def assign_aa_fasta(tree, translations, reference_translations):
     aa_muts = {}
 
-    #fasta input shouldn't have mutations on root, so give empty entry
+    # Depending on how `augur ancestral` was run, the input JSON (nt_muts.json)
+    # may or may not have mutations defined on the root node. Note that the 'muts'
+    # array will always be present, but it can only contain mutations if a
+    # --root-sequence was provided to augur ancestral.
+
     root = tree.get_nonterminals()[0]
-    aa_muts[root.name]={"aa_muts":{}}
 
     for n in tree.get_nonterminals():
         if n.name is None:
@@ -296,10 +299,17 @@ def assign_aa_fasta(tree, translations):
                     print("no sequence pair for nodes %s-%s"%(c.name, n.name))
 
         if n==tree.root:
-            aa_muts[n.name]={"aa_muts":{}, "aa_sequences":{}}
+            # The aa_sequences at the root are simply the translation from the root-node input data
+            aa_muts[n.name]={"aa_sequences":{}}
             for fname, aln in translations.items():
                 if n.name in aln:
                     aa_muts[n.name]["aa_sequences"][fname] = "".join(aln[n.name])
+            # The aa_muts are found by comparing the aa_sequence with the reference sequence
+            aa_muts[n.name]['aa_muts'] = {}
+            for fname, aln in translations.items():
+                ref = reference_translations[fname]
+                muts = [construct_mut(a, int(pos+1), d) for pos, (a,d) in enumerate(zip(ref, aln[n.name])) if a!=d]
+                aa_muts[n.name]["aa_muts"][fname] = muts
 
     return aa_muts
 
@@ -337,7 +347,8 @@ def sequences_json(node_data_json, tree):
             These must be present under 'nodes' → <node_name> → 'sequence'.
             This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
             """))
-    return sequences
+    reference = node_data['reference']['nuc']
+    return (reference, sequences)
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("translate", help=__doc__)
@@ -401,7 +412,11 @@ def run(args):
 
     ## Read in sequences & for each sequence translate each feature _except for_ the 'nuc' feature name
     ## Note that except for the 'nuc' annotation, `load_features` _only_ looks for 'gene' (GFF files) or 'CDS' (GenBank files)
+    ## The reference translations are straight translations of the "reference.nuc" sequence in the JSON
+    ## (see <https://github.com/nextstrain/augur/issues/1362> for more details here), _or_ for VCF input a translation
+    ## from the provided FASTA reference
     translations = {}
+    reference_translations = {}
     if is_vcf:
         (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
         features_without_variation = []
@@ -410,13 +425,18 @@ def run(args):
                 continue
             try:
                 translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
+                reference_translations[fname] = translations[fname]['reference']
             except NoVariationError:
                 features_without_variation.append(fname)
         if len(features_without_variation):
             print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
     else:
-        sequences = sequences_json(args.ancestral_sequences, tree)
+        (reference, sequences) = sequences_json(args.ancestral_sequences, tree)
         translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
+        for fname, feat in features.items():
+            if fname=='nuc':
+                continue
+            reference_translations[fname] = safe_translate(str(feat.extract(reference)))
 
     ## glob the annotations for later auspice export
     #
@@ -442,7 +462,7 @@ def run(args):
         if is_vcf:
             aa_muts = assign_aa_vcf(tree, translations)
         else:
-            aa_muts = assign_aa_fasta(tree, translations)
+            aa_muts = assign_aa_fasta(tree, translations, reference_translations)
     except MissingNodeError as err:
         print("\n*** ERROR: Some/all nodes have no node names!")
         print("*** Please check you are providing the tree output by 'augur refine'.")
@@ -456,14 +476,7 @@ def run(args):
         print("*** Or, re-run 'ancestral' using %s, then use the new %s as input here."%(args.tree, args.ancestral_sequences))
         return 1
 
-    output_data = {'annotations':annotations, 'nodes':aa_muts}
-    if is_vcf:
-        output_data['reference'] = {}
-        for fname in translations:
-            output_data['reference'][fname] = translations[fname]['reference']
-    else:
-        output_data['reference'] = aa_muts[tree.root.name]['aa_sequences']
-
+    output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations}
     out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
     write_json(output_data, out_name)
     print("amino acid mutations written to", out_name, file=sys.stdout)

diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
@@ -23,9 +23,8 @@ as a feature ('nuc' in this case)
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   "$DATA/aa_muts.json" \
   >   "aa_muts.genes-args.json" \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
-  {'dictionary_item_removed': [root['annotations']['gene1'], root['nodes']['node_AB']['aa_muts']['gene1'], root['nodes']['node_root']['aa_sequences']['gene1'], root['nodes']['sample_A']['aa_muts']['gene1'], root['nodes']['sample_B']['aa_muts']['gene1'], root['nodes']['sample_C']['aa_muts']['gene1'], root['reference']['gene1']]}
-
+  >   --exclude-regex-paths "seqid" "gene1"
+  {}
 Using a text file rather than command line arguments
 
   $ echo -e "#comment\ngene2\ngene3"> "genes.txt"

diff --git a/tests/functional/translate/cram/root-mutations.t b/tests/functional/translate/cram/root-mutations.t
@@ -0,0 +1,31 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
+  $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome"
+  $ export DATA="$TESTDIR/../data/simple-genome"
+
+This is the same as the "general.t" test, but we are modifying the input data
+such that the reference sequence contains "G" at pos 20 (1-based), and
+include a compensating mutation G20A on the root node.
+This results in the reference translation of gene1 to be MPCE* not MPCG*.
+(Note that the compensating nuc mutation doesn't actually need to be present
+in the JSON, `augur translate` just looks at the sequence attached to each node.)
+
+  $ sed '29s/^/        "G20A",\n/' "$ANC_DATA/nt_muts.ref-seq.json" | 
+  > sed 's/"nuc": "AAAAAAAAAATGCCCTGCGGG/"nuc": "AAAAAAAAAATGCCCTGCGAG/' > nt_muts.json
+
+  $ "${AUGUR}" translate \
+  >  --tree "$ANC_DATA/tree.nwk" \
+  >  --ancestral-sequences nt_muts.json \
+  >  --reference-sequence "$DATA/reference.gff" \
+  >  --output-node-data "aa_muts.json" > /dev/null
+
+The output should be a gene1 reference of MPCE* (not MPCG*). The root-sequence
+is unchanged (MPCG*). There is also a mutation E4G at the root node to compensate.
+
+  $ python3 "$SCRIPTS/diff_jsons.py" \
+  >   "$DATA/aa_muts.json" \
+  >   "aa_muts.json" \
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
+  {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][0]": 'E4G'}}
diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json
@@ -38,7 +38,10 @@
       }
     },
     "node_root": {
-      "aa_muts": {},
+      "aa_muts": {
+        "gene1": [],
+        "gene2": []
+      },
       "aa_sequences": {
         "gene1": "MPCG*",
         "gene2": "MVK*"

diff --git a/tests/functional/translate/data/zika/aa_muts_genbank.json b/tests/functional/translate/data/zika/aa_muts_genbank.json
@@ -98,7 +98,10 @@
       }
     },
     "NODE_0000006": {
-      "aa_muts": {},
+      "aa_muts": {
+        "CA": [],
+        "PRO": []
+      },
       "aa_sequences": {
         "CA": "MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRMVLAILAFLRFTAIKPSLGLINRWGSVGKKEAMEIIKKFKKDLAAMLRIINARKEKKRRGADTSVGIVGLLLTTAMA",
         "PRO": "AEVTRRGSAYYMYLDRNDAGEAISFPTTLGMNKCYIQIMDLGHMCDATMSYECPMLDEGVEPDDVDCWCNTTSTWVVYGTCHHKKGEARRSRR"