From e744c8627d433ccddc5d36934f3753fbd38ae2cd Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Thu, 21 Dec 2023 17:24:13 +1300
Subject: [PATCH] [translate] Fix reference sequence translation

For JSON inputs we were previously incorrectly exporting the
root-sequence translations as the "reference". Instead, we now translate
the provided reference (nuc) sequence. (There is some subtlety here
because the provided nuc reference sequence may in fact be the
root-sequence rather than an actual reference, but this is a problem
with `augur ancestral`. See
<https://github.com/nextstrain/augur/issues/1362> for more details.)

This allows us to compare the reference translation to the root-sequence
translation and thus detail any AA mutations at the root node. A
side-effect of this is that we now always export an array of mutations
for each gene/CDS at the root node, although this may often be empty.

This brings the behaviour of JSON inputs in-line with that of VCF
inputs.
---
 augur/translate.py                            | 43 ++++++++++++-------
 tests/functional/translate/cram/genes.t       |  5 +--
 .../translate/cram/root-mutations.t           | 31 +++++++++++++
 .../translate/data/simple-genome/aa_muts.json |  5 ++-
 .../translate/data/zika/aa_muts_genbank.json  |  5 ++-
 5 files changed, 69 insertions(+), 20 deletions(-)
 create mode 100644 tests/functional/translate/cram/root-mutations.t

diff --git a/augur/translate.py b/augur/translate.py
index 6329c6778..bfc072fb6 100644
--- a/augur/translate.py
+++ b/augur/translate.py
@@ -270,12 +270,15 @@ def assign_aa_vcf(tree, translations):
 
     return aa_muts
 
-def assign_aa_fasta(tree, translations):
+def assign_aa_fasta(tree, translations, reference_translations):
     aa_muts = {}
 
-    #fasta input shouldn't have mutations on root, so give empty entry
+    # Depending on how `augur ancestral` was run, the input JSON (nt_muts.json)
+    # may or may not have mutations defined on the root node. Note that the 'muts'
+    # array will always be present, but it can only contain mutations if a
+    # --root-sequence was provided to augur ancestral.
+
     root = tree.get_nonterminals()[0]
-    aa_muts[root.name]={"aa_muts":{}}
 
     for n in tree.get_nonterminals():
         if n.name is None:
@@ -296,10 +299,17 @@ def assign_aa_fasta(tree, translations):
                     print("no sequence pair for nodes %s-%s"%(c.name, n.name))
 
         if n==tree.root:
-            aa_muts[n.name]={"aa_muts":{}, "aa_sequences":{}}
+            # The aa_sequences at the root are simply the translation from the root-node input data
+            aa_muts[n.name]={"aa_sequences":{}}
             for fname, aln in translations.items():
                 if n.name in aln:
                     aa_muts[n.name]["aa_sequences"][fname] = "".join(aln[n.name])
+            # The aa_muts are found by comparing the aa_sequence with the reference sequence
+            aa_muts[n.name]['aa_muts'] = {}
+            for fname, aln in translations.items():
+                ref = reference_translations[fname]
+                muts = [construct_mut(a, int(pos+1), d) for pos, (a,d) in enumerate(zip(ref, aln[n.name])) if a!=d]
+                aa_muts[n.name]["aa_muts"][fname] = muts
 
     return aa_muts
 
@@ -337,7 +347,8 @@ def sequences_json(node_data_json, tree):
             These must be present under 'nodes' → <node_name> → 'sequence'.
             This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
             """))
-    return sequences
+    reference = node_data['reference']['nuc']
+    return (reference, sequences)
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("translate", help=__doc__)
@@ -401,7 +412,11 @@ def run(args):
 
     ## Read in sequences & for each sequence translate each feature _except for_ the 'nuc' feature name
     ## Note that except for the 'nuc' annotation, `load_features` _only_ looks for 'gene' (GFF files) or 'CDS' (GenBank files)
+    ## The reference translations are straight translations of the "reference.nuc" sequence in the JSON
+    ## (see <https://github.com/nextstrain/augur/issues/1362> for more details here), _or_ for VCF input a translation
+    ## from the provided FASTA reference
     translations = {}
+    reference_translations = {}
     if is_vcf:
         (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
         features_without_variation = []
@@ -410,13 +425,18 @@ def run(args):
                 continue
             try:
                 translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
+                reference_translations[fname] = translations[fname]['reference']
             except NoVariationError:
                 features_without_variation.append(fname)
         if len(features_without_variation):
             print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))  
     else:
-        sequences = sequences_json(args.ancestral_sequences, tree)
+        (reference, sequences) = sequences_json(args.ancestral_sequences, tree)
         translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
+        for fname, feat in features.items():
+            if fname=='nuc':
+                continue
+            reference_translations[fname] = safe_translate(str(feat.extract(reference)))
 
     ## glob the annotations for later auspice export
     #
@@ -442,7 +462,7 @@ def run(args):
         if is_vcf:
             aa_muts = assign_aa_vcf(tree, translations)
         else:
-            aa_muts = assign_aa_fasta(tree, translations)
+            aa_muts = assign_aa_fasta(tree, translations, reference_translations)
     except MissingNodeError as err:
         print("\n*** ERROR: Some/all nodes have no node names!")
         print("*** Please check you are providing the tree output by 'augur refine'.")
@@ -456,14 +476,7 @@ def run(args):
         print("*** Or, re-run 'ancestral' using %s, then use the new %s as input here."%(args.tree, args.ancestral_sequences))
         return 1
 
-    output_data = {'annotations':annotations, 'nodes':aa_muts}
-    if is_vcf:
-        output_data['reference'] = {}
-        for fname in translations:
-            output_data['reference'][fname] = translations[fname]['reference']
-    else:
-        output_data['reference'] = aa_muts[tree.root.name]['aa_sequences']
-
+    output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations}
     out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
     write_json(output_data, out_name)
     print("amino acid mutations written to", out_name, file=sys.stdout)
diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t
index 126b6f904..f09fb91aa 100644
--- a/tests/functional/translate/cram/genes.t
+++ b/tests/functional/translate/cram/genes.t
@@ -23,9 +23,8 @@ as a feature ('nuc' in this case)
   $ python3 "$SCRIPTS/diff_jsons.py" \
   >   "$DATA/aa_muts.json" \
   >   "aa_muts.genes-args.json" \
-  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
-  {'dictionary_item_removed': [root['annotations']['gene1'], root['nodes']['node_AB']['aa_muts']['gene1'], root['nodes']['node_root']['aa_sequences']['gene1'], root['nodes']['sample_A']['aa_muts']['gene1'], root['nodes']['sample_B']['aa_muts']['gene1'], root['nodes']['sample_C']['aa_muts']['gene1'], root['reference']['gene1']]}
-
+  >   --exclude-regex-paths "seqid" "gene1"
+  {}
 Using a text file rather than command line arguments
 
   $ echo -e "#comment\ngene2\ngene3"> "genes.txt"
diff --git a/tests/functional/translate/cram/root-mutations.t b/tests/functional/translate/cram/root-mutations.t
new file mode 100644
index 000000000..dddd8c7a8
--- /dev/null
+++ b/tests/functional/translate/cram/root-mutations.t
@@ -0,0 +1,31 @@
+Setup
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export SCRIPTS="$TESTDIR/../../../../scripts"
+  $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome"
+  $ export DATA="$TESTDIR/../data/simple-genome"
+
+This is the same as the "general.t" test, but we are modifying the input data
+such that the reference sequence contains "G" at pos 20 (1-based), and
+include a compensating mutation G20A on the root node.
+This results in the reference translation of gene1 to be MPCE* not MPCG*.
+(Note that the compensating nuc mutation doesn't actually need to be present
+in the JSON, `augur translate` just looks at the sequence attached to each node.)
+
+  $ sed '29s/^/        "G20A",\n/' "$ANC_DATA/nt_muts.ref-seq.json" | 
+  > sed 's/"nuc": "AAAAAAAAAATGCCCTGCGGG/"nuc": "AAAAAAAAAATGCCCTGCGAG/' > nt_muts.json
+
+  $ "${AUGUR}" translate \
+  >  --tree "$ANC_DATA/tree.nwk" \
+  >  --ancestral-sequences nt_muts.json \
+  >  --reference-sequence "$DATA/reference.gff" \
+  >  --output-node-data "aa_muts.json" > /dev/null
+
+The output should be a gene1 reference of MPCE* (not MPCG*). The root-sequence
+is unchanged (MPCG*). There is also a mutation E4G at the root node to compensate.
+
+  $ python3 "$SCRIPTS/diff_jsons.py" \
+  >   "$DATA/aa_muts.json" \
+  >   "aa_muts.json" \
+  >   --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
+  {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][0]": 'E4G'}}
\ No newline at end of file
diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json
index 8fe9db91a..375ee7cb3 100644
--- a/tests/functional/translate/data/simple-genome/aa_muts.json
+++ b/tests/functional/translate/data/simple-genome/aa_muts.json
@@ -38,7 +38,10 @@
       }
     },
     "node_root": {
-      "aa_muts": {},
+      "aa_muts": {
+        "gene1": [],
+        "gene2": []
+      },
       "aa_sequences": {
         "gene1": "MPCG*",
         "gene2": "MVK*"
diff --git a/tests/functional/translate/data/zika/aa_muts_genbank.json b/tests/functional/translate/data/zika/aa_muts_genbank.json
index 075306e9e..720073d8c 100644
--- a/tests/functional/translate/data/zika/aa_muts_genbank.json
+++ b/tests/functional/translate/data/zika/aa_muts_genbank.json
@@ -98,7 +98,10 @@
       }
     },
     "NODE_0000006": {
-      "aa_muts": {},
+      "aa_muts": {
+        "CA": [],
+        "PRO": []
+      },
       "aa_sequences": {
         "CA": "MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRMVLAILAFLRFTAIKPSLGLINRWGSVGKKEAMEIIKKFKKDLAAMLRIINARKEKKRRGADTSVGIVGLLLTTAMA",
         "PRO": "AEVTRRGSAYYMYLDRNDAGEAISFPTTLGMNKCYIQIMDLGHMCDATMSYECPMLDEGVEPDDVDCWCNTTSTWVVYGTCHHKKGEARRSRR"