Skip to content

Commit

Permalink
[translate] Fix reference sequence translation
Browse files Browse the repository at this point in the history
For JSON inputs we were previously incorrectly exporting the
root-sequence translations as the "reference". Instead, we now translate
the provided reference (nuc) sequence. (There is some subtlety here
because the provided nuc reference sequence may in fact be the
root-sequence rather than an actual reference, but this is a problem
with `augur ancestral`. See
<#1362> for more details.)

This allows us to compare the reference translation to the root-sequence
translation and thus detail any AA mutations at the root node. A
side-effect of this is that we now always export an array of mutations
for each gene/CDS at the root node, although this may often be empty.

This brings the behaviour of JSON inputs in-line with that of VCF
inputs.
  • Loading branch information
jameshadfield committed Jan 22, 2024
1 parent 2a9f585 commit e744c86
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 20 deletions.
43 changes: 28 additions & 15 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,12 +270,15 @@ def assign_aa_vcf(tree, translations):

return aa_muts

def assign_aa_fasta(tree, translations):
def assign_aa_fasta(tree, translations, reference_translations):
aa_muts = {}

#fasta input shouldn't have mutations on root, so give empty entry
# Depending on how `augur ancestral` was run, the input JSON (nt_muts.json)
# may or may not have mutations defined on the root node. Note that the 'muts'
# array will always be present, but it can only contain mutations if a
# --root-sequence was provided to augur ancestral.

root = tree.get_nonterminals()[0]
aa_muts[root.name]={"aa_muts":{}}

for n in tree.get_nonterminals():
if n.name is None:
Expand All @@ -296,10 +299,17 @@ def assign_aa_fasta(tree, translations):
print("no sequence pair for nodes %s-%s"%(c.name, n.name))

if n==tree.root:
aa_muts[n.name]={"aa_muts":{}, "aa_sequences":{}}
# The aa_sequences at the root are simply the translation from the root-node input data
aa_muts[n.name]={"aa_sequences":{}}
for fname, aln in translations.items():
if n.name in aln:
aa_muts[n.name]["aa_sequences"][fname] = "".join(aln[n.name])
# The aa_muts are found by comparing the aa_sequence with the reference sequence
aa_muts[n.name]['aa_muts'] = {}
for fname, aln in translations.items():
ref = reference_translations[fname]
muts = [construct_mut(a, int(pos+1), d) for pos, (a,d) in enumerate(zip(ref, aln[n.name])) if a!=d]
aa_muts[n.name]["aa_muts"][fname] = muts

return aa_muts

Expand Down Expand Up @@ -337,7 +347,8 @@ def sequences_json(node_data_json, tree):
These must be present under 'nodes' → <node_name> → 'sequence'.
This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here.
"""))
return sequences
reference = node_data['reference']['nuc']
return (reference, sequences)

def register_parser(parent_subparsers):
parser = parent_subparsers.add_parser("translate", help=__doc__)
Expand Down Expand Up @@ -401,7 +412,11 @@ def run(args):

## Read in sequences & for each sequence translate each feature _except for_ the 'nuc' feature name
## Note that except for the 'nuc' annotation, `load_features` _only_ looks for 'gene' (GFF files) or 'CDS' (GenBank files)
## The reference translations are straight translations of the "reference.nuc" sequence in the JSON
## (see <https://github.com/nextstrain/augur/issues/1362> for more details here), _or_ for VCF input a translation
## from the provided FASTA reference
translations = {}
reference_translations = {}
if is_vcf:
(sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences)
features_without_variation = []
Expand All @@ -410,13 +425,18 @@ def run(args):
continue
try:
translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
reference_translations[fname] = translations[fname]['reference']
except NoVariationError:
features_without_variation.append(fname)
if len(features_without_variation):
print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation)))
else:
sequences = sequences_json(args.ancestral_sequences, tree)
(reference, sequences) = sequences_json(args.ancestral_sequences, tree)
translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'}
for fname, feat in features.items():
if fname=='nuc':
continue
reference_translations[fname] = safe_translate(str(feat.extract(reference)))

## glob the annotations for later auspice export
#
Expand All @@ -442,7 +462,7 @@ def run(args):
if is_vcf:
aa_muts = assign_aa_vcf(tree, translations)
else:
aa_muts = assign_aa_fasta(tree, translations)
aa_muts = assign_aa_fasta(tree, translations, reference_translations)
except MissingNodeError as err:
print("\n*** ERROR: Some/all nodes have no node names!")
print("*** Please check you are providing the tree output by 'augur refine'.")
Expand All @@ -456,14 +476,7 @@ def run(args):
print("*** Or, re-run 'ancestral' using %s, then use the new %s as input here."%(args.tree, args.ancestral_sequences))
return 1

output_data = {'annotations':annotations, 'nodes':aa_muts}
if is_vcf:
output_data['reference'] = {}
for fname in translations:
output_data['reference'][fname] = translations[fname]['reference']
else:
output_data['reference'] = aa_muts[tree.root.name]['aa_sequences']

output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations}
out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json')
write_json(output_data, out_name)
print("amino acid mutations written to", out_name, file=sys.stdout)
Expand Down
5 changes: 2 additions & 3 deletions tests/functional/translate/cram/genes.t
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ as a feature ('nuc' in this case)
$ python3 "$SCRIPTS/diff_jsons.py" \
> "$DATA/aa_muts.json" \
> "aa_muts.genes-args.json" \
> --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
{'dictionary_item_removed': [root['annotations']['gene1'], root['nodes']['node_AB']['aa_muts']['gene1'], root['nodes']['node_root']['aa_sequences']['gene1'], root['nodes']['sample_A']['aa_muts']['gene1'], root['nodes']['sample_B']['aa_muts']['gene1'], root['nodes']['sample_C']['aa_muts']['gene1'], root['reference']['gene1']]}

> --exclude-regex-paths "seqid" "gene1"
{}
Using a text file rather than command line arguments

$ echo -e "#comment\ngene2\ngene3"> "genes.txt"
Expand Down
31 changes: 31 additions & 0 deletions tests/functional/translate/cram/root-mutations.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
$ export SCRIPTS="$TESTDIR/../../../../scripts"
$ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome"
$ export DATA="$TESTDIR/../data/simple-genome"

This is the same as the "general.t" test, but we are modifying the input data
such that the reference sequence contains "G" at pos 20 (1-based), and
include a compensating mutation G20A on the root node.
This results in the reference translation of gene1 to be MPCE* not MPCG*.
(Note that the compensating nuc mutation doesn't actually need to be present
in the JSON, `augur translate` just looks at the sequence attached to each node.)

$ sed '29s/^/ "G20A",\n/' "$ANC_DATA/nt_muts.ref-seq.json" |
> sed 's/"nuc": "AAAAAAAAAATGCCCTGCGGG/"nuc": "AAAAAAAAAATGCCCTGCGAG/' > nt_muts.json

$ "${AUGUR}" translate \
> --tree "$ANC_DATA/tree.nwk" \
> --ancestral-sequences nt_muts.json \
> --reference-sequence "$DATA/reference.gff" \
> --output-node-data "aa_muts.json" > /dev/null

The output should be a gene1 reference of MPCE* (not MPCG*). The root-sequence
is unchanged (MPCG*). There is also a mutation E4G at the root node to compensate.

$ python3 "$SCRIPTS/diff_jsons.py" \
> "$DATA/aa_muts.json" \
> "aa_muts.json" \
> --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]"
{'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][0]": 'E4G'}}
5 changes: 4 additions & 1 deletion tests/functional/translate/data/simple-genome/aa_muts.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@
}
},
"node_root": {
"aa_muts": {},
"aa_muts": {
"gene1": [],
"gene2": []
},
"aa_sequences": {
"gene1": "MPCG*",
"gene2": "MVK*"
Expand Down
5 changes: 4 additions & 1 deletion tests/functional/translate/data/zika/aa_muts_genbank.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,10 @@
}
},
"NODE_0000006": {
"aa_muts": {},
"aa_muts": {
"CA": [],
"PRO": []
},
"aa_sequences": {
"CA": "MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRMVLAILAFLRFTAIKPSLGLINRWGSVGKKEAMEIIKKFKKDLAAMLRIINARKEKKRRGADTSVGIVGLLLTTAMA",
"PRO": "AEVTRRGSAYYMYLDRNDAGEAISFPTTLGMNKCYIQIMDLGHMCDATMSYECPMLDEGVEPDDVDCWCNTTSTWVVYGTCHHKKGEARRSRR"
Expand Down

0 comments on commit e744c86

Please sign in to comment.