diff --git a/augur/translate.py b/augur/translate.py index 6329c6778..bfc072fb6 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -270,12 +270,15 @@ def assign_aa_vcf(tree, translations): return aa_muts -def assign_aa_fasta(tree, translations): +def assign_aa_fasta(tree, translations, reference_translations): aa_muts = {} - #fasta input shouldn't have mutations on root, so give empty entry + # Depending on how `augur ancestral` was run, the input JSON (nt_muts.json) + # may or may not have mutations defined on the root node. Note that the 'muts' + # array will always be present, but it can only contain mutations if a + # --root-sequence was provided to augur ancestral. + root = tree.get_nonterminals()[0] - aa_muts[root.name]={"aa_muts":{}} for n in tree.get_nonterminals(): if n.name is None: @@ -296,10 +299,17 @@ def assign_aa_fasta(tree, translations): print("no sequence pair for nodes %s-%s"%(c.name, n.name)) if n==tree.root: - aa_muts[n.name]={"aa_muts":{}, "aa_sequences":{}} + # The aa_sequences at the root are simply the translation from the root-node input data + aa_muts[n.name]={"aa_sequences":{}} for fname, aln in translations.items(): if n.name in aln: aa_muts[n.name]["aa_sequences"][fname] = "".join(aln[n.name]) + # The aa_muts are found by comparing the aa_sequence with the reference sequence + aa_muts[n.name]['aa_muts'] = {} + for fname, aln in translations.items(): + ref = reference_translations[fname] + muts = [construct_mut(a, int(pos+1), d) for pos, (a,d) in enumerate(zip(ref, aln[n.name])) if a!=d] + aa_muts[n.name]["aa_muts"][fname] = muts return aa_muts @@ -337,7 +347,8 @@ def sequences_json(node_data_json, tree): These must be present under 'nodes' → → 'sequence'. This error may originate from using 'augur ancestral' with VCF input; in this case try using VCF output from that command here. """)) - return sequences + reference = node_data['reference']['nuc'] + return (reference, sequences) def register_parser(parent_subparsers): parser = parent_subparsers.add_parser("translate", help=__doc__) @@ -401,7 +412,11 @@ def run(args): ## Read in sequences & for each sequence translate each feature _except for_ the 'nuc' feature name ## Note that except for the 'nuc' annotation, `load_features` _only_ looks for 'gene' (GFF files) or 'CDS' (GenBank files) + ## The reference translations are straight translations of the "reference.nuc" sequence in the JSON + ## (see for more details here), _or_ for VCF input a translation + ## from the provided FASTA reference translations = {} + reference_translations = {} if is_vcf: (sequences, ref) = sequences_vcf(args.vcf_reference, args.ancestral_sequences) features_without_variation = [] @@ -410,13 +425,18 @@ def run(args): continue try: translations[fname] = translate_vcf_feature(sequences, ref, feat, fname) + reference_translations[fname] = translations[fname]['reference'] except NoVariationError: features_without_variation.append(fname) if len(features_without_variation): print("{} genes had no mutations and so have been be excluded.".format(len(features_without_variation))) else: - sequences = sequences_json(args.ancestral_sequences, tree) + (reference, sequences) = sequences_json(args.ancestral_sequences, tree) translations = {fname: translate_feature(sequences, feat) for fname, feat in features.items() if fname!='nuc'} + for fname, feat in features.items(): + if fname=='nuc': + continue + reference_translations[fname] = safe_translate(str(feat.extract(reference))) ## glob the annotations for later auspice export # @@ -442,7 +462,7 @@ def run(args): if is_vcf: aa_muts = assign_aa_vcf(tree, translations) else: - aa_muts = assign_aa_fasta(tree, translations) + aa_muts = assign_aa_fasta(tree, translations, reference_translations) except MissingNodeError as err: print("\n*** ERROR: Some/all nodes have no node names!") print("*** Please check you are providing the tree output by 'augur refine'.") @@ -456,14 +476,7 @@ def run(args): print("*** Or, re-run 'ancestral' using %s, then use the new %s as input here."%(args.tree, args.ancestral_sequences)) return 1 - output_data = {'annotations':annotations, 'nodes':aa_muts} - if is_vcf: - output_data['reference'] = {} - for fname in translations: - output_data['reference'][fname] = translations[fname]['reference'] - else: - output_data['reference'] = aa_muts[tree.root.name]['aa_sequences'] - + output_data = {'annotations':annotations, 'nodes':aa_muts, 'reference': reference_translations} out_name = get_json_name(args, '.'.join(args.tree.split('.')[:-1]) + '_aa-mutations.json') write_json(output_data, out_name) print("amino acid mutations written to", out_name, file=sys.stdout) diff --git a/tests/functional/translate/cram/genes.t b/tests/functional/translate/cram/genes.t index 126b6f904..f09fb91aa 100644 --- a/tests/functional/translate/cram/genes.t +++ b/tests/functional/translate/cram/genes.t @@ -23,9 +23,8 @@ as a feature ('nuc' in this case) $ python3 "$SCRIPTS/diff_jsons.py" \ > "$DATA/aa_muts.json" \ > "aa_muts.genes-args.json" \ - > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" - {'dictionary_item_removed': [root['annotations']['gene1'], root['nodes']['node_AB']['aa_muts']['gene1'], root['nodes']['node_root']['aa_sequences']['gene1'], root['nodes']['sample_A']['aa_muts']['gene1'], root['nodes']['sample_B']['aa_muts']['gene1'], root['nodes']['sample_C']['aa_muts']['gene1'], root['reference']['gene1']]} - + > --exclude-regex-paths "seqid" "gene1" + {} Using a text file rather than command line arguments $ echo -e "#comment\ngene2\ngene3"> "genes.txt" diff --git a/tests/functional/translate/cram/root-mutations.t b/tests/functional/translate/cram/root-mutations.t new file mode 100644 index 000000000..dddd8c7a8 --- /dev/null +++ b/tests/functional/translate/cram/root-mutations.t @@ -0,0 +1,31 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + $ export SCRIPTS="$TESTDIR/../../../../scripts" + $ export ANC_DATA="$TESTDIR/../../ancestral/data/simple-genome" + $ export DATA="$TESTDIR/../data/simple-genome" + +This is the same as the "general.t" test, but we are modifying the input data +such that the reference sequence contains "G" at pos 20 (1-based), and +include a compensating mutation G20A on the root node. +This results in the reference translation of gene1 to be MPCE* not MPCG*. +(Note that the compensating nuc mutation doesn't actually need to be present +in the JSON, `augur translate` just looks at the sequence attached to each node.) + + $ sed '29s/^/ "G20A",\n/' "$ANC_DATA/nt_muts.ref-seq.json" | + > sed 's/"nuc": "AAAAAAAAAATGCCCTGCGGG/"nuc": "AAAAAAAAAATGCCCTGCGAG/' > nt_muts.json + + $ "${AUGUR}" translate \ + > --tree "$ANC_DATA/tree.nwk" \ + > --ancestral-sequences nt_muts.json \ + > --reference-sequence "$DATA/reference.gff" \ + > --output-node-data "aa_muts.json" > /dev/null + +The output should be a gene1 reference of MPCE* (not MPCG*). The root-sequence +is unchanged (MPCG*). There is also a mutation E4G at the root node to compensate. + + $ python3 "$SCRIPTS/diff_jsons.py" \ + > "$DATA/aa_muts.json" \ + > "aa_muts.json" \ + > --exclude-regex-paths "root\['annotations'\]\['.+'\]\['seqid'\]" + {'values_changed': {"root['reference']['gene1']": {'new_value': 'MPCE*', 'old_value': 'MPCG*'}}, 'iterable_item_added': {"root['nodes']['node_root']['aa_muts']['gene1'][0]": 'E4G'}} \ No newline at end of file diff --git a/tests/functional/translate/data/simple-genome/aa_muts.json b/tests/functional/translate/data/simple-genome/aa_muts.json index 8fe9db91a..375ee7cb3 100644 --- a/tests/functional/translate/data/simple-genome/aa_muts.json +++ b/tests/functional/translate/data/simple-genome/aa_muts.json @@ -38,7 +38,10 @@ } }, "node_root": { - "aa_muts": {}, + "aa_muts": { + "gene1": [], + "gene2": [] + }, "aa_sequences": { "gene1": "MPCG*", "gene2": "MVK*" diff --git a/tests/functional/translate/data/zika/aa_muts_genbank.json b/tests/functional/translate/data/zika/aa_muts_genbank.json index 075306e9e..720073d8c 100644 --- a/tests/functional/translate/data/zika/aa_muts_genbank.json +++ b/tests/functional/translate/data/zika/aa_muts_genbank.json @@ -98,7 +98,10 @@ } }, "NODE_0000006": { - "aa_muts": {}, + "aa_muts": { + "CA": [], + "PRO": [] + }, "aa_sequences": { "CA": "MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRMVLAILAFLRFTAIKPSLGLINRWGSVGKKEAMEIIKKFKKDLAAMLRIINARKEKKRRGADTSVGIVGLLLTTAMA", "PRO": "AEVTRRGSAYYMYLDRNDAGEAISFPTTLGMNKCYIQIMDLGHMCDATMSYECPMLDEGVEPDDVDCWCNTTSTWVVYGTCHHKKGEARRSRR"