From f9a06aadedebd045bb8081f0355ef521863b3f42 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 29 Aug 2023 15:51:56 -0700 Subject: [PATCH] Use new augur ancestral interface Updates augur ancestral rule in the core workflow to generate ancestral translations in addition to ancestral nucleotide sequences. Removes the separate rule for translation and the corresponding custom script. Updates all downstream rules to reference a new single `muts.json` file. Rules that depended on the translations with internal node sequences do not need to change, since the output files from the updated ancestral rule maintain the same name. --- scripts/translations_aamuts.py | 94 ----------------------------- workflow/snakemake_rules/core.smk | 61 +++++++------------ workflow/snakemake_rules/export.smk | 1 - 3 files changed, 21 insertions(+), 135 deletions(-) delete mode 100644 scripts/translations_aamuts.py diff --git a/scripts/translations_aamuts.py b/scripts/translations_aamuts.py deleted file mode 100644 index 01991789..00000000 --- a/scripts/translations_aamuts.py +++ /dev/null @@ -1,94 +0,0 @@ -import argparse -import json -from Bio import Phylo, SeqIO, Seq -from Bio.Align import MultipleSeqAlignment -from treetime import TreeAnc - - -def read_gff(fname): - try: - from BCBio import GFF #Package name is confusing - tell user exactly what they need! - except ImportError: - print("ERROR: Package BCBio.GFF not found! Please install using \'pip install bcbio-gff\' before re-running.") - return None - - features = {} - with open(fname, encoding='utf-8') as in_handle: - for rec in GFF.parse(in_handle): - for feat in rec.features: - if "gene_name" in feat.qualifiers: - fname = feat.qualifiers["gene_name"][0] - if fname: - features[fname] = feat - - return features - -def annotation_json(features, reference): - annotations = {} - for fname, feat in features.items(): - annotations[fname] = {'seqid':reference.id, - 'type':feat.type, - 'start':int(feat.location.start)+1, - 'end':int(feat.location.end), - 'strand': '+' if feat.location.strand else '-'} - annotations['nuc'] = {'seqid':reference.id, - 'type':'source', - 'start': 1, - 'end': len(reference), - 'strand': '+'} - return annotations - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Add translations", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--tree', type=str, required=True, help="input tree") - parser.add_argument('--annotation', type=str, required=True, help="gff annotation file") - parser.add_argument('--reference', type=str, required=True, help="reference fasta sequence") - parser.add_argument('--translations', type=str, nargs='+', required=True, help="amino acid alignment") - parser.add_argument('--genes', type=str, nargs='+', required=True, help="amino acid alignment") - parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") - args = parser.parse_args() - - genes = args.genes if type(args.genes)==list else [args.genes] - translations = args.translations if type(args.translations)==list else [args.translations] - features = read_gff(args.annotation) - ref = SeqIO.read(args.reference, format='fasta') - - if not set(features.keys())==set(args.genes): - print("ERROR: supplied genes don't match the annotation") - exit(1) - - T = Phylo.read(args.tree, 'newick') - leafs = {n.name for n in T.get_terminals()} - - node_data = {} - for gene, translation in zip(genes, translations): - seqs = [] - for s in SeqIO.parse(translation, 'fasta'): - if s.id in leafs: - if str(s.seq).strip('-'): - seqs.append(s) - else: - print("WARNING: sequence {} is all gaps".format(s.id)) - s.seq = Seq.Seq(len(s) * 'N') - seqs.append(s) - - - tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa') - - tt.infer_ancestral_sequences(reconstruct_tip_states=True) - - with open(translation.replace('.fasta', '_withInternalNodes.fasta'), 'w') as fh: - for n in tt.tree.find_clades(): - if n.name not in node_data: - node_data[n.name] = {"aa_muts":{}} - if len(n.mutations): - node_data[n.name]["aa_muts"][gene] = [f"{a}{p+1}{d}" for a,p,d in n.mutations] - fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n") - - annotations = annotation_json(features, ref) - with open(args.output, 'w') as fh: - json.dump({"nodes":node_data, "annotations":annotations}, fh) \ No newline at end of file diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index e4d58dd9..a6ab3f33 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -266,12 +266,19 @@ rule refine: rule ancestral: message: "Reconstructing ancestral sequences and mutations" input: - tree = rules.refine.output.tree, - alignment = rules.align.output.alignment, + tree = build_dir + "/{build_name}/{segment}/tree.nwk", + alignment = build_dir + "/{build_name}/{segment}/aligned.fasta", + translations = aggregate_translations, + reference = lambda w: f"{config['builds'][w.build_name]['reference']}", + annotation = lambda w: f"{config['builds'][w.build_name]['annotation']}", output: - node_data = build_dir + "/{build_name}/{segment}/nt-muts.json" + node_data = build_dir + "/{build_name}/{segment}/muts.json", + translations_done = build_dir + "/{build_name}/{segment}/translations.done", params: - inference = "joint" + inference = "joint", + genes = lambda w: GENES[w.segment], + input_translations = lambda w: build_dir + f"/{w.build_name}/{w.segment}/nextalign/masked.gene.%GENE.fasta", + output_translations = lambda w: build_dir + f"/{w.build_name}/{w.segment}/nextalign/masked.gene.%GENE_withInternalNodes.fasta", conda: "../envs/nextstrain.yaml" benchmark: "benchmarks/ancestral_{build_name}_{segment}.txt" @@ -284,36 +291,13 @@ rule ancestral: augur ancestral \ --tree {input.tree} \ --alignment {input.alignment} \ - --output-node-data {output.node_data} \ - --inference {params.inference} 2>&1 | tee {log} - """ - -rule translate: - message: "Translating amino acid sequences" - input: - translations = aggregate_translations, - tree = rules.refine.output.tree, - reference = lambda w: f"{config['builds'][w.build_name]['reference']}", - annotation = lambda w: f"{config['builds'][w.build_name]['annotation']}", - output: - node_data = build_dir + "/{build_name}/{segment}/aa_muts.json", - translations_done = build_dir + "/{build_name}/{segment}/translations.done" - params: - genes = lambda w: GENES[w.segment] - conda: "../envs/nextstrain.yaml" - benchmark: - "benchmarks/translate_{build_name}_{segment}.txt" - log: - "logs/translate_{build_name}_{segment}.txt" - shell: - """ - python3 scripts/translations_aamuts.py \ - --tree {input.tree} \ + --root-sequence {input.reference} \ --annotation {input.annotation} \ - --reference {input.reference} \ - --translations {input.translations:q} \ --genes {params.genes} \ - --output {output.node_data} 2>&1 | tee {log} && touch {output.translations_done} + --translations "{params.input_translations}" \ + --output-node-data {output.node_data} \ + --output-translations "{params.output_translations}" \ + --inference {params.inference} 2>&1 | tee {log} && touch {output.translations_done} """ rule traits: @@ -347,8 +331,7 @@ rule traits: rule clades: input: tree = build_dir + "/{build_name}/ha/tree.nwk", - nt_muts = build_dir + "/{build_name}/ha/nt-muts.json", - aa_muts = build_dir + "/{build_name}/ha/aa_muts.json", + muts = build_dir + "/{build_name}/ha/muts.json", clades = lambda wildcards: config["builds"][wildcards.build_name]["clades"], output: node_data = build_dir + "/{build_name}/ha/clades.json", @@ -361,7 +344,7 @@ rule clades: """ augur clades \ --tree {input.tree} \ - --mutations {input.nt_muts} {input.aa_muts} \ + --mutations {input.muts} \ --clades {input.clades} \ --output {output.node_data} 2>&1 | tee {log} """ @@ -370,8 +353,7 @@ rule clades: rule subclades: input: tree = build_dir + "/{build_name}/{segment}/tree.nwk", - nt_muts = build_dir + "/{build_name}/{segment}/nt-muts.json", - aa_muts = build_dir + "/{build_name}/{segment}/aa_muts.json", + muts = build_dir + "/{build_name}/{segment}/muts.json", clades = lambda wildcards: config["builds"][wildcards.build_name]["subclades"], output: node_data = build_dir + "/{build_name}/{segment}/subclades.json", @@ -387,7 +369,7 @@ rule subclades: """ augur clades \ --tree {input.tree} \ - --mutations {input.nt_muts} {input.aa_muts} \ + --mutations {input.muts} \ --clades {input.clades} \ --membership-name {params.membership_name} \ --label-name {params.label_name} \ @@ -398,8 +380,7 @@ rule subclades: rule import_clades: input: tree = build_dir + "/{build_name}/ha/tree.nwk", - nt_muts = build_dir + "/{build_name}/{segment}/nt-muts.json", - aa_muts = build_dir + "/{build_name}/{segment}/aa_muts.json", + muts = build_dir + "/{build_name}/{segment}/muts.json", clades = build_dir + "/{build_name}/ha/clades.json", output: node_data = build_dir + "/{build_name}/{segment}/clades.json", diff --git a/workflow/snakemake_rules/export.smk b/workflow/snakemake_rules/export.smk index 2115287a..3898f067 100644 --- a/workflow/snakemake_rules/export.smk +++ b/workflow/snakemake_rules/export.smk @@ -8,7 +8,6 @@ def _get_node_data_by_wildcards(wildcards): inputs = [ rules.refine.output.node_data, rules.ancestral.output.node_data, - rules.translate.output.node_data, rules.clades.output.node_data, rules.traits.output.node_data, rules.annotate_epiweeks.output.node_data,