From 422084de571b832f86c2f70ec434a31f18c2d78f Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Fri, 28 May 2021 11:33:37 +1200 Subject: [PATCH] augur clades allows attribute name to be specified Previously the `augur clades` command produced a node-data JSON which stored clade membership as the node-attr "clade_membership" and defined the basal nodes of each clade with the node-attr "clade_annotation". `augur export v2` interpreted the latter as a special-case and turned it into a branch label of the same name. The previous commit allowed `augur export` to be supplied node-data JSONs with a `branches` dictionary. Here we update `augur clades` to export data in this structure, which allows the user to specify the keys to use via the `--attribute-name` arg. This commit breaks backwards compatibility for pipelines as the default attribute name is "clade". This will result in dataset (auspice) JSONs with the same branch labelling as before, but with a different node-attr (was "clade_membership", now "clade"). As `augur export v2` will make colorings for all node-attrs in node-data JSONs, this will be exported as a "clade" coloring with no changes needed; however, auspice config JSONs may now refer to a non-existent "clade_membership" key. `augur export v2` has been updated to no longer special-case `clade_membership` or `clade_annotation` node attrs. We print a warning if an auspice config JSON refers to `clade_membership` to help users update their configs. Functional tests for `augur clades` have been added. 
Closes https://github.com/nextstrain/augur/issues/720 --- augur/clades.py | 37 +++++++++------ augur/export_v2.py | 46 ++++++------------- tests/functional/clades.t | 34 ++++++++++++++ tests/functional/clades/clades.tsv | 12 +++++ .../clades/expected-output-custom-attr.json | 46 +++++++++++++++++++ .../clades/expected-output-default.json | 46 +++++++++++++++++++ tests/functional/clades/nt_muts.json | 19 ++++++++ tests/functional/clades/tree.nwk | 1 + 8 files changed, 195 insertions(+), 46 deletions(-) create mode 100644 tests/functional/clades.t create mode 100644 tests/functional/clades/clades.tsv create mode 100644 tests/functional/clades/expected-output-custom-attr.json create mode 100644 tests/functional/clades/expected-output-default.json create mode 100644 tests/functional/clades/nt_muts.json create mode 100644 tests/functional/clades/tree.nwk diff --git a/augur/clades.py b/augur/clades.py index 1a1e71d8c..620474439 100644 --- a/augur/clades.py +++ b/augur/clades.py @@ -101,13 +101,12 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): mapping of node to clades ''' - clade_membership = {} + # We use the following dictionaries to store which clade nodes belong to. 
+ # All nodes in a clade should appear in `clade_membership` while only one node should + # appear in `basal_clade_nodes` + (clade_membership, basal_clade_nodes) = ({}, {}) parents = get_parent_name_by_child_name_for_tree(tree) - # first pass to set all nodes to unassigned as precaution to ensure attribute is set - for node in tree.find_clades(order = 'preorder'): - clade_membership[node.name] = {'clade_membership': 'unassigned'} - # count leaves for node in tree.find_clades(order = 'postorder'): node.leaf_count = 1 if node.is_terminal() else np.sum([c.leaf_count for c in node]) @@ -136,7 +135,7 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): node.sequences[gene][pos] = d - # second pass to assign 'clade_annotation' to basal nodes within each clade + # store names of basal nodes of each clade in `basal_clade_nodes` and `clade_membership` dicts. # if multiple nodes match, assign annotation to largest # otherwise occasional unwanted cousin nodes get assigned the annotation for clade_name, clade_alleles in clade_designations.items(): @@ -147,16 +146,17 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True) if len(sorted_nodes) > 0: target_node = sorted_nodes[0] - clade_membership[target_node.name] = {'clade_annotation': clade_name, 'clade_membership': clade_name} + basal_clade_nodes[target_node.name] = clade_name + clade_membership[target_node.name] = clade_name # basal nodes are members of the clade - # third pass to propagate 'clade_membership' + # propagate 'clade_membership' to children nodes # don't propagate if encountering 'clade_annotation' for node in tree.find_clades(order = 'preorder'): for child in node: - if 'clade_annotation' not in clade_membership[child.name]: - clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership'] - - return clade_membership + # if the child doesn't define the start of its own 
clade, but the parent belongs to a clade, then inherit that membership + if child.name not in basal_clade_nodes and node.name in clade_membership: + clade_membership[child.name] = clade_membership[node.name] + return (basal_clade_nodes, clade_membership) def get_reference_sequence_from_root_node(all_muts, root_name): @@ -181,6 +181,7 @@ def register_arguments(parser): parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ') parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ') parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid') + parser.add_argument('--attribute-name', type=str, default="clade", help="name to use for clade membership & branch labels", required=False) parser.add_argument('--output-node-data', type=str, help='name of JSON file to save clade assignments to') @@ -205,8 +206,14 @@ def run(args): clade_designations = read_in_clade_definitions(args.clades) - clade_membership = assign_clades(clade_designations, all_muts, tree, ref) + (basal_clade_nodes, clade_membership) = assign_clades(clade_designations, all_muts, tree, ref) + + # create node_data for export as a JSON + node_data = { + 'nodes': {node: {args.attribute_name: clade} for node,clade in clade_membership.items()}, + 'branches': {node: {'labels': {args.attribute_name: clade}} for node,clade in basal_clade_nodes.items()} + } out_name = get_json_name(args) - write_json({'nodes': clade_membership}, out_name) - print("clades written to", out_name, file=sys.stdout) + write_json(node_data, out_name) + print(f"clades written to {out_name} using attribute name {args.attribute_name}", file=sys.stdout) diff --git a/augur/export_v2.py b/augur/export_v2.py index c9ae9c82c..1c0f2d21b 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -92,17 +92,9 @@ def are_mutations_defined(node_attrs): return 
True return False - -def are_clades_defined(node_attrs): +def is_node_attr_defined(node_attrs, attr_name): for node, data in node_attrs.items(): - if data.get("clade_membership") or data.get("clade_annotation"): - return True - return False - - -def are_dates_defined(node_attrs): - for node, data in node_attrs.items(): - if data.get("num_date"): + if data.get(attr_name): return True return False @@ -163,7 +155,7 @@ def set_colorings(data_json, config, command_line_colorings, metadata_names, nod def _get_type(key, trait_values): # for some keys we know what the type must be known_types = { - "clade_membership": "categorical", + "clade": "categorical", "gt": "categorical", "author": "categorical", "num_date": "continuous" @@ -200,7 +192,7 @@ def _get_title(key): return config_title # hardcoded fallbacks: - if key == "clade_membership": + if key == "clade": return "Clade" if key == "gt": return "Genotype" @@ -310,6 +302,12 @@ def _is_valid(coloring): if key == "gt" and not are_mutations_defined(node_attrs): warn("[colorings] You asked for mutations (\"gt\"), but none are defined on the tree. They cannot be used as a coloring.") return False + if key == "clade_membership" and not trait_values: + # augur 12 & below defined clades via the key "clade_membership", not "clade". + # If an auspice_config file specifies this, and it is not present in any node-data, we print a warning. + # (Note that if "clade" is present in node-data, we automatically include it as a colouring.) + warn("You asked for a color-by for 'clade_membership' but this is now called 'clade'. You should update your auspice config file.") + return False if key != "gt" and not trait_values: warn("You asked for a color-by for trait '{}', but it has no values on the tree. 
It has been ignored.".format(key)) return False @@ -348,11 +346,10 @@ def _get_colorings(): # add in genotype as a special case if (a) not already set and (b) the data supports it if "gt" not in explicitly_defined_colorings and are_mutations_defined(node_attrs): colorings.insert(0,{'key':'gt'}) - if "num_date" not in explicitly_defined_colorings and are_dates_defined(node_attrs): + if "num_date" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "num_date"): colorings.insert(0,{'key':'num_date'}) - if "clade_membership" not in explicitly_defined_colorings and are_clades_defined(node_attrs): - colorings.insert(0,{'key':'clade_membership'}) - + if "clade" not in explicitly_defined_colorings and is_node_attr_defined(node_attrs, "clade"): + colorings.insert(0,{'key':'clade'}) return colorings @@ -714,8 +711,6 @@ def node_data_prop_is_normal_trait(name): # those traits / keys / attrs which are not "special" and can be exported # as normal attributes on nodes excluded = [ - "clade_annotation", # Clade annotation is label, not colorby! 
- "clade_membership", # will be auto-detected if it is available "authors", # authors are set as a node property, not a trait property "author", # see above "vaccine", # vaccine info is stored as a "special" node prop @@ -914,16 +909,6 @@ def transfer_mutations_to_branches(node_attrs, branch_attrs): else: branch_attrs[node_name]["labels"] = { "aa": aa_lab } -def transfer_clade_annotation_to_branches(node_attrs, branch_attrs): - for node_name, raw_data in node_attrs.items(): - if "clade_annotation" in raw_data and is_valid(raw_data["clade_annotation"]): - if node_name not in branch_attrs: - branch_attrs[node_name] = {} - if 'labels' in branch_attrs[node_name]: - branch_attrs[node_name]["labels"]['clade'] = raw_data["clade_annotation"] - else: - branch_attrs[node_name]["labels"] = { "clade": raw_data["clade_annotation"] } - def transfer_branch_data_to_branch_attrs(branches_node_data, branch_attrs): """ Transfers information stored in node-data JSONs under "branches" to the `branch_attrs`. @@ -968,12 +953,11 @@ def parse_node_data_and_metadata(T, node_data_files, metadata_file): node_attrs[name][corrected_key] = value node_data_names.add(corrected_key) - # third pass: create `branch_attrs` which includes certain traits supplied in `node_attrs` - # (e.g. mutations are coverted to branch attrs, and `clade_annotation` is interpreted as a label) + # third pass: create `branch_attrs` which includes a few special-case traits from in `node_attrs` + # (e.g. mutations are coverted from node attrs to branch attrs) # as well as any branch labels supplied in node-data files. 
branch_attrs = {} transfer_mutations_to_branches(node_attrs, branch_attrs) - transfer_clade_annotation_to_branches(node_attrs, branch_attrs) transfer_branch_data_to_branch_attrs(node_data.get('branches', {}), branch_attrs) return (node_data, node_attrs, node_data_names, metadata_names, branch_attrs) diff --git a/tests/functional/clades.t b/tests/functional/clades.t new file mode 100644 index 000000000..f66fe588f --- /dev/null +++ b/tests/functional/clades.t @@ -0,0 +1,34 @@ +Integration tests for augur clades. + + $ pushd "$TESTDIR" > /dev/null + $ export AUGUR="../../bin/augur" + +Run augur clades without --attribute-name. We expect the name to be "clade" + + $ ${AUGUR} clades \ + > --tree clades/tree.nwk \ + > --clades clades/clades.tsv \ + > --mutations clades/nt_muts.json \ + > --output-node-data "$TMP/default.json" > /dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-default.json" "$TMP/default.json" + {} + +Run augur clades with a custom --attribute-name + + $ ${AUGUR} clades \ + > --tree clades/tree.nwk \ + > --clades clades/clades.tsv \ + > --mutations clades/nt_muts.json \ + > --attribute-name custom \ + > --output-node-data "$TMP/custom-attr.json" > /dev/null + + $ python3 "$TESTDIR/../../scripts/diff_jsons.py" "clades/expected-output-custom-attr.json" "$TMP/custom-attr.json" + {} + +Ensure the only change between runs of `augur clades` is the attr name used + $ cat "$TMP/default.json" | sed "s/clade/custom/" > "$TMP/default-now-custom.json" + $ diff -u "$TMP/default-now-custom.json" "$TMP/custom-attr.json" + +Cleanup + $ rm -f "$TMP/default.json" "$TMP/custom-attr.json" "$TMP/default-now-custom.json" diff --git a/tests/functional/clades/clades.tsv b/tests/functional/clades/clades.tsv new file mode 100644 index 000000000..559d73dfd --- /dev/null +++ b/tests/functional/clades/clades.tsv @@ -0,0 +1,12 @@ +clade gene site alt + +# the 1b mutation only once, on the branch leading to tips B and C +# thus we expect the clade 
label to be on node `internalAB` +cladeCB nuc 1 B +# the 2c mutation appears twice -- on branch `internalAB` and `internalDEF` +# as the latter has 3 descendants, it is chosen over the former +cladeDEF nuc 2 C +# mutation 3e appears only on a terminal node (tipE) +# but we still expect both a branch label and a node_attr +# this means that tipE should be annotated "cladeE" and _not_ "cladeDEF" +cladeE nuc 3 E \ No newline at end of file diff --git a/tests/functional/clades/expected-output-custom-attr.json b/tests/functional/clades/expected-output-custom-attr.json new file mode 100644 index 000000000..161e28966 --- /dev/null +++ b/tests/functional/clades/expected-output-custom-attr.json @@ -0,0 +1,46 @@ +{ + "branches": { + "internalBC": { + "labels": { + "custom": "cladeCB" + } + }, + "internalDEF": { + "labels": { + "custom": "cladeDEF" + } + }, + "tipE": { + "labels": { + "custom": "cladeE" + } + } + }, + "generated_by": { + "program": "augur", + "version": "12.0.0" + }, + "nodes": { + "internalBC": { + "custom": "cladeCB" + }, + "internalDEF": { + "custom": "cladeDEF" + }, + "tipB": { + "custom": "cladeCB" + }, + "tipC": { + "custom": "cladeCB" + }, + "tipD": { + "custom": "cladeDEF" + }, + "tipE": { + "custom": "cladeE" + }, + "tipF": { + "custom": "cladeDEF" + } + } +} \ No newline at end of file diff --git a/tests/functional/clades/expected-output-default.json b/tests/functional/clades/expected-output-default.json new file mode 100644 index 000000000..a0b472a96 --- /dev/null +++ b/tests/functional/clades/expected-output-default.json @@ -0,0 +1,46 @@ +{ + "branches": { + "internalBC": { + "labels": { + "clade": "cladeCB" + } + }, + "internalDEF": { + "labels": { + "clade": "cladeDEF" + } + }, + "tipE": { + "labels": { + "clade": "cladeE" + } + } + }, + "generated_by": { + "program": "augur", + "version": "12.0.0" + }, + "nodes": { + "internalBC": { + "clade": "cladeCB" + }, + "internalDEF": { + "clade": "cladeDEF" + }, + "tipB": { + "clade": "cladeCB" + }, + 
"tipC": { + "clade": "cladeCB" + }, + "tipD": { + "clade": "cladeDEF" + }, + "tipE": { + "clade": "cladeE" + }, + "tipF": { + "clade": "cladeDEF" + } + } +} \ No newline at end of file diff --git a/tests/functional/clades/nt_muts.json b/tests/functional/clades/nt_muts.json new file mode 100644 index 000000000..16cb24848 --- /dev/null +++ b/tests/functional/clades/nt_muts.json @@ -0,0 +1,19 @@ +{ + "nodes": { + "tipA": {"muts": [], "aa_muts": {}}, + "tipB": {"muts": [], "aa_muts": {}}, + "tipC": {"muts": [], "aa_muts": {}}, + "tipD": {"muts": [], "aa_muts": {}}, + "tipE": {"muts": ["A3E"], "aa_muts": {}}, + "tipF": {"muts": [], "aa_muts": {}}, + "internalBC": { + "muts": ["A1B", "A2C"], + "aa_muts": {} + }, + "internalDEF": { + "muts": ["A2C"], + "aa_muts": {} + }, + "ROOT":{"muts": [], "aa_muts": {}} + } +} \ No newline at end of file diff --git a/tests/functional/clades/tree.nwk b/tests/functional/clades/tree.nwk new file mode 100644 index 000000000..e4e5a8e3a --- /dev/null +++ b/tests/functional/clades/tree.nwk @@ -0,0 +1 @@ +(tipA:1,(tipB:1,tipC:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:1; \ No newline at end of file