From 8072c78b59fdfff72cdeffc0a73d22f208ae8c0f Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 4 Jun 2024 08:27:01 +0200 Subject: [PATCH 1/2] feat: index add dataset capabilities --- scripts/rebuild | 58 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/scripts/rebuild b/scripts/rebuild index 30714491..e0eb3fd1 100755 --- a/scripts/rebuild +++ b/scripts/rebuild @@ -13,7 +13,7 @@ from os.path import dirname, realpath, join, relpath, isfile from lib.changelog import changelog_prepare, changelog_get_unreleased_section from lib.container import dict_get, dict_get_required, find_index_by, first, format_list, \ - dict_remove_many, find_duplicates, dict_cleanup, find + dict_remove_many, find_duplicates, dict_cleanup, find, unique from lib.date import now_iso, iso_to_iso_safe from lib.fasta import fasta_read_exactly_one_seq from lib.fs import json_read, find_files, json_write, copy, make_zip, file_write, rmrf @@ -33,16 +33,19 @@ def get_dataset_capabilities(pathogen_json: dict, dataset_dir: str): filepath = join(dataset_dir, filename) if not isfile(filepath): raise FileNotFoundError( - f"'Filename '{filename}' is declared in `.files.{name}` field of pathogen.json, but the actual file is not found: '{filepath}'") + f"'Filename '{filename}' is declared in `.files.{name}` field of pathogen.json, but the actual file is not " + f"found: '{filepath}'") other = [] - tree_filename = dict_get(pathogen_json, ["files", "tree"]) + tree_filename = dict_get(pathogen_json, ["files", "treeJson"]) tree_json_path = join(dataset_dir, tree_filename) if tree_filename else None + clades = [] + custom_clades = {} if tree_json_path is not None and isfile(tree_json_path): tree_json = json_read(tree_json_path) - if dict_get(tree_json, ["extensions", "nextclade", "clade_node_attrs"]) is not None: - other.append("customClades") + clades = tree_find_clades(tree_json) + custom_clades = tree_find_clade_like_attrs(tree_json) if dict_get(pathogen_json, ["mutLabels"]) is not None: other.append("mutLabels") @@ -58,13 +61,58 @@ def get_dataset_capabilities(pathogen_json: dict, dataset_dir: str): if dict_get(q, ["enabled"]): qc.append(k) + custom_clades = dict_cleanup({attr: len(values) for attr, values in custom_clades.items() if len(values) > 0}) + return dict_cleanup({ + "clades": len(clades) if len(clades) > 0 else None, + "customClades": custom_clades, "qc": qc, "primers": True if len(dict_get(pathogen_json, ["primers"]) or []) > 0 else None, "other": other }) +def tree_find_clades(auspice_json): + def tree_find_clades_recursive(node, clades=None): + if clades is None: + clades = [] + + clade_membership = node.get('node_attrs', {}).get('clade_membership', {}).get('value') + if clade_membership: + clades.append(clade_membership) + + children = node.get('children', []) + for child in children: + tree_find_clades_recursive(child, clades) + + return clades + + clades = tree_find_clades_recursive(auspice_json["tree"]) + return list(sorted(unique(clades))) + + +def tree_find_clade_like_attrs(auspice_json): + def tree_find_clade_like_attrs_recursive(node, attr_names, attributes=None): + if attributes is None: + attributes = {attr: [] for attr in attr_names} + + for attr in attr_names: + attr_value = node.get('node_attrs', {}).get(attr, {}).get('value') + if attr_value is not None: + attributes[attr].append(attr_value) + + children = node.get('children', []) + for child in children: + tree_find_clade_like_attrs_recursive(child, attr_names, attributes) + + return attributes + + clade_node_attrs = dict_get(auspice_json, ["meta", "extensions", "nextclade", "clade_node_attrs"]) or [] + attr_names = [attr["name"] for attr in clade_node_attrs] + attributes = tree_find_clade_like_attrs_recursive(auspice_json["tree"], attr_names) + return {attr: list(sorted(unique(values))) for attr, values in attributes.items() if len(values) > 0} + + def dataset_get_versions(dataset): versions = dict_get(dataset, ["versions"]) or [] versions = list(filter(lambda version: version["tag"] != "unreleased", versions)) From 573507d1bfa223d9fa5dcf1c288627b68a9a4593 Mon Sep 17 00:00:00 2001 From: nextstrain-bot Date: Tue, 4 Jun 2024 06:28:48 +0000 Subject: [PATCH 2/2] chore: rebuild [skip ci] --- data_output/index.json | 96 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/data_output/index.json b/data_output/index.json index ceb57105..e1bff838 100644 --- a/data_output/index.json +++ b/data_output/index.json @@ -71,6 +71,14 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 43, + "customClades": { + "Nextclade_pango": 2969, + "partiallyAliased": 2969, + "clade_nextstrain": 43, + "clade_who": 13, + "clade_display": 43 + }, "qc": [ "frameShifts", "missingData", @@ -168,6 +176,14 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 43, + "customClades": { + "Nextclade_pango": 2969, + "partiallyAliased": 2969, + "clade_nextstrain": 43, + "clade_who": 13, + "clade_display": 43 + }, "qc": [ "frameShifts", "missingData", @@ -267,6 +283,14 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 20, + "customClades": { + "Nextclade_pango": 2232, + "partiallyAliased": 2232, + "clade_nextstrain": 20, + "clade_who": 4, + "clade_display": 20 + }, "qc": [ "frameShifts", "missingData", @@ -364,6 +388,14 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 20, + "customClades": { + "Nextclade_pango": 2232, + "partiallyAliased": 2232, + "clade_nextstrain": 20, + "clade_who": 4, + "clade_display": 20 + }, "qc": [ "frameShifts", "missingData", @@ -461,6 +493,14 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 20, + "customClades": { + "Nextclade_pango": 2232, + "partiallyAliased": 2232, + "clade_nextstrain": 20, + "clade_who": 4, + "clade_display": 20 + }, "qc": [ "frameShifts", "missingData", @@ -561,6 +601,11 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 21, + "customClades": { + "short-clade": 15, + "subclade": 21 + }, "qc": [ "privateMutations", "mixedSites", @@ -644,6 +689,11 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 16, + "customClades": { + "short-clade": 13, + "subclade": 21 + }, "qc": [ "privateMutations", "mixedSites", @@ -726,6 +776,7 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 23, "qc": [ "privateMutations", "mixedSites", @@ -807,6 +858,11 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 37, + "customClades": { + "subclade": 42, + "short-clade": 37 + }, "qc": [ "privateMutations", "mixedSites", @@ -898,6 +954,11 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 30, + "customClades": { + "subclade": 36, + "short-clade": 30 + }, "qc": [ "privateMutations", "mixedSites", @@ -989,6 +1050,7 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 17, "qc": [ "privateMutations", "mixedSites", @@ -1072,6 +1134,10 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 8, + "customClades": { + "subclade": 22 + }, "qc": [ "privateMutations", "mixedSites", @@ -1154,6 +1220,7 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 19, "qc": [ "privateMutations", "mixedSites", @@ -1237,6 +1304,7 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 3, "qc": [ "privateMutations", "mixedSites", @@ -1318,6 +1386,10 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 24, + "customClades": { + "G_clade": 15 + }, "qc": [ "privateMutations", "mixedSites", @@ -1396,6 +1468,10 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 17, + "customClades": { + "G_clade": 9 + }, "qc": [ "privateMutations", "mixedSites", @@ -1452,6 +1528,11 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 5, + "customClades": { + "outbreak": 1, + "lineage": 33 + }, "qc": [ "frameShifts", "missingData", @@ -1508,6 +1589,11 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 2, + "customClades": { + "outbreak": 1, + "lineage": 33 + }, "qc": [ "frameShifts", "missingData", @@ -1565,6 +1651,11 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 2, + "customClades": { + "outbreak": 1, + "lineage": 25 + }, "qc": [ "frameShifts", "missingData", @@ -2469,6 +2560,7 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 33, "qc": [ "frameShifts", "mixedSites", @@ -2525,6 +2617,7 @@ "treeJson": "tree.json" }, "capabilities": { + "clades": 100, "qc": [ "frameShifts", "missingData", @@ -2572,6 +2665,7 @@ "examples": "example_sequences.fasta" }, "capabilities": { + "clades": 11, "qc": [ "frameShifts", "mixedSites", @@ -2621,6 +2715,7 @@ "examples": "example_sequences.fasta" }, "capabilities": { + "clades": 57, "qc": [ "frameShifts", "mixedSites", @@ -2670,6 +2765,7 @@ "examples": "example_sequences.fasta" }, "capabilities": { + "clades": 10, "qc": [ "frameShifts", "mixedSites",