diff --git a/code/kg2c/README.md b/code/kg2c/README.md index 7910e512c..3e6429c41 100644 --- a/code/kg2c/README.md +++ b/code/kg2c/README.md @@ -41,7 +41,7 @@ KG2canonicalized (KG2c) is a lightweight version of KG2 in which synonymous node ``` The node `id` is the 'preferred' curie for the group of synonymous nodes this KG2c node represents (according to the ARAX `NodeSynonymizer`). Similarly, the node `category` and `name` are the 'preferred' category/name, according to the `NodeSynonymizer`. -In the Neo4j instantiation of KG2c (see [below section](#host-kg2canonicalized-in-neo4j) for how to host KG2c in Neo4j), nodes are labeled with their `expanded_categories`. +In the Neo4j instantiation of KG2c (see [below section](#host-kg2canonicalized-in-neo4j) for how to host KG2c in Neo4j), nodes are labeled with their `all_categories` and ancestors of those categories. ###### Example KG2c edge: ``` diff --git a/code/kg2c/create_kg2c_files.py b/code/kg2c/create_kg2c_files.py index 5c76e6590..72eec1087 100644 --- a/code/kg2c/create_kg2c_files.py +++ b/code/kg2c/create_kg2c_files.py @@ -28,6 +28,8 @@ from RTXConfiguration import RTXConfiguration sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../ARAX/NodeSynonymizer/") from node_synonymizer import NodeSynonymizer +sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../ARAX/BiolinkHelper/") +from biolink_helper import BiolinkHelper ARRAY_NODE_PROPERTIES = ["all_categories", "publications", "equivalent_curies", "all_names", "expanded_categories"] ARRAY_EDGE_PROPERTIES = ["provided_by", "publications", "kg2_ids"] @@ -194,7 +196,7 @@ def create_kg2c_lite_json_file(canonicalized_nodes_dict: Dict[str, Dict[str, any meta_info_dict: Dict[str, str], is_test: bool): logging.info(f" Creating KG2c lite JSON file..") # Filter out all except these properties so we create a lightweight KG - node_lite_properties = ["id", "name", "category", "expanded_categories"] + node_lite_properties = ["id", "name", "category", "all_categories"] edge_lite_properties = ["id", "predicate", "subject", "object", "provided_by", "publications"] lite_kg = {"nodes": [], "edges": []} for node in canonicalized_nodes_dict.values(): @@ -235,12 +237,14 @@ def create_kg2c_sqlite_db(canonicalized_nodes_dict: Dict[str, Dict[str, any]], i def create_kg2c_tsv_files(canonicalized_nodes_dict: Dict[str, Dict[str, any]], - canonicalized_edges_dict: Dict[str, Dict[str, any]], is_test: bool): + canonicalized_edges_dict: Dict[str, Dict[str, any]], + biolink_version: str, is_test: bool): + bh = BiolinkHelper(biolink_version) # Convert array fields into the format neo4j wants and do some final processing for canonicalized_node in canonicalized_nodes_dict.values(): for list_node_property in ARRAY_NODE_PROPERTIES: canonicalized_node[list_node_property] = _convert_list_to_string_encoded_format(canonicalized_node[list_node_property]) - canonicalized_node['node_labels'] = canonicalized_node['expanded_categories'] + canonicalized_node['node_labels'] = bh.get_ancestors(canonicalized_node['all_categories']) for canonicalized_edge in canonicalized_edges_dict.values(): if not is_test: # Make sure we don't have any orphan edges assert canonicalized_edge['subject'] in canonicalized_nodes_dict @@ -439,7 +443,7 @@ def create_kg2c_files(is_test=False): meta_info_dict = {"kg2_version": kg2_version, "biolink_version": biolink_version} create_kg2c_lite_json_file(canonicalized_nodes_dict, canonicalized_edges_dict, meta_info_dict, is_test) create_kg2c_sqlite_db(canonicalized_nodes_dict, is_test) - create_kg2c_tsv_files(canonicalized_nodes_dict, canonicalized_edges_dict, is_test) + create_kg2c_tsv_files(canonicalized_nodes_dict, canonicalized_edges_dict, biolink_version, is_test) def main(): diff --git a/code/kg2c/record_kg2c_meta_info.py b/code/kg2c/record_kg2c_meta_info.py index 158346c69..0836f718b 100644 --- a/code/kg2c/record_kg2c_meta_info.py +++ b/code/kg2c/record_kg2c_meta_info.py @@ -43,8 +43,8 @@ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, if not is_test or (subject_node_id in nodes_by_id and object_node_id in nodes_by_id): subject_node = nodes_by_id[subject_node_id] object_node = nodes_by_id[object_node_id] - subject_categories = biolink_helper.add_conflations(subject_node["category"]) - object_categories = biolink_helper.add_conflations(object_node["category"]) + subject_categories = biolink_helper.add_conflations(subject_node["all_categories"]) + object_categories = biolink_helper.add_conflations(object_node["all_categories"]) predicate = edge["predicate"] for subject_category in subject_categories: for object_category in object_categories: @@ -67,7 +67,7 @@ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, logging.info("Saving meta KG to JSON file..") meta_kg = {"nodes": meta_nodes, "edges": meta_edges} with open(f"{KG2C_DIR}/{meta_kg_file_name}", "w+") as meta_kg_file: - json.dump(meta_kg, meta_kg_file, default=serialize_with_sets) + json.dump(meta_kg, meta_kg_file, default=serialize_with_sets, indent=2) def add_neighbor_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]], @@ -151,7 +151,7 @@ def record_meta_kg_info(is_test: bool): meta_kg_file_name = f"kg2c_meta_kg{'_test' if is_test else ''}.json" sqlite_file_name = f"kg2c{'_test' if is_test else ''}.sqlite" fda_approved_file_name = f"fda_approved_drugs{'_test' if is_test else ''}.pickle" - label_property_name = "expanded_categories" + label_property_name = "all_categories" start = time.time() with open(f"{KG2C_DIR}/{input_kg_file_name}", "r") as input_kg_file: