Skip to content

Commit

Permalink
Use all_categories (with ancestors) as labels
Browse files Browse the repository at this point in the history
  • Loading branch information
amykglen committed Aug 15, 2021
1 parent 43e1e5e commit f39e0af
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 9 deletions.
2 changes: 1 addition & 1 deletion code/kg2c/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ KG2canonicalized (KG2c) is a lightweight version of KG2 in which synonymous node
```
The node `id` is the 'preferred' curie for the group of synonymous nodes this KG2c node represents (according to the ARAX `NodeSynonymizer`). Similarly, the node `category` and `name` are the 'preferred' category/name, according to the `NodeSynonymizer`.

In the Neo4j instantiation of KG2c (see [below section](#host-kg2canonicalized-in-neo4j) for how to host KG2c in Neo4j), nodes are labeled with their `expanded_categories`.
In the Neo4j instantiation of KG2c (see the [section below](#host-kg2canonicalized-in-neo4j) for how to host KG2c in Neo4j), nodes are labeled with their `all_categories` as well as the ancestors of those categories.

###### Example KG2c edge:
```
Expand Down
12 changes: 8 additions & 4 deletions code/kg2c/create_kg2c_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
from RTXConfiguration import RTXConfiguration
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../ARAX/NodeSynonymizer/")
from node_synonymizer import NodeSynonymizer
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../ARAX/BiolinkHelper/")
from biolink_helper import BiolinkHelper

ARRAY_NODE_PROPERTIES = ["all_categories", "publications", "equivalent_curies", "all_names", "expanded_categories"]
ARRAY_EDGE_PROPERTIES = ["provided_by", "publications", "kg2_ids"]
Expand Down Expand Up @@ -194,7 +196,7 @@ def create_kg2c_lite_json_file(canonicalized_nodes_dict: Dict[str, Dict[str, any
meta_info_dict: Dict[str, str], is_test: bool):
logging.info(f" Creating KG2c lite JSON file..")
# Filter out all except these properties so we create a lightweight KG
node_lite_properties = ["id", "name", "category", "expanded_categories"]
node_lite_properties = ["id", "name", "category", "all_categories"]
edge_lite_properties = ["id", "predicate", "subject", "object", "provided_by", "publications"]
lite_kg = {"nodes": [], "edges": []}
for node in canonicalized_nodes_dict.values():
Expand Down Expand Up @@ -235,12 +237,14 @@ def create_kg2c_sqlite_db(canonicalized_nodes_dict: Dict[str, Dict[str, any]], i


def create_kg2c_tsv_files(canonicalized_nodes_dict: Dict[str, Dict[str, any]],
canonicalized_edges_dict: Dict[str, Dict[str, any]], is_test: bool):
canonicalized_edges_dict: Dict[str, Dict[str, any]],
biolink_version: str, is_test: bool):
bh = BiolinkHelper(biolink_version)
# Convert array fields into the format neo4j wants and do some final processing
for canonicalized_node in canonicalized_nodes_dict.values():
for list_node_property in ARRAY_NODE_PROPERTIES:
canonicalized_node[list_node_property] = _convert_list_to_string_encoded_format(canonicalized_node[list_node_property])
canonicalized_node['node_labels'] = canonicalized_node['expanded_categories']
canonicalized_node['node_labels'] = bh.get_ancestors(canonicalized_node['all_categories'])
for canonicalized_edge in canonicalized_edges_dict.values():
if not is_test: # Make sure we don't have any orphan edges
assert canonicalized_edge['subject'] in canonicalized_nodes_dict
Expand Down Expand Up @@ -439,7 +443,7 @@ def create_kg2c_files(is_test=False):
meta_info_dict = {"kg2_version": kg2_version, "biolink_version": biolink_version}
create_kg2c_lite_json_file(canonicalized_nodes_dict, canonicalized_edges_dict, meta_info_dict, is_test)
create_kg2c_sqlite_db(canonicalized_nodes_dict, is_test)
create_kg2c_tsv_files(canonicalized_nodes_dict, canonicalized_edges_dict, is_test)
create_kg2c_tsv_files(canonicalized_nodes_dict, canonicalized_edges_dict, biolink_version, is_test)


def main():
Expand Down
8 changes: 4 additions & 4 deletions code/kg2c/record_kg2c_meta_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str,
if not is_test or (subject_node_id in nodes_by_id and object_node_id in nodes_by_id):
subject_node = nodes_by_id[subject_node_id]
object_node = nodes_by_id[object_node_id]
subject_categories = biolink_helper.add_conflations(subject_node["category"])
object_categories = biolink_helper.add_conflations(object_node["category"])
subject_categories = biolink_helper.add_conflations(subject_node["all_categories"])
object_categories = biolink_helper.add_conflations(object_node["all_categories"])
predicate = edge["predicate"]
for subject_category in subject_categories:
for object_category in object_categories:
Expand All @@ -67,7 +67,7 @@ def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str,
logging.info("Saving meta KG to JSON file..")
meta_kg = {"nodes": meta_nodes, "edges": meta_edges}
with open(f"{KG2C_DIR}/{meta_kg_file_name}", "w+") as meta_kg_file:
json.dump(meta_kg, meta_kg_file, default=serialize_with_sets)
json.dump(meta_kg, meta_kg_file, default=serialize_with_sets, indent=2)


def add_neighbor_counts_to_sqlite(nodes_by_id: Dict[str, Dict[str, any]], edges_by_id: Dict[str, Dict[str, any]],
Expand Down Expand Up @@ -151,7 +151,7 @@ def record_meta_kg_info(is_test: bool):
meta_kg_file_name = f"kg2c_meta_kg{'_test' if is_test else ''}.json"
sqlite_file_name = f"kg2c{'_test' if is_test else ''}.sqlite"
fda_approved_file_name = f"fda_approved_drugs{'_test' if is_test else ''}.pickle"
label_property_name = "expanded_categories"
label_property_name = "all_categories"

start = time.time()
with open(f"{KG2C_DIR}/{input_kg_file_name}", "r") as input_kg_file:
Expand Down

0 comments on commit f39e0af

Please sign in to comment.