-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_all_bacteria_for_bcmm_compounds.py
104 lines (89 loc) · 4.26 KB
/
extract_all_bacteria_for_bcmm_compounds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import sys
import boto3
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing as mp
import time
import json
import pickle
import networkx as nx
from s3_utility import read_pickle_file_from_s3
# Command-line configuration: ten positional arguments, read in this order.
# Everything in this section runs at import time, before main() is called.
GRAPH_PATH = sys.argv[1]  # local path of the pickled graph that main() loads into G
MAPPING_FILE = sys.argv[2]  # CSV with "compound_name" and "spoke_identifier" columns
BACTERIA_FILE = sys.argv[3]  # tab-separated file with "spoke_identifier" and "spoke_name" columns
BUCKET_NAME = sys.argv[4]  # S3 bucket used for both the inputs and the output
PPR_FILE_LOCATION = sys.argv[5]  # S3 key prefix; "<compound_id>_dict.pickle" objects are read from it
PPR_FEATURE_MAP_FILELOCATION = sys.argv[6]  # S3 key of the CSV read by get_feature_map()
SAVE_LOCATION = sys.argv[7]  # S3 key prefix of the output object
SAVE_NAME = sys.argv[8]  # output object name, joined to SAVE_LOCATION with "/"
NCORES = int(sys.argv[9])  # number of worker processes in the pool
NODE_TYPE_SEPARATOR = sys.argv[10]  # string placed between node type and node id in "type_id"
# One task per distinct compound name is later dispatched to the pool.
mapping_file_df = pd.read_csv(MAPPING_FILE)
compound_names = mapping_file_df["compound_name"].unique()
# Build the "Organism<separator><spoke_identifier>" key that main() matches
# against the feature map's "type_id" column.
bacteria_df = pd.read_csv(BACTERIA_FILE, sep="\t")
bacteria_df["type_id"] = "Organism" + NODE_TYPE_SEPARATOR + bacteria_df["spoke_identifier"].astype(str)
def main():
    """Compute bacteria scores for every compound and upload them to S3.

    Loads the pickled graph from GRAPH_PATH, fetches the feature map,
    keeps the feature-map rows whose "type_id" matches an organism in
    bacteria_df, runs get_all_bacteria_for_the_compound for every entry of
    compound_names in a pool of NCORES processes, and stores the pickled
    list of resulting DataFrames at SAVE_LOCATION/SAVE_NAME in BUCKET_NAME.
    """
    # These globals are read by get_all_bacteria_for_the_compound and
    # get_shortest_pathlength inside the pool workers.
    # NOTE(review): workers only see these values when the pool is created by
    # forking after they are assigned; under the "spawn" start method they
    # would be undefined in the workers -- confirm the target platform.
    global bacteria_feature_df_with_names, bacteria_feature_indices, G
    start_time = time.time()

    # NOTE(review): unpickling can execute arbitrary code; GRAPH_PATH must
    # point to a trusted file.
    with open(GRAPH_PATH, "rb") as f:
        G = pickle.load(f)

    feature_df = get_feature_map()
    feature_df["type_id"] = feature_df["node_type"] + NODE_TYPE_SEPARATOR + feature_df["node_id"]
    bacteria_feature_df = feature_df[feature_df["type_id"].isin(bacteria_df.type_id.unique())]
    # NOTE(review): the merged frame is later paired row-by-row with
    # bacteria_feature_indices; that presumably requires "type_id" to be
    # unique in bacteria_df so the merge adds no rows -- confirm.
    bacteria_feature_df_with_names = pd.merge(bacteria_feature_df, bacteria_df, on="type_id")
    bacteria_feature_df_with_names = bacteria_feature_df_with_names[["spoke_identifier", "spoke_name"]]
    # Index labels of the bacteria rows in the feature map; the workers use
    # them to pick the bacteria entries out of each PPR vector.
    bacteria_feature_indices = bacteria_feature_df.index.values

    # The context manager tears the pool down even when a worker raises, so
    # no worker processes are left behind on failure.
    with mp.Pool(NCORES) as pool:
        out_list_of_df = pool.map(get_all_bacteria_for_the_compound, compound_names)

    file_name = SAVE_LOCATION + "/" + SAVE_NAME
    binary_data = pickle.dumps(out_list_of_df)
    s3_client = boto3.client('s3')
    try:
        s3_client.put_object(Bucket=BUCKET_NAME, Key=file_name, Body=binary_data)
    finally:
        # Release the client's connections whether or not the upload succeeded.
        s3_client.close()

    completion_time = round((time.time()-start_time)/(60),2)
    print("Completed in {} min!".format(completion_time))
def get_all_bacteria_for_the_compound(item):
    """Score every bacterium against one compound.

    For each graph node mapped to the compound name, the node's PPR
    embedding is read from S3 and summed; the bacteria entries of the summed
    vector are converted to percentiles, and the shortest path length from
    every bacterium to each contributing compound node is collected.

    Args:
        item: a value of the mapping file's "compound_name" column.

    Returns:
        A DataFrame with columns "ncbi_id", "name", <item> (percentile of the
        bacterium's summed PPR value among all bacteria, divided by 100) and
        "shortest_pathlength_to_<item>" (per bacterium, a list with one path
        length per contributing compound node; None where no length could be
        computed). An empty DataFrame with the same columns is returned when
        no compound node has a non-empty embedding or when assembling the
        result fails.
    """
    result_df = bacteria_feature_df_with_names.copy()
    bacteria_list = list("Organism" + NODE_TYPE_SEPARATOR + result_df["spoke_identifier"].astype(str))
    pathlength_column = "shortest_pathlength_to_{}".format(item)
    result_columns = ["ncbi_id", "name", item, pathlength_column]
    compound_node_ids = list(mapping_file_df[mapping_file_df["compound_name"]==item].spoke_identifier.unique())

    spoke_vector = 0
    found_embedding = False
    pathlengths_per_compound = []
    for compound_id in compound_node_ids:
        object_key = PPR_FILE_LOCATION + "/" + compound_id + "_dict.pickle"
        spoke_embedding_data = read_pickle_file_from_s3(BUCKET_NAME, object_key)
        # NOTE(review): the source's indentation was lost; this assumes that
        # compound nodes with an empty embedding are skipped entirely (no path
        # lengths are computed for them either) -- confirm against the
        # original file.
        if spoke_embedding_data["embedding"].shape[0] == 0:
            continue
        found_embedding = True
        spoke_vector += spoke_embedding_data["embedding"]
        pathlengths_per_compound.append(
            [get_shortest_pathlength(bacteria, compound_id) for bacteria in bacteria_list]
        )

    if not found_embedding:
        # Nothing was accumulated, so there is no vector to index.
        return pd.DataFrame(columns=result_columns)

    try:
        # Transpose from one list per compound node to one list per bacterium.
        pathlengths_per_bacterium = [list(pair) for pair in zip(*pathlengths_per_compound)]
        result_df["ppr_values"] = spoke_vector[bacteria_feature_indices]
        ppr_values = result_df["ppr_values"]
        result_df["ppr_percentile"] = ppr_values.apply(lambda x: stats.percentileofscore(ppr_values, x))/100
        result_df.drop("ppr_values", axis=1, inplace=True)
        result_df.rename(columns={"spoke_identifier":"ncbi_id", "spoke_name": "name", "ppr_percentile":item}, inplace=True)
        result_df[pathlength_column] = pathlengths_per_bacterium
        return result_df
    except Exception:
        # Best effort: a compound whose result cannot be assembled yields an
        # empty frame instead of aborting the whole pool.map. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # still stop the worker.
        return pd.DataFrame(columns=result_columns)
def get_shortest_pathlength(source, target):
    """Return the shortest path length between two nodes of the global graph G.

    Args:
        source: node to start from.
        target: node to reach.

    Returns:
        The value returned by nx.shortest_path_length, or None when no path
        exists or one of the nodes is not in G.
    """
    try:
        return nx.shortest_path_length(G, source=source, target=target)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Only the two expected "no answer" conditions map to None; any other
        # error (and KeyboardInterrupt) propagates instead of being hidden.
        return None
def get_feature_map():
    """Fetch the feature-map CSV from S3 and return it as a DataFrame.

    The object is read from bucket BUCKET_NAME at key
    PPR_FEATURE_MAP_FILELOCATION and parsed with pandas.read_csv.
    """
    response = boto3.client('s3').get_object(
        Bucket=BUCKET_NAME,
        Key=PPR_FEATURE_MAP_FILELOCATION,
    )
    # The response body is a file-like stream that read_csv consumes directly.
    return pd.read_csv(response["Body"])
# Call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()