forked from intermine/similarity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMinHash.py
224 lines (153 loc) · 7.01 KB
/
MinHash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""
InterMine @ Open Genome Informatics : Similarity Project
-> Implementation of the MinHash Algorithm to finding similarity amongst Genes treated as sets of information
-> Implementation of LSH on the signatures generated to find highly similar genes
-> This method of Finding has a huge benefit when dealing with millions of sets of Genes to find Similarity as it scales very well """
#Libraries
from __future__ import division
import json
import pandas as pd
from py2neo import Node, Relationship, Graph
import re
import math
import string
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import pylab
import sys
import binascii
import numpy as np
import pandas as pd
from features import create_features, get_genes
from doc_cluster import create_gene_documents
import random
""" Algorithm Description : MinHash -- This is a method of compressing the information and an approximation scheme towards Jaccard coefficient computation
-> Generate Random Hash functions of the form : (a * x + b ) mod c , where a,b < max(x) and c is the prime number just greater than x
-> x can take maximum of 2**32 - 1 as in the earlier step for Shingle ID's we have used crc32
-> Choose each hash function and generate hash values for each Shingle ID -- then choose the minimum value as signature
-> In this way, if there are 'n' Hash Functions there will be n components of a signature """
#Function to Map the Shingles to an ID - using CRC32 Hash
def generate_shingle_id(sets):
#List containing Shingle ID's
shingle_ids = []
#Finding the shingle ID's for each set of property in the Gene Set
for gene in sets:
if gene: #Checking if information pertaining to the Gene is present
gene_modified = [x for x in gene if x is not None]
ids = map((lambda g: binascii.crc32(g) & 0xffffffff),gene_modified)
shingle_ids.append(ids)
return shingle_ids
#Function to Generate Random values for a & b in the Hash function with the requirement that there is no repetition
def generate_random_hash(max,components):
#A sampling method cannot be used to generate a list of random number with no repetitions as they don't work in low memory conditions
#print list(set(np.random.randint(11,size=10)))
initial_list = np.random.randint(max,size=components)
while len(set(initial_list)) != len(initial_list):
#Perform the operation again to get unique random values
initial_list = np.random.randint(max,size=components)
return initial_list
#Function to Generate the MinHash Signatures using Hash Functions
def generate_signatures(shingles,components):
#Max Shingle_id
max_shingle_id = 2**32 - 1
#Prime Number just greater than max_shingle_id -- Can be extracted from http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
c = 4294967311
#Generate Random Hash Functions
hash_a = generate_random_hash(max_shingle_id,components)
hash_b = generate_random_hash(max_shingle_id,components)
#Storing the Min Hash Signatures in a list
minhash_signatures = []
for gene in shingles:
temp = []
for i in range(0,components):
#Selection for coefficient A
a = hash_a[i]
#Selection for coefficient B
b = hash_b[i]
#Complete List of Hash Values
hash_values = map((lambda g: (a*g + b)%c),gene)
#Choose Minimum Hash Value
temp.append(min(hash_values))
minhash_signatures.append(temp)
return minhash_signatures
""" Application of LSH on the MinHash Signatures to identify similar patterns in the data set
Parameters => b -> Number of Bands, r -> Number of Rows in a band, signature_matrix -> Contains signature for each gene
components -> Number of elements in the Gene Signature """
#Perform LSH
def LSH(b,r,signature_matrix,components):
# b * r = number of components in a signature
#Initializing LSH matrix - to be filled after hashing bands
lsh_matrix = []
#Calculating Hash values for each band -> Total Number of hash values for a gene = Number of bands
for gene in signature_matrix:
#For storing hash value of each band
temp = []
for band in range(0,b):
#Extracting required number of elements for a band
temp = gene[band*r:band*r + r]
#Obtaining a Hash Value
hash_value = hash(tuple(temp))
#Append to temporary list
temp.append(hash_value)
lsh_matrix.append(temp)
return lsh_matrix
#Function to churn out the candidate genes which needs to be compared
def candidate_genes(lsh_matrix):
#Transpose the Matrix
transposed_lsh_matrix = np.array(lsh_matrix).transpose()
#List storing the Candidate Genes from each Band
candidates = []
#Extracting the candidate genes from each band
for band in transposed_lsh_matrix:
#Extract the unique hash values from the band
hash_list = list(set(band))
#Candidates for the particular band
band_candidate = [np.where(band == value) for value in hash_list]
#Add to main candidate list
candidates.append(band_candidate)
return candidates
""" The following method taken : N * (N-1) / 2 time, which is not good for scaling -- Replaced with LSH """
#Function to construct a Similarity Matrix - By comparing all pairs - This will be changed by Locality Sensitive Hashing for faster processing
def get_similarity_matrix(signatures,n,components):
#Generate Empty Matrix
similarity_matrix = np.zeros((n,n))
#Loop through and get common signature values
for i in range(0,n):
for j in range(i+1,n):
#Similarity Metric
similarity_score = len(set(signatures[i]).intersection(set(signatures[j]))) / len(set(signatures[i]).union(set(signatures[j])))
similarity_matrix[i][j] = similarity_score
return similarity_matrix
#Function to get the Similar Genes using information about candidate genes
def get_similar_genes(candidate_gene,genes):
#Extract Candidate Genes from the Band Hash Values where repetition is observed
#for band in candidate_gene:
#print len(band[0][0])
#print band[0][0]
return 1
#Main Function for calls
def main(components):
#Connection to Neo4j
graph = Graph("http://localhost:7474/db/data/cypher",password="rimo")
#Obtain the features for corresponding InterMine Model
feature_array = create_features()
#Obtaining the list of Genes
genes, length_genes = get_genes(graph)
#Treating each Gene as a set
sets = create_gene_documents(feature_array)
#Singles mapped to ID's
shingles = generate_shingle_id(sets)
#Get signature based on the Shingle ID's
signatures = generate_signatures(shingles,components)
#similarity_matrix = get_similarity_matrix(signatures,len(genes),components)
#return similarity_matrix
b = 10
r = 4
#Obtain the matrix formed due to Locality Sensitive Hashing
lsh_matrix = LSH(b,r,signatures,components)
#Candidate Genes for Close Inspection
candidate_gene = candidate_genes(lsh_matrix)
#Use the information regarding candidate genes to obtain similarity scores
final_similarity = get_similar_genes(candidate_gene, genes)
#Compute Similarity Matrix
matrix = main(40)