forked from insightbook/data-science-from-scratch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ch21_network_analysis.py
229 lines (172 loc) · 7.01 KB
/
ch21_network_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
from __future__ import division
import math, random, re
from collections import defaultdict, Counter, deque
from linear_algebra import dot, get_row, get_column, make_matrix, magnitude, scalar_multiply, shape, distance
from functools import partial
users = [
{ "id": 0, "name": "Hero" },
{ "id": 1, "name": "Dunn" },
{ "id": 2, "name": "Sue" },
{ "id": 3, "name": "Chi" },
{ "id": 4, "name": "Thor" },
{ "id": 5, "name": "Clive" },
{ "id": 6, "name": "Hicks" },
{ "id": 7, "name": "Devin" },
{ "id": 8, "name": "Kate" },
{ "id": 9, "name": "Klein" }
]
friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
(4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
# give each user a friends list
for user in users:
user["friends"] = []
# and populate it
for i, j in friendships:
# this works because users[i] is the user whose id is i
users[i]["friends"].append(users[j]) # add i as a friend of j
users[j]["friends"].append(users[i]) # add j as a friend of i
#
# Betweenness Centrality
#
def shortest_paths_from(from_user):
# a dictionary from "user_id" to *all* shortest paths to that user
shortest_paths_to = { from_user["id"] : [[]] }
# a queue of (previous user, next user) that we need to check.
# starts out with all pairs (from_user, friend_of_from_user)
frontier = deque((from_user, friend)
for friend in from_user["friends"])
# keep going until we empty the queue
while frontier:
prev_user, user = frontier.popleft() # take from the beginning
user_id = user["id"]
# the fact that we're pulling from our queue means that
# necessarily we already know a shortest path to prev_user
paths_to_prev = shortest_paths_to[prev_user["id"]]
paths_via_prev = [path + [user_id] for path in paths_to_prev]
# it's possible we already know a shortest path to here as well
old_paths_to_here = shortest_paths_to.get(user_id, [])
# what's the shortest path to here that we've seen so far?
if old_paths_to_here:
min_path_length = len(old_paths_to_here[0])
else:
min_path_length = float('inf')
# any new paths to here that aren't too long
new_paths_to_here = [path_via_prev
for path_via_prev in paths_via_prev
if len(path_via_prev) <= min_path_length
and path_via_prev not in old_paths_to_here]
shortest_paths_to[user_id] = old_paths_to_here + new_paths_to_here
# add new neighbors to the frontier
frontier.extend((user, friend)
for friend in user["friends"]
if friend["id"] not in shortest_paths_to)
return shortest_paths_to
for user in users:
user["shortest_paths"] = shortest_paths_from(user)
for user in users:
user["betweenness_centrality"] = 0.0
for source in users:
source_id = source["id"]
for target_id, paths in source["shortest_paths"].iteritems():
if source_id < target_id: # don't double count
num_paths = len(paths) # how many shortest paths?
contrib = 1 / num_paths # contribution to centrality
for path in paths:
for id in path:
if id not in [source_id, target_id]:
users[id]["betweenness_centrality"] += contrib
#
# closeness centrality
#
def farness(user):
"""the sum of the lengths of the shortest paths to each other user"""
return sum(len(paths[0])
for paths in user["shortest_paths"].values())
for user in users:
user["closeness_centrality"] = 1 / farness(user)
#
# matrix multiplication
#
def matrix_product_entry(A, B, i, j):
return dot(get_row(A, i), get_column(B, j))
def matrix_multiply(A, B):
n1, k1 = shape(A)
n2, k2 = shape(B)
if k1 != n2:
raise ArithmeticError("incompatible shapes!")
return make_matrix(n1, k2, partial(matrix_product_entry, A, B))
def vector_as_matrix(v):
"""returns the vector v (represented as a list) as a n x 1 matrix"""
return [[v_i] for v_i in v]
def vector_from_matrix(v_as_matrix):
"""returns the n x 1 matrix as a list of values"""
return [row[0] for row in v_as_matrix]
def matrix_operate(A, v):
v_as_matrix = vector_as_matrix(v)
product = matrix_multiply(A, v_as_matrix)
return vector_from_matrix(product)
def find_eigenvector(A, tolerance=0.00001):
guess = [1 for __ in A]
while True:
result = matrix_operate(A, guess)
length = magnitude(result)
next_guess = scalar_multiply(1/length, result)
if distance(guess, next_guess) < tolerance:
return next_guess, length # eigenvector, eigenvalue
guess = next_guess
#
# eigenvector centrality
#
def entry_fn(i, j):
return 1 if (i, j) in friendships or (j, i) in friendships else 0
n = len(users)
adjacency_matrix = make_matrix(n, n, entry_fn)
eigenvector_centralities, _ = find_eigenvector(adjacency_matrix)
#
# directed graphs
#
endorsements = [(0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1), (1, 3),
(2, 3), (3, 4), (5, 4), (5, 6), (7, 5), (6, 8), (8, 7), (8, 9)]
for user in users:
user["endorses"] = [] # add one list to track outgoing endorsements
user["endorsed_by"] = [] # and another to track endorsements
for source_id, target_id in endorsements:
users[source_id]["endorses"].append(users[target_id])
users[target_id]["endorsed_by"].append(users[source_id])
endorsements_by_id = [(user["id"], len(user["endorsed_by"]))
for user in users]
sorted(endorsements_by_id,
key=lambda (user_id, num_endorsements): num_endorsements,
reverse=True)
def page_rank(users, damping = 0.85, num_iters = 100):
# initially distribute PageRank evenly
num_users = len(users)
pr = { user["id"] : 1 / num_users for user in users }
# this is the small fraction of PageRank
# that each node gets each iteration
base_pr = (1 - damping) / num_users
for __ in range(num_iters):
next_pr = { user["id"] : base_pr for user in users }
for user in users:
# distribute PageRank to outgoing links
links_pr = pr[user["id"]] * damping
for endorsee in user["endorses"]:
next_pr[endorsee["id"]] += links_pr / len(user["endorses"])
pr = next_pr
return pr
if __name__ == "__main__":
print "Betweenness Centrality"
for user in users:
print user["id"], user["betweenness_centrality"]
print
print "Closeness Centrality"
for user in users:
print user["id"], user["closeness_centrality"]
print
print "Eigenvector Centrality"
for user_id, centrality in enumerate(eigenvector_centralities):
print user_id, centrality
print
print "PageRank"
for user_id, pr in page_rank(users).iteritems():
print user_id, pr