forked from insightbook/data-science-from-scratch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ch19_clustering.py
202 lines (156 loc) · 6.29 KB
/
ch19_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from __future__ import division
from linear_algebra import squared_distance, vector_mean, distance
import math, random
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
class KMeans:
"""performs k-means clustering"""
def __init__(self, k):
self.k = k # number of clusters
self.means = None # means of clusters
def classify(self, input):
"""return the index of the cluster closest to the input"""
return min(range(self.k),
key=lambda i: squared_distance(input, self.means[i]))
def train(self, inputs):
self.means = random.sample(inputs, self.k)
assignments = None
while True:
# Find new assignments
new_assignments = map(self.classify, inputs)
# If no assignments have changed, we're done.
if assignments == new_assignments:
return
# Otherwise keep the new assignments,
assignments = new_assignments
for i in range(self.k):
i_points = [p for p, a in zip(inputs, assignments) if a == i]
# avoid divide-by-zero if i_points is empty
if i_points:
self.means[i] = vector_mean(i_points)
def squared_clustering_errors(inputs, k):
"""finds the total squared error from k-means clustering the inputs"""
clusterer = KMeans(k)
clusterer.train(inputs)
means = clusterer.means
assignments = map(clusterer.classify, inputs)
return sum(squared_distance(input,means[cluster])
for input, cluster in zip(inputs, assignments))
def plot_squared_clustering_errors(plt):
ks = range(1, len(inputs) + 1)
errors = [squared_clustering_errors(inputs, k) for k in ks]
plt.plot(ks, errors)
plt.xticks(ks)
plt.xlabel("k")
plt.ylabel("total squared error")
plt.show()
#
# using clustering to recolor an image
#
def recolor_image(input_file, k=5):
img = mpimg.imread(path_to_png_file)
pixels = [pixel for row in img for pixel in row]
clusterer = KMeans(k)
clusterer.train(pixels) # this might take a while
def recolor(pixel):
cluster = clusterer.classify(pixel) # index of the closest cluster
return clusterer.means[cluster] # mean of the closest cluster
new_img = [[recolor(pixel) for pixel in row]
for row in img]
plt.imshow(new_img)
plt.axis('off')
plt.show()
#
# hierarchical clustering
#
def is_leaf(cluster):
"""a cluster is a leaf if it has length 1"""
return len(cluster) == 1
def get_children(cluster):
"""returns the two children of this cluster if it's a merged cluster;
raises an exception if this is a leaf cluster"""
if is_leaf(cluster):
raise TypeError("a leaf cluster has no children")
else:
return cluster[1]
def get_values(cluster):
"""returns the value in this cluster (if it's a leaf cluster)
or all the values in the leaf clusters below it (if it's not)"""
if is_leaf(cluster):
return cluster # is already a 1-tuple containing value
else:
return [value
for child in get_children(cluster)
for value in get_values(child)]
def cluster_distance(cluster1, cluster2, distance_agg=min):
"""finds the aggregate distance between elements of cluster1
and elements of cluster2"""
return distance_agg([distance(input1, input2)
for input1 in get_values(cluster1)
for input2 in get_values(cluster2)])
def get_merge_order(cluster):
if is_leaf(cluster):
return float('inf')
else:
return cluster[0] # merge_order is first element of 2-tuple
def bottom_up_cluster(inputs, distance_agg=min):
# start with every input a leaf cluster / 1-tuple
clusters = [(input,) for input in inputs]
# as long as we have more than one cluster left...
while len(clusters) > 1:
# find the two closest clusters
c1, c2 = min([(cluster1, cluster2)
for i, cluster1 in enumerate(clusters)
for cluster2 in clusters[:i]],
key=lambda (x, y): cluster_distance(x, y, distance_agg))
# remove them from the list of clusters
clusters = [c for c in clusters if c != c1 and c != c2]
# merge them, using merge_order = # of clusters left
merged_cluster = (len(clusters), [c1, c2])
# and add their merge
clusters.append(merged_cluster)
# when there's only one cluster left, return it
return clusters[0]
def generate_clusters(base_cluster, num_clusters):
# start with a list with just the base cluster
clusters = [base_cluster]
# as long as we don't have enough clusters yet...
while len(clusters) < num_clusters:
# choose the last-merged of our clusters
next_cluster = min(clusters, key=get_merge_order)
# remove it from the list
clusters = [c for c in clusters if c != next_cluster]
# and add its children to the list (i.e., unmerge it)
clusters.extend(get_children(next_cluster))
# once we have enough clusters...
return clusters
if __name__ == "__main__":
inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]]
random.seed(0) # so you get the same results as me
clusterer = KMeans(3)
clusterer.train(inputs)
print "3-means:"
print clusterer.means
print
random.seed(0)
clusterer = KMeans(2)
clusterer.train(inputs)
print "2-means:"
print clusterer.means
print
print "errors as a function of k"
for k in range(1, len(inputs) + 1):
print k, squared_clustering_errors(inputs, k)
print
print "bottom up hierarchical clustering"
base_cluster = bottom_up_cluster(inputs)
print base_cluster
print
print "three clusters, min:"
for cluster in generate_clusters(base_cluster, 3):
print get_values(cluster)
print
print "three clusters, max:"
base_cluster = bottom_up_cluster(inputs, max)
for cluster in generate_clusters(base_cluster, 3):
print get_values(cluster)