-
Notifications
You must be signed in to change notification settings - Fork 149
/
Copy pathcluster.mr.R
75 lines (68 loc) · 2.02 KB
/
cluster.mr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Copyright 2013 Revolution Analytics
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
library(cluster)
# Pull the element called `a.name` out of every entry of the list `ll`.
# The result is a list of the same length (and names) as `ll`.
napply = function(ll, a.name) lapply(ll, `[[`, a.name)
# Generic map-reduce clustering driver.
#
# Arguments:
#   data:       input handed to mapreduce() as its first argument.
#   subcluster: function applied to each chunk of data in the map phase.
#   merge:      function applied in the reduce phase to the list of values
#               collected under the single key.
#
# Every map call emits its result under the constant key 1, wrapped in a
# list; the reducer re-emits the merged result in the same shape.
# combine is the literal TRUE rather than the reassignable variable T
# (presumably this asks mapreduce to reuse the reducer as a combiner --
# confirm against the rmr2 documentation).
#
# Returns whatever mapreduce() returns.
cluster.mr =
  function(data, subcluster, merge)
    mapreduce(
      data,
      map =
        function(., data.chunk) {
          # Looks like a debugging trace of the incoming chunk; its return
          # value is not used here.
          rmr.str(data.chunk)
          keyval(1, list(subcluster(data.chunk)))},
      combine = TRUE,
      reduce =
        function(., clusters)
          keyval(1, list(merge(clusters))))
# Builds a function that clusters one chunk of data with clara().
#
# Arguments:
#   n.clusters: passed to clara() as its second argument (number of clusters).
#
# Returns a function of `data` that produces a list with
#   size:    nrow(data),
#   sample:  the rows of data selected by the `sample` component of the
#            clara result,
#   medoids: the `medoids` component of the clara result.
subclara =
  function(n.clusters)
    function(data) {
      clust =
        clara(
          # rmr.str's return value is what gets clustered; this assumes it
          # returns its argument unchanged (debug trace) -- TODO confirm.
          rmr.str(data),
          n.clusters,
          keep.data = FALSE)
      list(
        size = nrow(data),
        # drop = FALSE keeps the sample two-dimensional even when data has a
        # single column, so later nrow() and rbind() calls see rows, not a
        # bare vector.
        sample = data[clust$sample, , drop = FALSE],
        medoids = clust$medoids)}
# Builds a function that merges several sub-clustering results into one.
#
# Arguments:
#   n.clusters: number of clusters for the final clustering; forwarded to
#               subclara().
#
# Returns a function of `clusters`, a list whose elements each carry a
# `size` and a `sample` (the shape produced by subclara). It resamples rows
# from every sample with replacement, stacks the resampled rows with rbind,
# clusters the stacked rows via subclara(n.clusters), and overwrites the
# result's `size` with the sum of the incoming sizes.
merge.clara =
  function(n.clusters)
    function(clusters) {
      sizes = unlist(napply(clusters, 'size'))
      total.size = sum(sizes)
      size.range = range(sizes)
      # NOTE(review): this ratio (largest size / smallest size) is the same
      # for every element, so all samples are inflated by one common factor
      # rather than in proportion to their own size -- confirm this is the
      # intended weighting.
      size.ratio = max(size.range)/min(size.range)
      resample =
        function(x) {
          n.rows = nrow(x$sample)
          picked =
            sample(
              # seq_len is safe when the sample has zero rows (1:0 is c(1, 0))
              seq_len(n.rows),
              round(n.rows * size.ratio),
              replace = TRUE)
          # drop = FALSE keeps one-column or one-row selections as rows, so
          # rbind below stacks observations instead of reshaping vectors.
          x$sample[picked, , drop = FALSE]}
      clust =
        subclara(n.clusters)(
          do.call(rbind, lapply(clusters, resample)))
      clust$size = total.size
      clust}
# Entry point: run the clara-based map-reduce clustering on `data` with
# `n.clusters` clusters and return the first value read back from the job
# output.
clara.mr = function(data, n.clusters) {
  job.output =
    cluster.mr(
      data,
      subclara(n.clusters),
      merge.clara(n.clusters))
  merged = values(from.dfs(job.output))
  merged[[1]]}