Skip to content

Commit

Permalink
fixes for reduceByKeyLocally
Browse files Browse the repository at this point in the history
  • Loading branch information
lythesia committed Feb 7, 2015
1 parent b082a35 commit ba6f044
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 27 deletions.
33 changes: 22 additions & 11 deletions pkg/R/RDD.R
Original file line number Diff line number Diff line change
Expand Up @@ -1388,9 +1388,22 @@ setMethod("groupByKey",
function(item) {
item$hash <- as.character(hashCode(item[[1]]))
updateOrCreatePair(item, keys, vals, pred,
function(vs, v) c(vs, list(v)),
function(x) list(x))
function(acc, x) {
addItemToAccumulator(acc, x)
acc
},
function(x) {
acc <- initAccumulator()
addItemToAccumulator(acc, x)
acc
})
})
# extract out data field
vals <- eapply(vals,
function(x) {
length(x$data) <- x$counter
x$data
})
# Every key in the environment contains a list
# Convert that to list(K, Seq[V])
convertEnvsToList(keys, vals)
Expand Down Expand Up @@ -1438,7 +1451,7 @@ setMethod("reduceByKey",
lapply(part,
function(item) {
item$hash <- as.character(hashCode(item[[1]]))
updateOrCreatePair(item, keys, vals, pred, combineFunc, function(x) x)
updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
})
convertEnvsToList(keys, vals)
}
Expand All @@ -1451,13 +1464,12 @@ setMethod("reduceByKey",
#'
#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
#' and merges the values for each key using an associative reduce function, but return the
#' results immediately to master as R list.
#' results immediately to the driver as an R list.
#'
#' @param rdd The RDD to reduce by key. Should be an RDD where each element is
#' list(K, V) or c(K, V).
#' @param combineFunc The associative reduce function to use.
#' @return An list where each element is list(K, V') where V' is the merged
#' value
#' @return A list of elements of type list(K, V') where V' is the merged value for each key
#' @rdname reduceByKeyLocally
#' @seealso reduceByKey
#' @export
Expand All @@ -1467,7 +1479,7 @@ setMethod("reduceByKey",
#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
#' rdd <- parallelize(sc, pairs)
#' reduced <- reduceByKeyLocally(rdd, "+")
#' reduced[[1]] # Should be a list(1, 6)
#' reduced # list(list(1, 6), list(1.1, 3))
#'}
setGeneric("reduceByKeyLocally",
function(rdd, combineFunc) {
Expand All @@ -1486,7 +1498,7 @@ setMethod("reduceByKeyLocally",
lapply(part,
function(item) {
item$hash <- as.character(hashCode(item[[1]]))
updateOrCreatePair(item, keys, vals, pred, combineFunc, function(x) x)
updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
})
list(list(keys, vals)) # return hash to avoid re-compute in merge
}
Expand All @@ -1498,7 +1510,7 @@ setMethod("reduceByKeyLocally",
function(name) {
item <- list(x[[1]][[name]], x[[2]][[name]])
item$hash <- name
updateOrCreatePair(item, accum[[1]], accum[[2]], pred, combineFunc, function(x) x)
updateOrCreatePair(item, accum[[1]], accum[[2]], pred, combineFunc, identity)
})
accum
}
Expand Down Expand Up @@ -1573,8 +1585,7 @@ setMethod("combineByKey",
lapply(part,
function(item) {
item$hash <- as.character(item[[1]])
updateOrCreatePair(item, keys, combiners, pred, mergeCombiners,
function(x) x)
updateOrCreatePair(item, keys, combiners, pred, mergeCombiners, identity)
})
convertEnvsToList(keys, combiners)
}
Expand Down
24 changes: 12 additions & 12 deletions pkg/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -263,20 +263,20 @@ joinTaggedList <- function(tagged_list, cnull) {
# Utility function to reduce a key-value list with predicate
# Used in *ByKey functions
# param
# item key-val pair
# pair key-value pair
# keys/vals env of key/value with hashes
# pred predicate function
# update_fn update or merge function for existing pair, similar with `mergeVal` @combineByKey
# create_fn create function for new pair, similar with `createCombiner` @combinebykey
updateOrCreatePair <- function(item, keys, vals, pred, update_fn, create_fn) {
# assum hashval bind to `$hash`, key/val with index 1/2
hashVal <- item$hash
key <- item[[1]]
val <- item[[2]]
if (pred(item)) {
assign(hashVal, do.call(update_fn, list(get(hashVal, envir=vals), val)), envir=vals)
# updateOrCreatePred predicate function
# updateFn update or merge function for existing pair, similar with `mergeVal` @combineByKey
# createFn create function for new pair, similar with `createCombiner` @combinebykey
updateOrCreatePair <- function(pair, keys, vals, updateOrCreatePred, updateFn, createFn) {
# assume hashVal bind to `$hash`, key/val with index 1/2
hashVal <- pair$hash
key <- pair[[1]]
val <- pair[[2]]
if (updateOrCreatePred(pair)) {
assign(hashVal, do.call(updateFn, list(get(hashVal, envir = vals), val)), envir = vals)
} else {
assign(hashVal, do.call(create_fn, list(val)), envir=vals)
assign(hashVal, do.call(createFn, list(val)), envir = vals)
assign(hashVal, key, envir=keys)
}
}
Expand Down
7 changes: 3 additions & 4 deletions pkg/man/reduceByKeyLocally.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,20 @@ list(K, V) or c(K, V).}
\item{combineFunc}{The associative reduce function to use.}
}
\value{
An list where each element is list(K, V') where V' is the merged
value
A list of elements of type list(K, V') where V' is the merged value for each key
}
\description{
This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
and merges the values for each key using an associative reduce function, but return the
results immediately to master as R list.
results immediately to the driver as an R list.
}
\examples{
\dontrun{
sc <- sparkR.init()
pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
rdd <- parallelize(sc, pairs)
reduced <- reduceByKeyLocally(rdd, "+")
reduced[[1]] # Should be a list(1, 6)
reduced # list(list(1, 6), list(1.1, 3))
}
}
\seealso{
Expand Down

0 comments on commit ba6f044

Please sign in to comment.