Skip to content

Commit

Permalink
Merge pull request #245 from hqzizania/upstream
Browse files Browse the repository at this point in the history
Add Rd files for sampleByKey() of [SPARKR-163] and sumRDD() of [SPARKR-92]
  • Loading branch information
shivaram authored and Davies Liu committed Apr 14, 2015
1 parent 9387402 commit 141efd8
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 1 deletion.
5 changes: 4 additions & 1 deletion R/pkg/R/pairRDD.R
Original file line number Diff line number Diff line change
Expand Up @@ -821,8 +821,11 @@ setMethod("subtractByKey",
function (v) { v[[1]] })
})

#' Return a subset of this RDD sampled by key.
#'
#' @description
#' \code{sampleByKey} return a subset RDD of the given RDD sampled by key
#' \code{sampleByKey} creates a sample of this RDD using variable sampling rates
#' for different keys as specified by \code{fractions}, a key to sampling rate map.
#'
#' @param x The RDD to sample elements by key, where each element is
#' list(K, V) or c(K, V).
Expand Down
52 changes: 52 additions & 0 deletions pkg/man/sampleByKey.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/generics.R, R/pairRDD.R
\docType{methods}
\name{sampleByKey}
\alias{sampleByKey}
\alias{sampleByKey,RDD,logical,vector,integer-method}
\alias{sampleByKey,RDD-method}
\title{Return a subset of this RDD sampled by key.}
\usage{
sampleByKey(x, withReplacement, fractions, seed)

\S4method{sampleByKey}{RDD,logical,vector,integer}(x, withReplacement,
fractions, seed)
}
\arguments{
\item{x}{The RDD to sample elements by key, where each element is
list(K, V) or c(K, V).}

\item{withReplacement}{Sampling with replacement or not}

\item{seed}{Randomness seed value}

\item{fractions}{The (rough) sample target fractions, given as a map from
key to sampling rate}
}
\description{
\code{sampleByKey} creates a sample of this RDD using variable sampling rates
for different keys as specified by \code{fractions}, a key to sampling rate map.
}
\examples{
\dontrun{
sc <- sparkR.init()
rdd <- parallelize(sc, 1:3000)
pairs <- lapply(rdd, function(x) { if (x \%\% 3 == 0) list("a", x)
else { if (x \%\% 3 == 1) list("b", x) else list("c", x) }})
fractions <- list(a = 0.2, b = 0.1, c = 0.3)
sample <- sampleByKey(pairs, FALSE, fractions, 1618L)
100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")) # TRUE
50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")) # TRUE
200 < length(lookup(sample, "c")) && 400 > length(lookup(sample, "c")) # TRUE
lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0 # TRUE
lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 3000 # TRUE
lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0 # TRUE
lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 3000 # TRUE
lookup(sample, "c")[which.min(lookup(sample, "c"))] >= 0 # TRUE
lookup(sample, "c")[which.max(lookup(sample, "c"))] <= 3000 # TRUE
fractions <- list(a = 0.2, b = 0.1, c = 0.3, d = 0.4)
sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # Key "d" will be ignored
fractions <- list(a = 0.2, b = 0.1)
sample <- sampleByKey(pairs, FALSE, fractions, 1618L) # KeyError: "c"
}
}

27 changes: 27 additions & 0 deletions pkg/man/sumRDD.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/RDD.R, R/generics.R
\docType{methods}
\name{sumRDD,RDD-method}
\alias{sumRDD}
\alias{sumRDD,RDD}
\alias{sumRDD,RDD-method}
\title{Add up the elements in an RDD.}
\usage{
\S4method{sumRDD}{RDD}(x)

sumRDD(x)
}
\arguments{
\item{x}{The RDD to add up the elements in}
}
\description{
Add up the elements in an RDD.
}
\examples{
\dontrun{
sc <- sparkR.init()
rdd <- parallelize(sc, 1:10)
sumRDD(rdd) # 55
}
}

0 comments on commit 141efd8

Please sign in to comment.