Skip to content

Commit

Permalink
Merge pull request apache#253 from palantir/resync-apache
Browse files Browse the repository at this point in the history
  • Loading branch information
robert3005 authored Sep 15, 2017
2 parents 72d0f1e + 388cb48 commit 105e1b6
Show file tree
Hide file tree
Showing 538 changed files with 15,328 additions and 5,937 deletions.
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: SparkR
Type: Package
Version: 2.3.0
Title: R Frontend for Apache Spark
Description: The SparkR package provides an R Frontend for Apache Spark.
Description: Provides an R Frontend for Apache Spark.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
email = "[email protected]"),
person("Xiangrui", "Meng", role = "aut",
Expand Down
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ exportMethods("arrange",
"transform",
"union",
"unionAll",
"unionByName",
"unique",
"unpersist",
"where",
Expand Down
82 changes: 76 additions & 6 deletions R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -2683,7 +2683,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
#' @rdname union
#' @name union
#' @aliases union,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind}
#' @seealso \link{rbind} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
Expand Down Expand Up @@ -2714,6 +2714,40 @@ setMethod("unionAll",
union(x, y)
})

#' Return a new SparkDataFrame containing the union of rows, matched by column names
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
#' and another SparkDataFrame. This is different from \code{union} function, and both
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
#' into account. Input SparkDataFrames can have different data types in the schema.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
#' @name unionByName
#' @aliases unionByName,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind} \link{union}
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
dataFrame(unioned)
})

#' Union two or more SparkDataFrames
#'
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
Expand All @@ -2730,7 +2764,7 @@ setMethod("unionAll",
#' @aliases rbind,SparkDataFrame-method
#' @rdname rbind
#' @name rbind
#' @seealso \link{union}
#' @seealso \link{union} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
Expand Down Expand Up @@ -2930,7 +2964,7 @@ setMethod("saveAsTable",
invisible(callJMethod(write, "saveAsTable", tableName))
})

#' summary
#' describe
#'
#' Computes statistics for numeric and string columns.
#' If no columns are given, this function computes statistics for all numerical or string columns.
Expand All @@ -2941,7 +2975,7 @@ setMethod("saveAsTable",
#' @return A SparkDataFrame.
#' @family SparkDataFrame functions
#' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method
#' @rdname summary
#' @rdname describe
#' @name describe
#' @export
#' @examples
Expand All @@ -2953,6 +2987,7 @@ setMethod("saveAsTable",
#' describe(df, "col1")
#' describe(df, "col1", "col2")
#' }
#' @seealso See \link{summary} for expanded statistics and control over which statistics to compute.
#' @note describe(SparkDataFrame, character) since 1.4.0
setMethod("describe",
signature(x = "SparkDataFrame", col = "character"),
Expand All @@ -2962,7 +2997,7 @@ setMethod("describe",
dataFrame(sdf)
})

#' @rdname summary
#' @rdname describe
#' @name describe
#' @aliases describe,SparkDataFrame-method
#' @note describe(SparkDataFrame) since 1.4.0
Expand All @@ -2973,15 +3008,50 @@ setMethod("describe",
dataFrame(sdf)
})

#' summary
#'
#' Computes specified statistics for numeric and string columns. Available statistics are:
#' \itemize{
#' \item count
#' \item mean
#' \item stddev
#' \item min
#' \item max
#' \item arbitrary approximate percentiles specified as a percentage (eg, "75%")
#' }
#' If no statistics are given, this function computes count, mean, stddev, min,
#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
#' This function is meant for exploratory data analysis, as we make no guarantee about the
#' backward compatibility of the schema of the resulting Dataset. If you want to
#' programmatically compute summary statistics, use the \code{agg} function instead.
#'
#'
#' @param object a SparkDataFrame to be summarized.
#' @param ... (optional) statistics to be computed for all columns.
#' @return A SparkDataFrame.
#' @family SparkDataFrame functions
#' @rdname summary
#' @name summary
#' @aliases summary,SparkDataFrame-method
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' summary(df)
#' summary(df, "min", "25%", "75%", "max")
#' summary(select(df, "age", "height"))
#' }
#' @note summary(SparkDataFrame) since 1.5.0
#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for previous defaults.
#' @seealso \link{describe}
setMethod("summary",
signature(object = "SparkDataFrame"),
function(object, ...) {
describe(object)
statisticsList <- list(...)
sdf <- callJMethod(object@sdf, "summary", statisticsList)
dataFrame(sdf)
})


Expand Down
16 changes: 13 additions & 3 deletions R/pkg/R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ NULL
#'
#' @param x Column to compute on. Note the difference in the following methods:
#' \itemize{
#' \item \code{to_json}: it is the column containing the struct or array of the structs.
#' \item \code{to_json}: it is the column containing the struct, array of the structs,
#' the map or array of maps.
#' \item \code{from_json}: it is the column containing the JSON string.
#' }
#' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains
Expand Down Expand Up @@ -1700,8 +1701,9 @@ setMethod("to_date",
})

#' @details
#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType}
#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered.
#' \code{to_json}: Converts a column containing a \code{structType}, array of \code{structType},
#' a \code{mapType} or array of \code{mapType} into a Column of JSON string.
#' Resolving the Column can fail if an unsupported type is encountered.
#'
#' @rdname column_collection_functions
#' @aliases to_json to_json,Column-method
Expand All @@ -1715,6 +1717,14 @@ setMethod("to_date",
#'
#' # Converts an array of structs into a JSON array
#' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
#' df2 <- mutate(df2, people_json = to_json(df2$people))
#'
#' # Converts a map into a JSON object
#' df2 <- sql("SELECT map('name', 'Bob')) as people")
#' df2 <- mutate(df2, people_json = to_json(df2$people))
#'
#' # Converts an array of maps into a JSON array
#' df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
#' df2 <- mutate(df2, people_json = to_json(df2$people))}
#' @note to_json since 2.2.0
setMethod("to_json", signature(x = "Column"),
Expand Down
6 changes: 5 additions & 1 deletion R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect")
# @export
setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })

#' @rdname summary
#' @rdname describe
#' @export
setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

Expand Down Expand Up @@ -769,6 +769,10 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
#' @export
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
#' @export
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })

#' @rdname unpersist
#' @export
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
Expand Down
6 changes: 5 additions & 1 deletion R/pkg/R/install.R
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,11 @@ sparkCachePath <- function() {
if (is_windows()) {
winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
if (is.na(winAppPath)) {
stop(paste("%LOCALAPPDATA% not found.",
message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
}
if (is.na(winAppPath)) {
stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.",
"Please define the environment variable",
"or restart and enter an installation path in localDir."))
} else {
Expand Down
36 changes: 19 additions & 17 deletions R/pkg/tests/fulltests/test_mllib_tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ test_that("spark.gbt", {
# label must be binary - GBTClassifier currently only supports binary classification.
iris2 <- iris[iris$Species != "virginica", ]
data <- suppressWarnings(createDataFrame(iris2))
model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification", seed = 12)
stats <- summary(model)
expect_equal(stats$numFeatures, 2)
expect_equal(stats$numTrees, 20)
Expand Down Expand Up @@ -94,7 +94,7 @@ test_that("spark.gbt", {

iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
df <- suppressWarnings(createDataFrame(iris2))
m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
m <- spark.gbt(df, NumericSpecies ~ ., type = "classification", seed = 12)
s <- summary(m)
# test numeric prediction values
expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
Expand All @@ -106,7 +106,7 @@ test_that("spark.gbt", {
if (windows_with_hadoop()) {
data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
source = "libsvm")
model <- spark.gbt(data, label ~ features, "classification")
model <- spark.gbt(data, label ~ features, "classification", seed = 12)
expect_equal(summary(model)$numFeatures, 692)
}

Expand All @@ -117,10 +117,11 @@ test_that("spark.gbt", {
trainidxs <- base::sample(nrow(data), nrow(data) * 0.7)
traindf <- as.DataFrame(data[trainidxs, ])
testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
model <- spark.gbt(traindf, clicked ~ ., type = "classification")
model <- spark.gbt(traindf, clicked ~ ., type = "classification", seed = 23)
predictions <- predict(model, testdf)
expect_error(collect(predictions))
model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep")
model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep",
seed = 23)
predictions <- predict(model, testdf)
expect_equal(class(collect(predictions)$clicked[1]), "character")
})
Expand All @@ -129,7 +130,7 @@ test_that("spark.randomForest", {
# regression
data <- suppressWarnings(createDataFrame(longley))
model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
numTrees = 1)
numTrees = 1, seed = 1)

predictions <- collect(predict(model, data))
expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
Expand Down Expand Up @@ -177,7 +178,7 @@ test_that("spark.randomForest", {
# classification
data <- suppressWarnings(createDataFrame(iris))
model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 123)

stats <- summary(model)
expect_equal(stats$numFeatures, 2)
Expand Down Expand Up @@ -215,7 +216,7 @@ test_that("spark.randomForest", {
iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
data <- suppressWarnings(createDataFrame(iris[-5]))
model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 123)
stats <- summary(model)
expect_equal(stats$numFeatures, 2)
expect_equal(stats$numTrees, 20)
Expand All @@ -234,28 +235,29 @@ test_that("spark.randomForest", {
traindf <- as.DataFrame(data[trainidxs, ])
testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
maxDepth = 10, maxBins = 10, numTrees = 10)
maxDepth = 10, maxBins = 10, numTrees = 10, seed = 123)
predictions <- predict(model, testdf)
expect_error(collect(predictions))
model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
maxDepth = 10, maxBins = 10, numTrees = 10,
handleInvalid = "keep")
handleInvalid = "keep", seed = 123)
predictions <- predict(model, testdf)
expect_equal(class(collect(predictions)$clicked[1]), "character")

# spark.randomForest classification can work on libsvm data
if (windows_with_hadoop()) {
data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
source = "libsvm")
model <- spark.randomForest(data, label ~ features, "classification")
model <- spark.randomForest(data, label ~ features, "classification", seed = 123)
expect_equal(summary(model)$numFeatures, 4)
}
})

test_that("spark.decisionTree", {
# regression
data <- suppressWarnings(createDataFrame(longley))
model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
seed = 42)

predictions <- collect(predict(model, data))
expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
Expand Down Expand Up @@ -288,7 +290,7 @@ test_that("spark.decisionTree", {
# classification
data <- suppressWarnings(createDataFrame(iris))
model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 43)

stats <- summary(model)
expect_equal(stats$numFeatures, 2)
Expand Down Expand Up @@ -325,7 +327,7 @@ test_that("spark.decisionTree", {
iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
data <- suppressWarnings(createDataFrame(iris[-5]))
model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 44)
stats <- summary(model)
expect_equal(stats$numFeatures, 2)
expect_equal(stats$maxDepth, 5)
Expand All @@ -339,7 +341,7 @@ test_that("spark.decisionTree", {
if (windows_with_hadoop()) {
data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
source = "libsvm")
model <- spark.decisionTree(data, label ~ features, "classification")
model <- spark.decisionTree(data, label ~ features, "classification", seed = 45)
expect_equal(summary(model)$numFeatures, 4)
}

Expand All @@ -351,11 +353,11 @@ test_that("spark.decisionTree", {
traindf <- as.DataFrame(data[trainidxs, ])
testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 46)
predictions <- predict(model, testdf)
expect_error(collect(predictions))
model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
maxDepth = 5, maxBins = 16, handleInvalid = "keep")
maxDepth = 5, maxBins = 16, handleInvalid = "keep", seed = 46)
predictions <- predict(model, testdf)
expect_equal(class(collect(predictions)$clicked[1]), "character")
})
Expand Down
Loading

0 comments on commit 105e1b6

Please sign in to comment.