Commit 68ef3f5
Merge pull request apache#230 from palantir/rk/upstream-no-squash
Upstream merge
robert3005 authored Jul 14, 2017
2 parents: 6eae452 + d56829a
Showing 466 changed files with 11,621 additions and 9,075 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -25,6 +25,7 @@ R-unit-tests.log
 R/unit-tests.out
 R/cran-check.out
 R/pkg/vignettes/sparkr-vignettes.html
+R/pkg/tests/fulltests/Rplots.pdf
 build/*.jar
 build/apache-maven*
 build/scala*
2 changes: 1 addition & 1 deletion LICENSE
@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.4 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.6 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)
2 changes: 2 additions & 0 deletions R/pkg/NAMESPACE
@@ -429,6 +429,7 @@ export("structField",
        "structField.character",
        "print.structField",
        "structType",
+       "structType.character",
        "structType.jobj",
        "structType.structField",
        "print.structType")
@@ -465,5 +466,6 @@ S3method(print, summary.GBTRegressionModel)
 S3method(print, summary.GBTClassificationModel)
 S3method(structField, character)
 S3method(structField, jobj)
+S3method(structType, character)
 S3method(structType, jobj)
 S3method(structType, structField)
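
The newly exported structType.character method means a schema can now be written as a DDL-formatted string instead of being assembled from structField objects. A minimal sketch of the two equivalent spellings, assuming a running SparkR session:

  library(SparkR)
  sparkR.session()

  # Existing spelling: compose the schema from structField objects
  schema1 <- structType(structField("a", "integer"), structField("b", "string"))

  # New spelling: parse the schema from a DDL-formatted string
  schema2 <- structType("a INT, b STRING")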
36 changes: 32 additions & 4 deletions R/pkg/R/DataFrame.R
@@ -1391,6 +1391,10 @@ setMethod("summarize",
           })

 dapplyInternal <- function(x, func, schema) {
+  if (is.character(schema)) {
+    schema <- structType(schema)
+  }
+
   packageNamesArr <- serialize(.sparkREnv[[".packages"]],
                                connection = NULL)

@@ -1408,6 +1412,8 @@ dapplyInternal <- function(x, func, schema) {
   dataFrame(sdf)
 }

+setClassUnion("characterOrstructType", c("character", "structType"))
+
 #' dapply
 #'
 #' Apply a function to each partition of a SparkDataFrame.
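
The setClassUnion call is what lets a single S4 method accept either schema type: it defines a virtual parent class with character and structType as members, so one method signature covers both. A self-contained sketch of the same pattern, using a hypothetical stand-in class rather than SparkR's structType:

  library(methods)

  # Hypothetical stand-in for SparkR's structType class
  setClass("DemoStructType", representation(fields = "list"))

  # Virtual union class: dispatch matches either member class
  setClassUnion("characterOrDemoStructType", c("character", "DemoStructType"))

  setGeneric("schemaKind", function(schema) standardGeneric("schemaKind"))
  setMethod("schemaKind", "characterOrDemoStructType", function(schema) {
    if (is.character(schema)) "DDL string" else "structType object"
  })

  schemaKind("a INT, b STRING")     # "DDL string"
  schemaKind(new("DemoStructType")) # "structType object"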
@@ -1418,10 +1424,11 @@ dapplyInternal <- function(x, func, schema) {
 #'        to each partition will be passed.
 #'        The output of func should be a R data.frame.
 #' @param schema The schema of the resulting SparkDataFrame after the function is applied.
-#'               It must match the output of func.
+#'               It must match the output of func. Since Spark 2.3, the DDL-formatted string
+#'               is also supported for the schema.
 #' @family SparkDataFrame functions
 #' @rdname dapply
-#' @aliases dapply,SparkDataFrame,function,structType-method
+#' @aliases dapply,SparkDataFrame,function,characterOrstructType-method
 #' @name dapply
 #' @seealso \link{dapplyCollect}
 #' @export
@@ -1444,6 +1451,17 @@ dapplyInternal <- function(x, func, schema) {
 #'            y <- cbind(y, y[1] + 1L)
 #'          },
 #'          schema)
+#'
+#' # The schema also can be specified in a DDL-formatted string.
+#' schema <- "a INT, b DOUBLE, c STRING, d INT"
+#' df1 <- dapply(
+#'          df,
+#'          function(x) {
+#'            y <- x[x[1] > 1, ]
+#'            y <- cbind(y, y[1] + 1L)
+#'          },
+#'          schema)
+#'
 #' collect(df1)
 #' # the result
 #' # a b c d
@@ -1452,7 +1470,7 @@ dapplyInternal <- function(x, func, schema) {
 #' }
 #' @note dapply since 2.0.0
 setMethod("dapply",
-          signature(x = "SparkDataFrame", func = "function", schema = "structType"),
+          signature(x = "SparkDataFrame", func = "function", schema = "characterOrstructType"),
           function(x, func, schema) {
             dapplyInternal(x, func, schema)
           })
@@ -1522,6 +1540,7 @@ setMethod("dapplyCollect",
 #' @param schema the schema of the resulting SparkDataFrame after the function is applied.
 #'               The schema must match the output of \code{func}. It has to be defined for each
 #'               output column with preferred output column name and corresponding data type.
+#'               Since Spark 2.3, the DDL-formatted string is also supported for the schema.
 #' @return A SparkDataFrame.
 #' @family SparkDataFrame functions
 #' @aliases gapply,SparkDataFrame-method
@@ -1541,7 +1560,7 @@ setMethod("dapplyCollect",
 #'
 #' Here our output contains three columns, the key which is a combination of two
 #' columns with data types integer and string and the mean which is a double.
-#' schema <-  structType(structField("a", "integer"), structField("c", "string"),
+#' schema <- structType(structField("a", "integer"), structField("c", "string"),
 #'                      structField("avg", "double"))
 #' result <- gapply(
 #'   df,
@@ -1550,6 +1569,15 @@ setMethod("dapplyCollect",
 #'     y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
 #'   }, schema)
 #'
+#' The schema also can be specified in a DDL-formatted string.
+#' schema <- "a INT, c STRING, avg DOUBLE"
+#' result <- gapply(
+#'   df,
+#'   c("a", "c"),
+#'   function(key, x) {
+#'     y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
+#'   }, schema)
+#'
 #' We can also group the data and afterwards call gapply on GroupedData.
 #' For Example:
 #' gdf <- group_by(df, "a", "c")
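
The gapply change follows the same pattern as dapply. A hedged sketch of the DDL-string spelling from the docs above, reusing the df from the previous sketch:

  result <- gapply(
    df,
    c("a", "c"),
    function(key, x) {
      data.frame(key, mean(x$b), stringsAsFactors = FALSE)
    },
    "a INT, c STRING, avg DOUBLE")
  collect(result)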
[Diff truncated: the remaining 462 of the 466 changed files are not shown.]