Merge remote-tracking branch 'upstream/master' into pr-18338
gatorsmile committed Jun 30, 2017
2 parents f723cb6 + 528c928 commit 1f32ed7
Showing 394 changed files with 9,148 additions and 5,264 deletions.
6 changes: 1 addition & 5 deletions R/README.md
@@ -66,11 +66,7 @@ To run one of them, use `./bin/spark-submit <filename> <args>`. For example:
```bash
./bin/spark-submit examples/src/main/r/dataframe.R
```
You can also run the unit tests for SparkR by running. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
```bash
R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
./R/run-tests.sh
```
You can run R unit tests by following the instructions under [Running R Tests](http://spark.apache.org/docs/latest/building-spark.html#running-r-tests).

### Running on YARN

3 changes: 1 addition & 2 deletions R/WINDOWS.md
@@ -34,10 +34,9 @@ To run the SparkR unit tests on Windows, the following steps are required —ass

4. Set the environment variable `HADOOP_HOME` to the full path to the newly created `hadoop` directory.

5. Run unit tests for SparkR by running the command below. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
5. Run unit tests for SparkR by running the command below. You need to install the needed packages following the instructions under [Running R Tests](http://spark.apache.org/docs/latest/building-spark.html#running-r-tests) first:

```
R -e "install.packages('testthat', repos='http://cran.us.r-project.org')"
.\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
```

2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
Package: SparkR
Type: Package
Version: 2.2.0
Version: 2.3.0
Title: R Frontend for Apache Spark
Description: The SparkR package provides an R Frontend for Apache Spark.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
4 changes: 3 additions & 1 deletion R/pkg/NAMESPACE
@@ -75,7 +75,8 @@ exportMethods("glm",
# Job group lifecycle management methods
export("setJobGroup",
"clearJobGroup",
"cancelJobGroup")
"cancelJobGroup",
"setJobDescription")

# Export Utility methods
export("setLogLevel")
@@ -357,6 +358,7 @@ exportMethods("%<=>%",
"to_utc_timestamp",
"translate",
"trim",
"trunc",
"unbase64",
"unhex",
"unix_timestamp",
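For context on the NAMESPACE change above, here is a minimal SparkR sketch of how the two newly exported names, `setJobDescription` and the `trunc` column function, might be used. The session setup, data, and description string are illustrative assumptions, not part of this change.

```r
# Illustrative sketch only; assumes a working Spark installation and SparkR on the library path.
library(SparkR)
sparkR.session()

# setJobDescription() attaches a human-readable label to jobs started from this thread,
# which shows up in the Spark UI.
setJobDescription("Truncate order dates to the start of the year")

# trunc() is the newly exported column function: truncate a date column to the given unit.
df <- createDataFrame(data.frame(d = as.Date(c("2017-06-30", "2017-01-15"))))
head(select(df, trunc(df$d, "year")))

sparkR.session.stop()
```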
44 changes: 29 additions & 15 deletions R/pkg/R/SQLContext.R
@@ -334,7 +334,7 @@ setMethod("toDF", signature(x = "RDD"),
#'
#' Loads a JSON file, returning the result as a SparkDataFrame
#' By default, (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
#' ) is supported. For JSON (one record per file), set a named property \code{wholeFile} to
#' ) is supported. For JSON (one record per file), set a named property \code{multiLine} to
#' \code{TRUE}.
#' It goes through the entire dataset once to determine the schema.
#'
@@ -348,7 +348,7 @@ setMethod("toDF", signature(x = "RDD"),
#' sparkR.session()
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' df <- read.json(path, wholeFile = TRUE)
#' df <- read.json(path, multiLine = TRUE)
#' df <- jsonFile(path)
#' }
#' @name read.json
@@ -584,7 +584,7 @@ tableToDF <- function(tableName) {
#'
#' @param path The path of files to load
#' @param source The name of external data source
#' @param schema The data schema defined in structType
#' @param schema The data schema defined in structType or a DDL-formatted string.
#' @param na.strings Default string value for NA when source is "csv"
#' @param ... additional external data source specific named properties.
#' @return SparkDataFrame
@@ -598,8 +598,10 @@ tableToDF <- function(tableName) {
#' df1 <- read.df("path/to/file.json", source = "json")
#' schema <- structType(structField("name", "string"),
#' structField("info", "map<string,double>"))
#' df2 <- read.df(mapTypeJsonPath, "json", schema, wholeFile = TRUE)
#' df2 <- read.df(mapTypeJsonPath, "json", schema, multiLine = TRUE)
#' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true")
#' stringSchema <- "name STRING, info MAP<STRING, DOUBLE>"
#' df4 <- read.df(mapTypeJsonPath, "json", stringSchema, multiLine = TRUE)
#' }
#' @name read.df
#' @method read.df default
@@ -623,14 +625,19 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.string
if (source == "csv" && is.null(options[["nullValue"]])) {
options[["nullValue"]] <- na.strings
}
read <- callJMethod(sparkSession, "read")
read <- callJMethod(read, "format", source)
if (!is.null(schema)) {
stopifnot(class(schema) == "structType")
sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession,
source, schema$jobj, options)
} else {
sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession,
source, options)
if (class(schema) == "structType") {
read <- callJMethod(read, "schema", schema$jobj)
} else if (is.character(schema)) {
read <- callJMethod(read, "schema", schema)
} else {
stop("schema should be structType or character.")
}
}
read <- callJMethod(read, "options", options)
sdf <- handledCallJMethod(read, "load")
dataFrame(sdf)
}
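
As a usage note on the rewritten `read.df.default` above, a short sketch of the two schema forms the new dispatch accepts; the file path and column names are hypothetical.

```r
# Sketch only; assumes an active SparkR session and a hypothetical JSON Lines
# file "people.json" with fields name (string) and age (double).
library(SparkR)
sparkR.session()

# Schema as a structType object (the form accepted before this change).
structSchema <- structType(structField("name", "string"),
                           structField("age", "double"))
df1 <- read.df("people.json", source = "json", schema = structSchema)

# Schema as a DDL-formatted string (newly accepted).
df2 <- read.df("people.json", source = "json", schema = "name STRING, age DOUBLE")

# Any other type now fails fast with "schema should be structType or character."
```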

@@ -717,8 +724,8 @@ read.jdbc <- function(url, tableName,
#' "spark.sql.sources.default" will be used.
#'
#' @param source The name of external data source
#' @param schema The data schema defined in structType, this is required for file-based streaming
#' data source
#' @param schema The data schema defined in structType or a DDL-formatted string, this is
#' required for file-based streaming data source
#' @param ... additional external data source specific named options, for instance \code{path} for
#' file-based streaming data source
#' @return SparkDataFrame
@@ -733,6 +740,8 @@ read.jdbc <- function(url, tableName,
#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp")
#'
#' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
#' stringSchema <- "name STRING, info MAP<STRING, DOUBLE>"
#' df1 <- read.stream("json", path = jsonDir, schema = stringSchema, maxFilesPerTrigger = 1)
#' }
#' @name read.stream
#' @note read.stream since 2.2.0
@@ -750,10 +759,15 @@ read.stream <- function(source = NULL, schema = NULL, ...) {
read <- callJMethod(sparkSession, "readStream")
read <- callJMethod(read, "format", source)
if (!is.null(schema)) {
stopifnot(class(schema) == "structType")
read <- callJMethod(read, "schema", schema$jobj)
if (class(schema) == "structType") {
read <- callJMethod(read, "schema", schema$jobj)
} else if (is.character(schema)) {
read <- callJMethod(read, "schema", schema)
} else {
stop("schema should be structType or character.")
}
}
read <- callJMethod(read, "options", options)
sdf <- handledCallJMethod(read, "load")
dataFrame(callJMethod(sdf, "toDF"))
dataFrame(sdf)
}
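
Similarly for `read.stream`: a hedged sketch of passing a DDL-formatted string schema to a file-based streaming source; the input directory and the console sink usage are assumptions for illustration.

```r
# Sketch only; assumes an active SparkR session and a hypothetical directory
# "stream-in/" that receives JSON Lines files.
library(SparkR)
sparkR.session()

# File-based streaming sources require a schema; a DDL-formatted string now works too.
sdf <- read.stream("json", path = "stream-in/",
                   schema = "name STRING, info MAP<STRING, DOUBLE>")
isStreaming(sdf)  # TRUE

# Echo incoming rows to the console, then stop the query.
q <- write.stream(sdf, "console", outputMode = "append")
stopQuery(q)
```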
