Merge remote-tracking branch 'upstream/master' into pr-18338
gatorsmile committed Jun 30, 2017
2 parents f723cb6 + 528c928 commit 1f32ed7
Showing 394 changed files with 9,148 additions and 5,264 deletions.
6 changes: 1 addition & 5 deletions R/README.md
@@ -66,11 +66,7 @@ To run one of them, use `./bin/spark-submit <filename> <args>`. For example:
```bash
./bin/spark-submit examples/src/main/r/dataframe.R
```
You can also run the unit tests for SparkR by running. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
```bash
R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
./R/run-tests.sh
```
You can run R unit tests by following the instructions under [Running R Tests](http://spark.apache.org/docs/latest/building-spark.html#running-r-tests).

### Running on YARN

3 changes: 1 addition & 2 deletions R/WINDOWS.md
@@ -34,10 +34,9 @@ To run the SparkR unit tests on Windows, the following steps are required —ass

4. Set the environment variable `HADOOP_HOME` to the full path to the newly created `hadoop` directory.

5. Run unit tests for SparkR by running the command below. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:
5. Run unit tests for SparkR by running the command below. You need to install the needed packages following the instructions under [Running R Tests](http://spark.apache.org/docs/latest/building-spark.html#running-r-tests) first:

```
R -e "install.packages('testthat', repos='http://cran.us.r-project.org')"
.\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
```

2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
Package: SparkR
Type: Package
Version: 2.2.0
Version: 2.3.0
Title: R Frontend for Apache Spark
Description: The SparkR package provides an R Frontend for Apache Spark.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
4 changes: 3 additions & 1 deletion R/pkg/NAMESPACE
@@ -75,7 +75,8 @@ exportMethods("glm",
# Job group lifecycle management methods
export("setJobGroup",
"clearJobGroup",
"cancelJobGroup")
"cancelJobGroup",
"setJobDescription")

# Export Utility methods
export("setLogLevel")
@@ -357,6 +358,7 @@ exportMethods("%<=>%",
"to_utc_timestamp",
"translate",
"trim",
"trunc",
"unbase64",
"unhex",
"unix_timestamp",
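For context on the NAMESPACE change above, here is a minimal SparkR sketch of how the two newly exported names, `setJobDescription` and the `trunc` column function, might be used. The session setup, data, and description string are illustrative assumptions, not part of this change.

```r
# Illustrative sketch only; assumes a working Spark installation and SparkR on the library path.
library(SparkR)
sparkR.session()

# setJobDescription() attaches a human-readable label to jobs started from this thread,
# which shows up in the Spark UI.
setJobDescription("Truncate order dates to the start of the year")

# trunc() is the newly exported column function: truncate a date column to the given unit.
df <- createDataFrame(data.frame(d = as.Date(c("2017-06-30", "2017-01-15"))))
head(select(df, trunc(df$d, "year")))

sparkR.session.stop()
```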
44 changes: 29 additions & 15 deletions R/pkg/R/SQLContext.R
@@ -334,7 +334,7 @@ setMethod("toDF", signature(x = "RDD"),
#'
#' Loads a JSON file, returning the result as a SparkDataFrame
#' By default, (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
#' ) is supported. For JSON (one record per file), set a named property \code{wholeFile} to
#' ) is supported. For JSON (one record per file), set a named property \code{multiLine} to
#' \code{TRUE}.
#' It goes through the entire dataset once to determine the schema.
#'
@@ -348,7 +348,7 @@ setMethod("toDF", signature(x = "RDD"),
#' sparkR.session()
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' df <- read.json(path, wholeFile = TRUE)
#' df <- read.json(path, multiLine = TRUE)
#' df <- jsonFile(path)
#' }
#' @name read.json
@@ -584,7 +584,7 @@ tableToDF <- function(tableName) {
#'
#' @param path The path of files to load
#' @param source The name of external data source
#' @param schema The data schema defined in structType
#' @param schema The data schema defined in structType or a DDL-formatted string.
#' @param na.strings Default string value for NA when source is "csv"
#' @param ... additional external data source specific named properties.
#' @return SparkDataFrame
@@ -598,8 +598,10 @@ tableToDF <- function(tableName) {
#' df1 <- read.df("path/to/file.json", source = "json")
#' schema <- structType(structField("name", "string"),
#' structField("info", "map<string,double>"))
#' df2 <- read.df(mapTypeJsonPath, "json", schema, wholeFile = TRUE)
#' df2 <- read.df(mapTypeJsonPath, "json", schema, multiLine = TRUE)
#' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true")
#' stringSchema <- "name STRING, info MAP<STRING, DOUBLE>"
#' df4 <- read.df(mapTypeJsonPath, "json", stringSchema, multiLine = TRUE)
#' }
#' @name read.df
#' @method read.df default
@@ -623,14 +625,19 @@ read.df.default <- function(path = NULL, source = NULL, schema = NULL, na.string
if (source == "csv" && is.null(options[["nullValue"]])) {
options[["nullValue"]] <- na.strings
}
read <- callJMethod(sparkSession, "read")
read <- callJMethod(read, "format", source)
if (!is.null(schema)) {
stopifnot(class(schema) == "structType")
sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession,
source, schema$jobj, options)
} else {
sdf <- handledCallJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sparkSession,
source, options)
if (class(schema) == "structType") {
read <- callJMethod(read, "schema", schema$jobj)
} else if (is.character(schema)) {
read <- callJMethod(read, "schema", schema)
} else {
stop("schema should be structType or character.")
}
}
read <- callJMethod(read, "options", options)
sdf <- handledCallJMethod(read, "load")
dataFrame(sdf)
}
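
As a usage note on the rewritten `read.df.default` above, a short sketch of the two schema forms the new dispatch accepts; the file path and column names are hypothetical.

```r
# Sketch only; assumes an active SparkR session and a hypothetical JSON Lines
# file "people.json" with fields name (string) and age (double).
library(SparkR)
sparkR.session()

# Schema as a structType object (the form accepted before this change).
structSchema <- structType(structField("name", "string"),
                           structField("age", "double"))
df1 <- read.df("people.json", source = "json", schema = structSchema)

# Schema as a DDL-formatted string (newly accepted).
df2 <- read.df("people.json", source = "json", schema = "name STRING, age DOUBLE")

# Any other type now fails fast with "schema should be structType or character."
```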

@@ -717,8 +724,8 @@ read.jdbc <- function(url, tableName,
#' "spark.sql.sources.default" will be used.
#'
#' @param source The name of external data source
#' @param schema The data schema defined in structType, this is required for file-based streaming
#' data source
#' @param schema The data schema defined in structType or a DDL-formatted string, this is
#' required for file-based streaming data source
#' @param ... additional external data source specific named options, for instance \code{path} for
#' file-based streaming data source
#' @return SparkDataFrame
@@ -733,6 +740,8 @@ read.jdbc <- function(url, tableName,
#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp")
#'
#' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
#' stringSchema <- "name STRING, info MAP<STRING, DOUBLE>"
#' df1 <- read.stream("json", path = jsonDir, schema = stringSchema, maxFilesPerTrigger = 1)
#' }
#' @name read.stream
#' @note read.stream since 2.2.0
@@ -750,10 +759,15 @@ read.stream <- function(source = NULL, schema = NULL, ...) {
read <- callJMethod(sparkSession, "readStream")
read <- callJMethod(read, "format", source)
if (!is.null(schema)) {
stopifnot(class(schema) == "structType")
read <- callJMethod(read, "schema", schema$jobj)
if (class(schema) == "structType") {
read <- callJMethod(read, "schema", schema$jobj)
} else if (is.character(schema)) {
read <- callJMethod(read, "schema", schema)
} else {
stop("schema should be structType or character.")
}
}
read <- callJMethod(read, "options", options)
sdf <- handledCallJMethod(read, "load")
dataFrame(callJMethod(sdf, "toDF"))
dataFrame(sdf)
}
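
Similarly for `read.stream`: a hedged sketch of passing a DDL-formatted string schema to a file-based streaming source; the input directory and the console sink usage are assumptions for illustration.

```r
# Sketch only; assumes an active SparkR session and a hypothetical directory
# "stream-in/" that receives JSON Lines files.
library(SparkR)
sparkR.session()

# File-based streaming sources require a schema; a DDL-formatted string now works too.
sdf <- read.stream("json", path = "stream-in/",
                   schema = "name STRING, info MAP<STRING, DOUBLE>")
isStreaming(sdf)  # TRUE

# Echo incoming rows to the console, then stop the query.
q <- write.stream(sdf, "console", outputMode = "append")
stopQuery(q)
```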
