From 5fd20b66ffe18c05cf257af7f30d32464d2fe8e7 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Thu, 16 Jun 2016 19:39:33 -0700 Subject: [PATCH] [SPARK-15490][R][DOC] SparkR 2.0 QA: New R APIs and API docs for non-MLib changes ## What changes were proposed in this pull request? R Docs changes include typos, format, layout. ## How was this patch tested? Test locally. Author: Kai Jiang Closes #13394 from vectorijk/spark-15490. --- R/pkg/R/DataFrame.R | 91 +++++++++++++++++++++++++------------------- R/pkg/R/RDD.R | 14 ++++--- R/pkg/R/WindowSpec.R | 7 ++-- R/pkg/R/broadcast.R | 8 ++-- R/pkg/R/column.R | 6 ++- R/pkg/R/context.R | 41 ++++++++++---------- R/pkg/R/functions.R | 2 +- R/pkg/R/group.R | 6 ++- R/pkg/R/mllib.R | 34 +++++++++++------ R/pkg/R/utils.R | 2 + 10 files changed, 123 insertions(+), 88 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 9a9b3f7ecae16..d72cbbd79e817 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -23,9 +23,11 @@ NULL setOldClass("jobj") setOldClass("structType") -#' @title S4 class that represents a SparkDataFrame -#' @description DataFrames can be created using functions like \link{createDataFrame}, -#' \link{read.json}, \link{table} etc. +#' S4 class that represents a SparkDataFrame +#' +#' DataFrames can be created using functions like \link{createDataFrame}, +#' \link{read.json}, \link{table} etc. +#' #' @family SparkDataFrame functions #' @rdname SparkDataFrame #' @docType class @@ -629,8 +631,6 @@ setMethod("repartition", #' #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects -#' @family SparkDataFrame functions -#' @rdname tojson #' @noRd #' @examples #'\dontrun{ @@ -648,7 +648,7 @@ setMethod("toJSON", RDD(jrdd, serializedMode = "string") }) -#' write.json +#' Save the contents of SparkDataFrame as a JSON file #' #' Save the contents of a SparkDataFrame as a JSON file (one object per line). 
Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). @@ -675,7 +675,7 @@ setMethod("write.json", invisible(callJMethod(write, "json", path)) }) -#' write.parquet +#' Save the contents of SparkDataFrame as a Parquet file, preserving the schema. #' #' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out #' with this method can be read back in as a SparkDataFrame using read.parquet(). @@ -713,9 +713,9 @@ setMethod("saveAsParquetFile", write.parquet(x, path) }) -#' write.text +#' Save the content of SparkDataFrame in a text file at the specified path. #' -#' Saves the content of the SparkDataFrame in a text file at the specified path. +#' Save the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". #' Each row becomes a new line in the output file. #' @@ -820,8 +820,6 @@ setMethod("sample_frac", sample(x, withReplacement, fraction, seed) }) -#' nrow -#' #' Returns the number of rows in a SparkDataFrame #' #' @param x A SparkDataFrame @@ -874,6 +872,8 @@ setMethod("ncol", length(columns(x)) }) +#' Returns the dimensions of SparkDataFrame +#' #' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame #' @@ -2012,8 +2012,9 @@ setMethod("join", dataFrame(sdf) }) +#' Merges two data frames +#' #' @name merge -#' @title Merges two data frames #' @param x the first data frame to be joined #' @param y the second data frame to be joined #' @param by a character vector specifying the join columns. If by is not @@ -2127,7 +2128,6 @@ setMethod("merge", joinRes }) -#' #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. 
#' @@ -2182,8 +2182,9 @@ setMethod("unionAll", dataFrame(unioned) }) -#' @title Union two or more SparkDataFrames -#' @description Returns a new SparkDataFrame containing rows of all parameters. +#' Union two or more SparkDataFrames +#' +#' Returns a new SparkDataFrame containing rows of all parameters. #' #' @rdname rbind #' @name rbind @@ -2254,20 +2255,22 @@ setMethod("except", dataFrame(excepted) }) -#' Save the contents of the SparkDataFrame to a data source +#' Save the contents of SparkDataFrame to a data source. #' #' The data source is specified by the `source` and a set of options (...). #' If `source` is not specified, the default data source configured by #' spark.sql.sources.default will be used. #' -#' Additionally, mode is used to specify the behavior of the save operation when -#' data already exists in the data source. There are four modes: \cr -#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this -#' SparkDataFrame. \cr -#' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the SparkDataFrame -#' and to not change the existing data. \cr +#' Additionally, mode is used to specify the behavior of the save operation when data already +#' exists in the data source. There are four modes: +#' \itemize{ +#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data. +#' \item overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. +#' \item error: An exception is expected to be thrown. +#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame +#' and to not change the existing data. +#' } #' #' @param df A SparkDataFrame #' @param path A name for the table @@ -2315,8 +2318,6 @@ setMethod("saveDF", write.df(df, path, source, mode, ...) 
}) -#' saveAsTable -#' #' Save the contents of the SparkDataFrame to a data source as a table #' #' The data source is specified by the `source` and a set of options (...). @@ -2543,11 +2544,12 @@ setMethod("fillna", dataFrame(sdf) }) +#' Download data from a SparkDataFrame into a data.frame +#' #' This function downloads the contents of a SparkDataFrame into an R's data.frame. #' Since data.frames are held in memory, ensure that you have enough memory #' in your system to accommodate the contents. #' -#' @title Download data from a SparkDataFrame into a data.frame #' @param x a SparkDataFrame #' @return a data.frame #' @family SparkDataFrame functions @@ -2563,13 +2565,14 @@ setMethod("as.data.frame", as.data.frame(collect(x), row.names, optional, ...) }) +#' Attach SparkDataFrame to R search path +#' #' The specified SparkDataFrame is attached to the R search path. This means that #' the SparkDataFrame is searched by R when evaluating a variable, so columns in #' the SparkDataFrame can be accessed by simply giving their names. #' #' @family SparkDataFrame functions #' @rdname attach -#' @title Attach SparkDataFrame to R search path #' @param what (SparkDataFrame) The SparkDataFrame to attach #' @param pos (integer) Specify position in search() where to attach. #' @param name (character) Name to use for the attached SparkDataFrame. Names @@ -2589,6 +2592,8 @@ setMethod("attach", attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) +#' Evaluate a R expression in an environment constructed from a SparkDataFrame +#' #' Evaluate a R expression in an environment constructed from a SparkDataFrame #' with() allows access to columns of a SparkDataFrame by simply referring to #' their name. It appends every column of a SparkDataFrame into a new @@ -2596,7 +2601,7 @@ setMethod("attach", #' environment. 
#' #' @rdname with -#' @title Evaluate a R expression in an environment constructed from a SparkDataFrame +#' @family SparkDataFrame functions #' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment. #' @param expr (expression) Expression to evaluate. #' @param ... arguments to be passed to future methods. @@ -2612,10 +2617,12 @@ setMethod("with", eval(substitute(expr), envir = newEnv, enclos = newEnv) }) +#' Compactly display the structure of a dataset +#' #' Display the structure of a SparkDataFrame, including column names, column types, as well as a #' a small sample of rows. +#' #' @name str -#' @title Compactly display the structure of a dataset #' @rdname str #' @family SparkDataFrame functions #' @param object a SparkDataFrame @@ -2728,10 +2735,11 @@ setMethod("drop", base::drop(x) }) +#' Compute histogram statistics for given column +#' #' This function computes a histogram for a given SparkR Column. #' #' @name histogram -#' @title Histogram #' @param nbins the number of bins (optional). Default value is 10. #' @param df the SparkDataFrame containing the Column to build the histogram from. #' @param colname the name of the column to build the histogram from. @@ -2847,18 +2855,21 @@ setMethod("histogram", return(histStats) }) -#' Saves the content of the SparkDataFrame to an external database table via JDBC +#' Save the content of SparkDataFrame to an external database table via JDBC. #' -#' Additional JDBC database connection properties can be set (...) +#' Save the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC +#' database connection properties can be set (...) #' #' Also, mode is used to specify the behavior of the save operation when -#' data already exists in the data source. There are four modes: \cr -#' append: Contents of this SparkDataFrame are expected to be appended to existing data. 
\cr -#' overwrite: Existing data is expected to be overwritten by the contents of this -#' SparkDataFrame. \cr -#' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the SparkDataFrame -#' and to not change the existing data. \cr +#' data already exists in the data source. There are four modes: +#' \itemize{ +#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data. +#' \item overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. +#' \item error: An exception is expected to be thrown. +#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame +#' and to not change the existing data. +#' } #' #' @param x A SparkDataFrame #' @param url JDBC database url of the form `jdbc:subprotocol:subname` diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index f1badf4364da0..72a805256523e 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -19,9 +19,11 @@ setOldClass("jobj") -#' @title S4 class that represents an RDD -#' @description RDD can be created using functions like +#' S4 class that represents an RDD +#' +#' RDD can be created using functions like #' \code{parallelize}, \code{textFile} etc. +#' #' @rdname RDD #' @seealso parallelize, textFile #' @slot env An R environment that stores bookkeeping states of the RDD @@ -497,9 +499,9 @@ setMethod("map", lapply(X, FUN) }) -#' Flatten results after apply a function to all elements +#' Flatten results after applying a function to all elements #' -#' This function return a new RDD by first applying a function to all +#' This function returns a new RDD by first applying a function to all #' elements of this RDD, and then flattening the results. #' #' @param X The RDD to apply the transformation. @@ -713,7 +715,7 @@ setMethod("sumRDD", reduce(x, "+") }) -#' Applies a function to all elements in an RDD, and force evaluation. 
+#' Applies a function to all elements in an RDD, and forces evaluation. #' #' @param x The RDD to apply the function #' @param func The function to be applied. @@ -737,7 +739,7 @@ setMethod("foreach", invisible(collect(mapPartitions(x, partition.func))) }) -#' Applies a function to each partition in an RDD, and force evaluation. +#' Applies a function to each partition in an RDD, and forces evaluation. #' #' @examples #'\dontrun{ diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 581176a6c0918..d8405420d0a49 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -20,9 +20,10 @@ #' @include generics.R jobj.R column.R NULL -#' @title S4 class that represents a WindowSpec -#' @description WindowSpec can be created by using window.partitionBy() -#' or window.orderBy() +#' S4 class that represents a WindowSpec +#' +#' WindowSpec can be created by using window.partitionBy() or window.orderBy() +#' #' @rdname WindowSpec #' @seealso \link{window.partitionBy}, \link{window.orderBy} #' diff --git a/R/pkg/R/broadcast.R b/R/pkg/R/broadcast.R index 38f0eed95e065..398dffc4ab1b4 100644 --- a/R/pkg/R/broadcast.R +++ b/R/pkg/R/broadcast.R @@ -23,9 +23,11 @@ .broadcastValues <- new.env() .broadcastIdToName <- new.env() -# @title S4 class that represents a Broadcast variable -# @description Broadcast variables can be created using the broadcast -# function from a \code{SparkContext}. +# S4 class that represents a Broadcast variable +# +# Broadcast variables can be created using the broadcast +# function from a \code{SparkContext}. 
+# # @rdname broadcast-class # @seealso broadcast # diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 873e8b1665a28..cc2876ed94b7f 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -22,8 +22,10 @@ NULL setOldClass("jobj") -#' @title S4 class that represents a SparkDataFrame column -#' @description The column class supports unary, binary operations on SparkDataFrame columns +#' S4 class that represents a SparkDataFrame column +#' +#' The column class supports unary, binary operations on SparkDataFrame columns +#' #' @rdname column #' #' @slot jc reference to JVM SparkDataFrame column diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 44bca877fd45a..5c886030ff5c5 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -173,9 +173,8 @@ includePackage <- function(sc, pkg) { .sparkREnv$.packages <- packages } -#' @title Broadcast a variable to all workers +#' Broadcast a variable to all workers #' -#' @description #' Broadcast a read-only variable to the cluster, returning a \code{Broadcast} #' object for reading it in distributed functions. #' @@ -207,7 +206,7 @@ broadcast <- function(sc, object) { Broadcast(id, object, jBroadcast, objName) } -#' @title Set the checkpoint directory +#' Set the checkpoint directory #' #' Set the directory under which RDDs are going to be checkpointed. The #' directory must be a HDFS path if running on a cluster. @@ -226,30 +225,31 @@ setCheckpointDir <- function(sc, dirName) { invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName)))) } -#' @title Run a function over a list of elements, distributing the computations with Spark. +#' Run a function over a list of elements, distributing the computations with Spark. #' -#' @description #' Applies a function in a manner that is similar to doParallel or lapply to elements of a list. #' The computations are distributed using Spark. 
It is conceptually the same as the following code: #' lapply(list, func) #' #' Known limitations: -#' - variable scoping and capture: compared to R's rich support for variable resolutions, the -# distributed nature of SparkR limits how variables are resolved at runtime. All the variables -# that are available through lexical scoping are embedded in the closure of the function and -# available as read-only variables within the function. The environment variables should be -# stored into temporary variables outside the function, and not directly accessed within the -# function. +#' \itemize{ +#' \item variable scoping and capture: compared to R's rich support for variable resolutions, +#' the distributed nature of SparkR limits how variables are resolved at runtime. All the +#' variables that are available through lexical scoping are embedded in the closure of the +#' function and available as read-only variables within the function. The environment variables +#' should be stored into temporary variables outside the function, and not directly accessed +#' within the function. #' -#' - loading external packages: In order to use a package, you need to load it inside the -#' closure. For example, if you rely on the MASS module, here is how you would use it: -#'\dontrun{ -#' train <- function(hyperparam) { -#' library(MASS) -#' lm.ridge(“y ~ x+z”, data, lambda=hyperparam) -#' model +#' \item loading external packages: In order to use a package, you need to load it inside the +#' closure. 
For example, if you rely on the MASS module, here is how you would use it: +#' \preformatted{ +#' train <- function(hyperparam) { +#' library(MASS) +#' lm.ridge("y ~ x+z", data, lambda=hyperparam) +#' model +#' } +#' } #' } -#'} #' #' @rdname spark.lapply #' @param sc Spark Context to use @@ -259,7 +259,8 @@ setCheckpointDir <- function(sc, dirName) { #' @export #' @examples #'\dontrun{ -#' doubled <- spark.lapply(1:10, function(x){2 * x}) +#' sc <- sparkR.init() +#' doubled <- spark.lapply(sc, 1:10, function(x){2 * x}) #'} spark.lapply <- function(sc, list, func) { rdd <- parallelize(sc, list, length(list)) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 2665d1d477802..a779127b379a0 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2185,7 +2185,7 @@ setMethod("from_unixtime", signature(x = "Column"), #' # 09:01:15-09:02:15... #' window(df$time, "1 minute", startTime = "15 seconds") #' -#' # Thirty second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... +#' # Thirty-second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... #' window(df$time, "30 seconds", "10 seconds") #'} setMethod("window", signature(x = "Column"), diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index b7047769175a3..eba083fe4b124 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -22,8 +22,10 @@ NULL setOldClass("jobj") -#' @title S4 class that represents a GroupedData -#' @description GroupedDatas can be created using groupBy() on a SparkDataFrame +#' S4 class that represents a GroupedData +#' +#' GroupedDatas can be created using groupBy() on a SparkDataFrame +#' #' @rdname GroupedData #' @seealso groupBy #' diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index d4152b43b6f5f..ba2eee2fca76a 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -25,22 +25,26 @@ # - a set of methods that reflect the arguments of the other languages supported by Spark.
These # methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc. -#' @title S4 class that represents a generalized linear model +#' S4 class that represents a generalized linear model +#' #' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper #' @export setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) -#' @title S4 class that represents a NaiveBayesModel +#' S4 class that represents a NaiveBayesModel +#' #' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper #' @export setClass("NaiveBayesModel", representation(jobj = "jobj")) -#' @title S4 class that represents a AFTSurvivalRegressionModel +#' S4 class that represents an AFTSurvivalRegressionModel +#' #' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper #' @export setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) -#' @title S4 class that represents a KMeansModel +#' S4 class that represents a KMeansModel +#' #' @param jobj a Java object reference to the backing Scala KMeansModel #' @export setClass("KMeansModel", representation(jobj = "jobj")) @@ -197,7 +201,7 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { invisible(x) } -#' Make predictions from a generalized linear model +#' Predicted values based on model #' #' Makes predictions from a generalized linear model produced by glm() or spark.glm(), #' similarly to R's predict(). @@ -218,9 +222,9 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Make predictions from a naive Bayes model +#' Predicted values based on model #' -#' Makes predictions from a model produced by spark.naiveBayes(), +#' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), #' similarly to R package e1071's predict.
#' #' @param object A fitted naive Bayes model @@ -357,9 +361,9 @@ setMethod("summary", signature(object = "KMeansModel"), cluster = cluster, is.loaded = is.loaded)) }) -#' Make predictions from a k-means model +#' Predicted values based on model #' -#' Make predictions from a model produced by spark.kmeans(). +#' Makes predictions from a k-means model or a model produced by spark.kmeans(). #' #' @param object A fitted k-means model #' @param newData SparkDataFrame for testing @@ -402,6 +406,8 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form return(new("NaiveBayesModel", jobj = jobj)) }) +#' Save fitted MLlib model to the input path +#' #' Save the Bernoulli naive Bayes model to the input path. #' #' @param object A fitted Bernoulli naive Bayes model @@ -428,6 +434,8 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"), invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the AFT survival regression model to the input path. #' #' @param object A fitted AFT survival regression model @@ -453,6 +461,8 @@ setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "c invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the generalized linear model to the input path. #' #' @param object A fitted generalized linear model @@ -478,6 +488,8 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the k-means model to the input path. 
#' #' @param object A fitted k-means model @@ -582,9 +594,9 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), return(list(coefficients = coefficients)) }) -#' Make predictions from an AFT survival regression model +#' Predicted values based on model #' -#' Make predictions from a model produced by spark.survreg(), +#' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(), #' similarly to R package survival's predict. #' #' @param object A fitted AFT survival regression model diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 12e4f4f1ae8bb..b1b8adaa66a25 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -110,9 +110,11 @@ isRDD <- function(name, env) { #' @return the hash code as an integer #' @export #' @examples +#'\dontrun{ #' hashCode(1L) # 1 #' hashCode(1.0) # 1072693248 #' hashCode("1") # 49 +#'} hashCode <- function(key) { if (class(key) == "integer") { as.integer(key[[1]])