From 789e1270f2ab809d9b1cc9f2a89dd1cc27b8067f Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sat, 28 May 2016 19:09:22 -0700 Subject: [PATCH 1/9] QA for non-MLlib changes --- R/pkg/R/DataFrame.R | 70 +++++++++++++++++++++++++++++---------------- R/pkg/R/RDD.R | 8 +++--- R/pkg/R/column.R | 2 ++ R/pkg/R/context.R | 30 ++++++++++--------- R/pkg/R/functions.R | 2 +- R/pkg/R/mllib.R | 18 +++++------- R/pkg/R/stats.R | 37 +++++++++++------------- 7 files changed, 92 insertions(+), 75 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0ff350d44d4b3..c5d120b26dfb7 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -630,8 +630,9 @@ setMethod("repartition", #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects #' @family SparkDataFrame functions -#' @rdname tojson -#' @noRd +#' @rdname toJSON +#' @name toJSON +#' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() @@ -852,6 +853,8 @@ setMethod("nrow", count(x) }) +#' ncol +#' #' Returns the number of columns in a SparkDataFrame #' #' @param x a SparkDataFrame @@ -874,6 +877,8 @@ setMethod("ncol", length(columns(x)) }) +#' dim +#' #' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame #' @@ -895,6 +900,8 @@ setMethod("dim", c(count(x), ncol(x)) }) +#' collect +#' #' Collects all the elements of a SparkDataFrame and coerces them into an R data.frame. #' #' @param x A SparkDataFrame @@ -992,6 +999,8 @@ setMethod("limit", dataFrame(res) }) +#' take +#' #' Take the first NUM rows of a SparkDataFrame and return a the results as a data.frame #' #' @family SparkDataFrame functions @@ -1042,6 +1051,8 @@ setMethod("head", take(x, num) }) +#' first +#' #' Return the first row of a SparkDataFrame #' #' @param x A SparkDataFrame @@ -1070,7 +1081,10 @@ setMethod("first", #' #' @param x A SparkDataFrame #' -#' @noRd +#' @family SparkDataFrame functions +#' @rdname toRDD +#' @name toRDD +#' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() @@ -2047,6 +2061,7 @@ setMethod("merge", joinRes }) +#' generateAliasesForIntersectedCols #' #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. @@ -2174,20 +2189,22 @@ setMethod("except", dataFrame(excepted) }) -#' Save the contents of the SparkDataFrame to a data source +#' write.df #' -#' The data source is specified by the `source` and a set of options (...). -#' If `source` is not specified, the default data source configured by -#' spark.sql.sources.default will be used. +#' Save the contents of the SparkDataFrame to a data source. The data source is specified by the +#' `source` and a set of options (...). If `source` is not specified, the default data source +#' configured by spark.sql.sources.default will be used. #' -#' Additionally, mode is used to specify the behavior of the save operation when -#' data already exists in the data source. There are four modes: \cr -#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this -#' SparkDataFrame. \cr -#' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the SparkDataFrame -#' and to not change the existing data. \cr +#' Additionally, mode is used to specify the behavior of the save operation when data already +#' exists in the data source. 
There are four modes: +#' \itemize{ +#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data. +#' \item overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. +#' \item error: An exception is expected to be thrown. +#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame +#' and to not change the existing data. +#' } #' #' @param df A SparkDataFrame #' @param path A name for the table @@ -2515,7 +2532,9 @@ setMethod("attach", #' environment. Then, the given expression is evaluated in this new #' environment. #' +#' @title with #' @rdname with +#' @family SparkDataFrame functions #' @title Evaluate a R expression in an environment constructed from a SparkDataFrame #' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment. #' @param expr (expression) Expression to evaluate. @@ -2767,18 +2786,21 @@ setMethod("histogram", return(histStats) }) -#' Saves the content of the SparkDataFrame to an external database table via JDBC +#' write.jdbc #' -#' Additional JDBC database connection properties can be set (...) +#' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC +#' database connection properties can be set (...) #' #' Also, mode is used to specify the behavior of the save operation when -#' data already exists in the data source. There are four modes: \cr -#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this -#' SparkDataFrame. \cr -#' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the SparkDataFrame -#' and to not change the existing data. \cr +#' data already exists in the data source. There are four modes: +#' \itemize{ +#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data. +#' \item overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. +#' \item error: An exception is expected to be thrown. +#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame +#' and to not change the existing data. +#' } #' #' @param x A SparkDataFrame #' @param url JDBC database url of the form `jdbc:subprotocol:subname` diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index f1badf4364da0..fded00b40a08c 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -497,9 +497,9 @@ setMethod("map", lapply(X, FUN) }) -#' Flatten results after apply a function to all elements +#' Flatten results after applying a function to all elements #' -#' This function return a new RDD by first applying a function to all +#' This function returns a new RDD by first applying a function to all #' elements of this RDD, and then flattening the results. #' #' @param X The RDD to apply the transformation. @@ -713,7 +713,7 @@ setMethod("sumRDD", reduce(x, "+") }) -#' Applies a function to all elements in an RDD, and force evaluation. +#' Applies a function to all elements in an RDD, and forces evaluation. #' #' @param x The RDD to apply the function #' @param func The function to be applied. @@ -737,7 +737,7 @@ setMethod("foreach", invisible(collect(mapPartitions(x, partition.func))) }) -#' Applies a function to each partition in an RDD, and force evaluation. +#' Applies a function to each partition in an RDD, and forces evaluation. 
#' #' @examples #'\dontrun{ diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 873e8b1665a28..1c195c10633e3 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -204,6 +204,8 @@ setMethod("between", signature(x = "Column"), } }) +#' cast +#' #' Casts the column to a different data type. #' #' @rdname cast diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 44bca877fd45a..38a4e21acbc04 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -234,22 +234,24 @@ setCheckpointDir <- function(sc, dirName) { #' lapply(list, func) #' #' Known limitations: -#' - variable scoping and capture: compared to R's rich support for variable resolutions, the -# distributed nature of SparkR limits how variables are resolved at runtime. All the variables -# that are available through lexical scoping are embedded in the closure of the function and -# available as read-only variables within the function. The environment variables should be -# stored into temporary variables outside the function, and not directly accessed within the -# function. +#' \itemize{ +#' \item variable scoping and capture: compared to R's rich support for variable resolutions, +#' the distributed nature of SparkR limits how variables are resolved at runtime. All the +#' variables that are available through lexical scoping are embedded in the closure of the +#' function and available as read-only variables within the function. The environment variables +#' should be stored into temporary variables outside the function, and not directly accessed +#' within the function. #' -#' - loading external packages: In order to use a package, you need to load it inside the -#' closure. For example, if you rely on the MASS module, here is how you would use it: -#'\dontrun{ -#' train <- function(hyperparam) { -#' library(MASS) -#' lm.ridge(“y ~ x+z”, data, lambda=hyperparam) -#' model +#' \item loading external packages: In order to use a package, you need to load it inside the +#' closure. For example, if you rely on the MASS module, here is how you would use it: +#' \preformatted{ +#' train <- function(hyperparam) { +#' library(MASS) +#' lm.ridge(“y ~ x+z”, data, lambda=hyperparam) +#' model +#' } +#' } #' } -#'} #' #' @rdname spark.lapply #' @param sc Spark Context to use diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 2665d1d477802..a779127b379a0 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2185,7 +2185,7 @@ setMethod("from_unixtime", signature(x = "Column"), #' # 09:01:15-09:02:15... #' window(df$time, "1 minute", startTime = "15 seconds") #' -#' # Thirty second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... +#' # Thirty-second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... #' window(df$time, "30 seconds", "10 seconds") #'} setMethod("window", signature(x = "Column"), diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index d4152b43b6f5f..380e942c12925 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -197,11 +197,10 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { invisible(x) } -#' Make predictions from a generalized linear model -#' #' Makes predictions from a generalized linear model produced by glm() or spark.glm(), #' similarly to R's predict(). 
#' +#' @title predict #' @param object A fitted generalized linear model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -218,11 +217,10 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Make predictions from a naive Bayes model -#' -#' Makes predictions from a model produced by spark.naiveBayes(), +#' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), #' similarly to R package e1071's predict. #' +#' @title predict #' @param object A fitted naive Bayes model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -357,10 +355,9 @@ setMethod("summary", signature(object = "KMeansModel"), cluster = cluster, is.loaded = is.loaded)) }) -#' Make predictions from a k-means model -#' -#' Make predictions from a model produced by spark.kmeans(). +#' Makes predictions from a k-means model or a model produced by spark.kmeans(). #' +#' @title predict #' @param object A fitted k-means model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -582,11 +579,10 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), return(list(coefficients = coefficients)) }) -#' Make predictions from an AFT survival regression model -#' -#' Make predictions from a model produced by spark.survreg(), +#' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(), #' similarly to R package survival's predict. #' +#' @title predict #' @param object A fitted AFT survival regression model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 6b53517873a72..77dddad88e4e5 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,12 +19,11 @@ setOldClass("jobj") -#' crosstab -#' #' Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' +#' @title Statistic functions for SparkDataFrames #' @param col1 name of the first column. Distinct items will make the first item of each row. #' @param col2 name of the second column. Distinct items will make the column names of the output. #' @return a local R data.frame representing the contingency table. The first column of each row @@ -48,10 +47,9 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' cov -#' -#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. +#' Calculates the sample covariance of two numerical columns of a SparkDataFrame. #' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -73,12 +71,11 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) -#' corr -#' #' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. 
#' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -103,12 +100,12 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) -#' freqItems -#' #' Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in -#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. +#' \href{http://dx.doi.org/10.1145/762471.762473}{A simple algorithm for finding frequent elements +#' in streams and bags}, proposed by Karp, Schenker, and Papadimitriou. #' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame. #' @param cols A vector column names to search frequent items in. #' @param support (Optional) The minimum frequency for an item to be considered `frequent`. @@ -130,18 +127,17 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), collect(dataFrame(sct)) }) -#' approxQuantile -#' #' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. #' +#' @title Statistic functions for SparkDataFrames #' The result of this algorithm has the following deterministic bound: -#' If the SparkDataFrame has N elements and if we request the quantile at probability `p` up to -#' error `err`, then the algorithm will return a sample `x` from the SparkDataFrame so that the -#' *exact* rank of `x` is close to (p * N). More precisely, -#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). -#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed -#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 -#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. +#' If the SparkDataFrame has N elements and if we request the quantile at probability \strong{p} up +#' to error \strong{err}, then the algorithm will return a sample \strong{x} from the +#' SparkDataFrame so that the \strong{exact} rank of \strong{x} is close to \eqn{(p * N)}. More +#' precisely, floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). This method implements a +#' variation of the Greenwald-Khanna algorithm (with some speed optimizations). The algorithm was +#' first present in \href{http://dx.doi.org/10.1145/375663.375670}{Space-efficient Online +#' Computation of Quantile Summaries} by Greenwald and Khanna. #' #' @param x A SparkDataFrame. #' @param col The name of the numerical column. @@ -169,10 +165,9 @@ setMethod("approxQuantile", as.list(probabilities), relativeError) }) -#' sampleBy -#' #' Returns a stratified sample without replacement based on the fraction given on each stratum. #' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col column that defines strata #' @param fractions A named list giving sampling fraction for each stratum. 
If a stratum is From aa810839df59183092d03202876ba73f062a6819 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Tue, 31 May 2016 05:00:15 -0700 Subject: [PATCH 2/9] address comments --- R/pkg/R/DataFrame.R | 10 ++++------ R/pkg/R/stats.R | 20 +++++++++++++------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index c5d120b26dfb7..07005de17f71f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -630,9 +630,8 @@ setMethod("repartition", #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects #' @family SparkDataFrame functions -#' @rdname toJSON -#' @name toJSON -#' @export +#' @rdname tojson +#' @noRd #' @examples #'\dontrun{ #' sc <- sparkR.init() @@ -1082,9 +1081,8 @@ setMethod("first", #' @param x A SparkDataFrame #' #' @family SparkDataFrame functions -#' @rdname toRDD -#' @name toRDD -#' @export +#' @rdname tordd +#' @noRd #' @examples #'\dontrun{ #' sc <- sparkR.init() diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 77dddad88e4e5..f3c6afd3e6570 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,11 +19,12 @@ setOldClass("jobj") +#' crosstab +#' #' Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' -#' @title Statistic functions for SparkDataFrames #' @param col1 name of the first column. Distinct items will make the first item of each row. #' @param col2 name of the second column. Distinct items will make the column names of the output. #' @return a local R data.frame representing the contingency table. The first column of each row @@ -47,9 +48,10 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' Calculates the sample covariance of two numerical columns of a SparkDataFrame. +#' cov +#' +#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -71,11 +73,12 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) +#' corr +#' #' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -100,12 +103,13 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) +#' freqItems +#' #' Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in #' \href{http://dx.doi.org/10.1145/762471.762473}{A simple algorithm for finding frequent elements #' in streams and bags}, proposed by Karp, Schenker, and Papadimitriou. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame. #' @param cols A vector column names to search frequent items in. #' @param support (Optional) The minimum frequency for an item to be considered `frequent`. @@ -127,9 +131,10 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), collect(dataFrame(sct)) }) +#' approxQuantile +#' #' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. 
#' -#' @title Statistic functions for SparkDataFrames #' The result of this algorithm has the following deterministic bound: #' If the SparkDataFrame has N elements and if we request the quantile at probability \strong{p} up #' to error \strong{err}, then the algorithm will return a sample \strong{x} from the @@ -165,9 +170,10 @@ setMethod("approxQuantile", as.list(probabilities), relativeError) }) +#' sampleBy +#' #' Returns a stratified sample without replacement based on the fraction given on each stratum. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col column that defines strata #' @param fractions A named list giving sampling fraction for each stratum. If a stratum is From 8ab88d7a2ac5a8565ed8311135fac0a5b7e27e17 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Thu, 2 Jun 2016 15:58:21 -0700 Subject: [PATCH 3/9] revert changes in R/pkg/R/stats.R - this changes might happen in #13109 --- R/pkg/R/stats.R | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index f3c6afd3e6570..6b53517873a72 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -107,8 +107,7 @@ setMethod("corr", #' #' Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in -#' \href{http://dx.doi.org/10.1145/762471.762473}{A simple algorithm for finding frequent elements -#' in streams and bags}, proposed by Karp, Schenker, and Papadimitriou. +#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. #' #' @param x A SparkDataFrame. #' @param cols A vector column names to search frequent items in. @@ -136,13 +135,13 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. #' #' The result of this algorithm has the following deterministic bound: -#' If the SparkDataFrame has N elements and if we request the quantile at probability \strong{p} up -#' to error \strong{err}, then the algorithm will return a sample \strong{x} from the -#' SparkDataFrame so that the \strong{exact} rank of \strong{x} is close to \eqn{(p * N)}. More -#' precisely, floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). This method implements a -#' variation of the Greenwald-Khanna algorithm (with some speed optimizations). The algorithm was -#' first present in \href{http://dx.doi.org/10.1145/375663.375670}{Space-efficient Online -#' Computation of Quantile Summaries} by Greenwald and Khanna. +#' If the SparkDataFrame has N elements and if we request the quantile at probability `p` up to +#' error `err`, then the algorithm will return a sample `x` from the SparkDataFrame so that the +#' *exact* rank of `x` is close to (p * N). More precisely, +#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). +#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed +#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 +#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. #' #' @param x A SparkDataFrame. #' @param col The name of the numerical column. 
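The statistic functions documented above in stats.R are all called as plain functions on a SparkDataFrame. A minimal sketch of typical usage, assuming a SparkR session initialized in the style of this code base (sparkR.init / sparkRSQL.init) and R's built-in faithful data set; the column choices are illustrative only:

    sc <- sparkR.init()
    sqlContext <- sparkRSQL.init(sc)
    # Build a SparkDataFrame from a local R data.frame
    df <- createDataFrame(sqlContext, faithful)
    # Sample covariance and Pearson correlation of two numerical columns
    cov(df, "eruptions", "waiting")
    corr(df, "eruptions", "waiting", method = "pearson")
    # Approximate quantiles of a numerical column; the last argument is the
    # relative error `err` from the deterministic bound described above
    quantiles <- approxQuantile(df, "waiting", c(0.25, 0.5, 0.75), 0.0)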
From 8a89ad2aa63e6d513154bf65ae3fc4a22b059d56 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 5 Jun 2016 22:34:12 -0700 Subject: [PATCH 4/9] address comment and more changes --- R/pkg/R/DataFrame.R | 17 +++++++---------- R/pkg/R/context.R | 3 ++- R/pkg/R/functions.R | 10 +++++++--- R/pkg/R/utils.R | 2 ++ 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 07005de17f71f..ddb0f4500149f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -629,8 +629,6 @@ setMethod("repartition", #' #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects -#' @family SparkDataFrame functions -#' @rdname tojson #' @noRd #' @examples #'\dontrun{ @@ -648,7 +646,7 @@ setMethod("toJSON", RDD(jrdd, serializedMode = "string") }) -#' write.json +#' Save the contents of DataFrame as a JSON file #' #' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). @@ -675,7 +673,7 @@ setMethod("write.json", invisible(callJMethod(write, "json", path)) }) -#' write.parquet +#' Save the contents of DataFrame as a Parquet file, preserving the schema. #' #' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out #' with this method can be read back in as a SparkDataFrame using read.parquet(). @@ -713,7 +711,7 @@ setMethod("saveAsParquetFile", write.parquet(x, path) }) -#' write.text +#' Save the content of DataFrame in a text file at the specified path. #' #' Saves the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". @@ -1080,8 +1078,6 @@ setMethod("first", #' #' @param x A SparkDataFrame #' -#' @family SparkDataFrame functions -#' @rdname tordd #' @noRd #' @examples #'\dontrun{ @@ -2187,7 +2183,7 @@ setMethod("except", dataFrame(excepted) }) -#' write.df +#' Save the contents of DataFrame to a data source. #' #' Save the contents of the SparkDataFrame to a data source. The data source is specified by the #' `source` and a set of options (...). If `source` is not specified, the default data source @@ -2524,13 +2520,14 @@ setMethod("attach", attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) +#' Evaluate an expression in an environment constructed from DataFrame +#' #' Evaluate a R expression in an environment constructed from a SparkDataFrame #' with() allows access to columns of a SparkDataFrame by simply referring to #' their name. It appends every column of a SparkDataFrame into a new #' environment. Then, the given expression is evaluated in this new #' environment. #' -#' @title with #' @rdname with #' @family SparkDataFrame functions #' @title Evaluate a R expression in an environment constructed from a SparkDataFrame @@ -2784,7 +2781,7 @@ setMethod("histogram", return(histStats) }) -#' write.jdbc +#' Save the content of DataFrame to an external database table via JDBC. #' #' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC #' database connection properties can be set (...) 
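The save modes listed above for write.df and write.jdbc behave the same way whichever writer is used. A minimal sketch, assuming a SparkDataFrame df already exists; the output path, JDBC URL, table name and credentials are placeholders:

    # Write to a file-based data source; "overwrite" replaces existing data at the path
    write.df(df, path = "/tmp/people_parquet", source = "parquet", mode = "overwrite")
    # Append to an external database table via JDBC
    write.jdbc(df, "jdbc:postgresql://localhost/test", "people",
               mode = "append", user = "username", password = "password")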
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 38a4e21acbc04..0ab1ac14292b6 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -261,7 +261,8 @@ setCheckpointDir <- function(sc, dirName) { #' @export #' @examples #'\dontrun{ -#' doubled <- spark.lapply(1:10, function(x){2 * x}) +#' sc <- sparkR.init() +#' doubled <- spark.lapply(sc, 1:10, function(x){2 * x}) #'} spark.lapply <- function(sc, list, func) { rdd <- parallelize(sc, list, length(list)) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a779127b379a0..a1c091227ffc2 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -238,9 +238,9 @@ setMethod("ceil", column(jc) }) -#' Though scala functions has "col" function, we don't expose it in SparkR -#' because we don't want to conflict with the "col" function in the R base -#' package and we also have "column" function exported which is an alias of "col". +#' @rdname col +#' @name column +#' @export col <- function(x) { column(callJStatic("org.apache.spark.sql.functions", "col", x)) } @@ -249,6 +249,10 @@ col <- function(x) { #' #' Returns a Column based on the given column name. #' +#' Though scala functions has "col" function, we don't expose it in SparkR +#' because we don't want to conflict with the "col" function in the R base +#' package and we also have "column" function exported which is an alias of "col". +#' #' @rdname col #' @name column #' @family normal_funcs diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 12e4f4f1ae8bb..b1b8adaa66a25 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -110,9 +110,11 @@ isRDD <- function(name, env) { #' @return the hash code as an integer #' @export #' @examples +#'\dontrun{ #' hashCode(1L) # 1 #' hashCode(1.0) # 1072693248 #' hashCode("1") # 49 +#'} hashCode <- function(key) { if (class(key) == "integer") { as.integer(key[[1]]) From 9629184874b4bb7d9f204e98bf0e12cd68f4ebee Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 12 Jun 2016 02:03:37 -0700 Subject: [PATCH 5/9] address comments --- R/pkg/R/DataFrame.R | 12 ++++++------ R/pkg/R/functions.R | 13 +++---------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ddb0f4500149f..faa15de6824b3 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -646,7 +646,7 @@ setMethod("toJSON", RDD(jrdd, serializedMode = "string") }) -#' Save the contents of DataFrame as a JSON file +#' Save the contents of SparkDataFrame as a JSON file #' #' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). @@ -673,7 +673,7 @@ setMethod("write.json", invisible(callJMethod(write, "json", path)) }) -#' Save the contents of DataFrame as a Parquet file, preserving the schema. +#' Save the contents of SparkDataFrame as a Parquet file, preserving the schema. #' #' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out #' with this method can be read back in as a SparkDataFrame using read.parquet(). @@ -711,7 +711,7 @@ setMethod("saveAsParquetFile", write.parquet(x, path) }) -#' Save the content of DataFrame in a text file at the specified path. +#' Save the content of SparkDataFrame in a text file at the specified path. #' #' Saves the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". 
@@ -2183,7 +2183,7 @@ setMethod("except", dataFrame(excepted) }) -#' Save the contents of DataFrame to a data source. +#' Save the contents of SparkDataFrame to a data source. #' #' Save the contents of the SparkDataFrame to a data source. The data source is specified by the #' `source` and a set of options (...). If `source` is not specified, the default data source @@ -2520,7 +2520,7 @@ setMethod("attach", attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) -#' Evaluate an expression in an environment constructed from DataFrame +#' Evaluate an expression in an environment constructed from SparkDataFrame #' #' Evaluate a R expression in an environment constructed from a SparkDataFrame #' with() allows access to columns of a SparkDataFrame by simply referring to @@ -2781,7 +2781,7 @@ setMethod("histogram", return(histStats) }) -#' Save the content of DataFrame to an external database table via JDBC. +#' Save the content of SparkDataFrame to an external database table via JDBC. #' #' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC #' database connection properties can be set (...) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a1c091227ffc2..b6058ccce0d21 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -238,9 +238,9 @@ setMethod("ceil", column(jc) }) -#' @rdname col -#' @name column -#' @export +#' Though scala functions has "col" function, we don't expose it in SparkR +#' because we don't want to conflict with the "col" function in the R base +#' package and we also have "column" function exported which is an alias of "col". col <- function(x) { column(callJStatic("org.apache.spark.sql.functions", "col", x)) } @@ -249,14 +249,7 @@ col <- function(x) { #' #' Returns a Column based on the given column name. #' -#' Though scala functions has "col" function, we don't expose it in SparkR -#' because we don't want to conflict with the "col" function in the R base -#' package and we also have "column" function exported which is an alias of "col". -#' -#' @rdname col -#' @name column #' @family normal_funcs -#' @export #' @examples \dontrun{column(df)} setMethod("column", signature(x = "character"), From 560ff0e3f448827f0a72f4e65f24e1d1afa24440 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 12 Jun 2016 03:00:15 -0700 Subject: [PATCH 6/9] use first line as the title convention --- R/pkg/R/DataFrame.R | 32 ++++++++++++++++++++------------ R/pkg/R/RDD.R | 6 ++++-- R/pkg/R/WindowSpec.R | 7 ++++--- R/pkg/R/broadcast.R | 8 +++++--- R/pkg/R/column.R | 6 ++++-- R/pkg/R/context.R | 8 +++----- R/pkg/R/group.R | 6 ++++-- R/pkg/R/mllib.R | 24 ++++++++++++++++-------- 8 files changed, 60 insertions(+), 37 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index faa15de6824b3..96628a491c669 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -23,9 +23,11 @@ NULL setOldClass("jobj") setOldClass("structType") -#' @title S4 class that represents a SparkDataFrame -#' @description DataFrames can be created using functions like \link{createDataFrame}, -#' \link{read.json}, \link{table} etc. +#' S4 class that represents a SparkDataFrame +#' +#' DataFrames can be created using functions like \link{createDataFrame}, +#' \link{read.json}, \link{table} etc. 
+#' #' @family SparkDataFrame functions #' @rdname SparkDataFrame #' @docType class @@ -1940,8 +1942,9 @@ setMethod("join", dataFrame(sdf) }) +#' Merges two data frames +#' #' @name merge -#' @title Merges two data frames #' @param x the first data frame to be joined #' @param y the second data frame to be joined #' @param by a character vector specifying the join columns. If by is not @@ -2111,8 +2114,9 @@ setMethod("unionAll", dataFrame(unioned) }) -#' @title Union two or more SparkDataFrames -#' @description Returns a new SparkDataFrame containing rows of all parameters. +#' Union two or more SparkDataFrames +#' +#' Returns a new SparkDataFrame containing rows of all parameters. #' #' @rdname rbind #' @name rbind @@ -2474,11 +2478,12 @@ setMethod("fillna", dataFrame(sdf) }) +#' Download data from a SparkDataFrame into a data.frame +#' #' This function downloads the contents of a SparkDataFrame into an R's data.frame. #' Since data.frames are held in memory, ensure that you have enough memory #' in your system to accommodate the contents. #' -#' @title Download data from a SparkDataFrame into a data.frame #' @param x a SparkDataFrame #' @return a data.frame #' @family SparkDataFrame functions @@ -2494,13 +2499,14 @@ setMethod("as.data.frame", as.data.frame(collect(x), row.names, optional, ...) }) +#' Attach SparkDataFrame to R search path +#' #' The specified SparkDataFrame is attached to the R search path. This means that #' the SparkDataFrame is searched by R when evaluating a variable, so columns in #' the SparkDataFrame can be accessed by simply giving their names. #' #' @family SparkDataFrame functions #' @rdname attach -#' @title Attach SparkDataFrame to R search path #' @param what (SparkDataFrame) The SparkDataFrame to attach #' @param pos (integer) Specify position in search() where to attach. #' @param name (character) Name to use for the attached SparkDataFrame. Names @@ -2520,7 +2526,7 @@ setMethod("attach", attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) -#' Evaluate an expression in an environment constructed from SparkDataFrame +#' Evaluate a R expression in an environment constructed from a SparkDataFrame #' #' Evaluate a R expression in an environment constructed from a SparkDataFrame #' with() allows access to columns of a SparkDataFrame by simply referring to @@ -2530,7 +2536,6 @@ setMethod("attach", #' #' @rdname with #' @family SparkDataFrame functions -#' @title Evaluate a R expression in an environment constructed from a SparkDataFrame #' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment. #' @param expr (expression) Expression to evaluate. #' @param ... arguments to be passed to future methods. @@ -2546,10 +2551,12 @@ setMethod("with", eval(substitute(expr), envir = newEnv, enclos = newEnv) }) +#' Compactly display the structure of a dataset +#' #' Display the structure of a SparkDataFrame, including column names, column types, as well as a #' a small sample of rows. +#' #' @name str -#' @title Compactly display the structure of a dataset #' @rdname str #' @family SparkDataFrame functions #' @param object a SparkDataFrame @@ -2662,10 +2669,11 @@ setMethod("drop", base::drop(x) }) +#' Histogram +#' #' This function computes a histogram for a given SparkR Column. #' #' @name histogram -#' @title Histogram #' @param nbins the number of bins (optional). Default value is 10. #' @param df the SparkDataFrame containing the Column to build the histogram from. 
#' @param colname the name of the column to build the histogram from. diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index fded00b40a08c..72a805256523e 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -19,9 +19,11 @@ setOldClass("jobj") -#' @title S4 class that represents an RDD -#' @description RDD can be created using functions like +#' S4 class that represents an RDD +#' +#' RDD can be created using functions like #' \code{parallelize}, \code{textFile} etc. +#' #' @rdname RDD #' @seealso parallelize, textFile #' @slot env An R environment that stores bookkeeping states of the RDD diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 581176a6c0918..d8405420d0a49 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -20,9 +20,10 @@ #' @include generics.R jobj.R column.R NULL -#' @title S4 class that represents a WindowSpec -#' @description WindowSpec can be created by using window.partitionBy() -#' or window.orderBy() +#' S4 class that represents a WindowSpec +#' +#' WindowSpec can be created by using window.partitionBy() or window.orderBy() +#' #' @rdname WindowSpec #' @seealso \link{window.partitionBy}, \link{window.orderBy} #' diff --git a/R/pkg/R/broadcast.R b/R/pkg/R/broadcast.R index 38f0eed95e065..398dffc4ab1b4 100644 --- a/R/pkg/R/broadcast.R +++ b/R/pkg/R/broadcast.R @@ -23,9 +23,11 @@ .broadcastValues <- new.env() .broadcastIdToName <- new.env() -# @title S4 class that represents a Broadcast variable -# @description Broadcast variables can be created using the broadcast -# function from a \code{SparkContext}. +# S4 class that represents a Broadcast variable +# +# Broadcast variables can be created using the broadcast +# function from a \code{SparkContext}. +# # @rdname broadcast-class # @seealso broadcast # diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 1c195c10633e3..fa5e743ae48b5 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -22,8 +22,10 @@ NULL setOldClass("jobj") -#' @title S4 class that represents a SparkDataFrame column -#' @description The column class supports unary, binary operations on SparkDataFrame columns +#' S4 class that represents a SparkDataFrame column +#' +#' The column class supports unary, binary operations on SparkDataFrame columns +#' #' @rdname column #' #' @slot jc reference to JVM SparkDataFrame column diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 0ab1ac14292b6..5c886030ff5c5 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -173,9 +173,8 @@ includePackage <- function(sc, pkg) { .sparkREnv$.packages <- packages } -#' @title Broadcast a variable to all workers +#' Broadcast a variable to all workers #' -#' @description #' Broadcast a read-only variable to the cluster, returning a \code{Broadcast} #' object for reading it in distributed functions. #' @@ -207,7 +206,7 @@ broadcast <- function(sc, object) { Broadcast(id, object, jBroadcast, objName) } -#' @title Set the checkpoint directory +#' Set the checkpoint directory #' #' Set the directory under which RDDs are going to be checkpointed. The #' directory must be a HDFS path if running on a cluster. @@ -226,9 +225,8 @@ setCheckpointDir <- function(sc, dirName) { invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName)))) } -#' @title Run a function over a list of elements, distributing the computations with Spark. +#' Run a function over a list of elements, distributing the computations with Spark. #' -#' @description #' Applies a function in a manner that is similar to doParallel or lapply to elements of a list. 
#' The computations are distributed using Spark. It is conceptually the same as the following code: #' lapply(list, func) diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 08f4a490c883e..493f0f1b60f26 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -22,8 +22,10 @@ NULL setOldClass("jobj") -#' @title S4 class that represents a GroupedData -#' @description GroupedDatas can be created using groupBy() on a SparkDataFrame +#' S4 class that represents a GroupedData +#' +#' GroupedDatas can be created using groupBy() on a SparkDataFrame +#' #' @rdname GroupedData #' @seealso groupBy #' diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 380e942c12925..440a06646421e 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -25,22 +25,26 @@ # - a set of methods that reflect the arguments of the other languages supported by Spark. These # methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc. -#' @title S4 class that represents a generalized linear model +#' S4 class that represents a generalized linear model +#' #' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper #' @export setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) -#' @title S4 class that represents a NaiveBayesModel +#' S4 class that represents a NaiveBayesModel +#' #' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper #' @export setClass("NaiveBayesModel", representation(jobj = "jobj")) -#' @title S4 class that represents a AFTSurvivalRegressionModel +#' S4 class that represents a AFTSurvivalRegressionModel +#' #' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper #' @export setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) -#' @title S4 class that represents a KMeansModel +#' S4 class that represents a KMeansModel +#' #' @param jobj a Java object reference to the backing Scala KMeansModel #' @export setClass("KMeansModel", representation(jobj = "jobj")) @@ -197,10 +201,11 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { invisible(x) } +#' predict +#' #' Makes predictions from a generalized linear model produced by glm() or spark.glm(), #' similarly to R's predict(). #' -#' @title predict #' @param object A fitted generalized linear model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -217,10 +222,11 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) +#' predict +#' #' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), #' similarly to R package e1071's predict. #' -#' @title predict #' @param object A fitted naive Bayes model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -355,9 +361,10 @@ setMethod("summary", signature(object = "KMeansModel"), cluster = cluster, is.loaded = is.loaded)) }) +#' predict +#' #' Makes predictions from a k-means model or a model produced by spark.kmeans(). 
#' -#' @title predict #' @param object A fitted k-means model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -579,10 +586,11 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), return(list(coefficients = coefficients)) }) +#' predict +#' #' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(), #' similarly to R package survival's predict. #' -#' @title predict #' @param object A fitted AFT survival regression model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" From 23fc1d737f598efb9031cc888e7eacc3ffbcbba1 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 12 Jun 2016 03:45:36 -0700 Subject: [PATCH 7/9] revert --- R/pkg/R/DataFrame.R | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 96628a491c669..edc0832c5bcae 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -715,7 +715,7 @@ setMethod("saveAsParquetFile", #' Save the content of SparkDataFrame in a text file at the specified path. #' -#' Saves the content of the SparkDataFrame in a text file at the specified path. +#' Save the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". #' Each row becomes a new line in the output file. #' @@ -820,8 +820,6 @@ setMethod("sample_frac", sample(x, withReplacement, fraction, seed) }) -#' nrow -#' #' Returns the number of rows in a SparkDataFrame #' #' @param x A SparkDataFrame @@ -852,8 +850,6 @@ setMethod("nrow", count(x) }) -#' ncol -#' #' Returns the number of columns in a SparkDataFrame #' #' @param x a SparkDataFrame @@ -876,7 +872,7 @@ setMethod("ncol", length(columns(x)) }) -#' dim +#' Returns the dimensions of SparkDataFrame #' #' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame @@ -899,8 +895,6 @@ setMethod("dim", c(count(x), ncol(x)) }) -#' collect -#' #' Collects all the elements of a SparkDataFrame and coerces them into an R data.frame. #' #' @param x A SparkDataFrame @@ -998,8 +992,6 @@ setMethod("limit", dataFrame(res) }) -#' take -#' #' Take the first NUM rows of a SparkDataFrame and return a the results as a data.frame #' #' @family SparkDataFrame functions @@ -1050,8 +1042,6 @@ setMethod("head", take(x, num) }) -#' first -#' #' Return the first row of a SparkDataFrame #' #' @param x A SparkDataFrame @@ -2058,8 +2048,6 @@ setMethod("merge", joinRes }) -#' generateAliasesForIntersectedCols -#' #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. #' @@ -2189,9 +2177,9 @@ setMethod("except", #' Save the contents of SparkDataFrame to a data source. #' -#' Save the contents of the SparkDataFrame to a data source. The data source is specified by the -#' `source` and a set of options (...). If `source` is not specified, the default data source -#' configured by spark.sql.sources.default will be used. +#' The data source is specified by the `source` and a set of options (...). +#' If `source` is not specified, the default data source configured by +#' spark.sql.sources.default will be used. 
#' #' Additionally, mode is used to specify the behavior of the save operation when data already #' exists in the data source. There are four modes: @@ -2250,8 +2238,6 @@ setMethod("saveDF", write.df(df, path, source, mode, ...) }) -#' saveAsTable -#' #' Save the contents of the SparkDataFrame to a data source as a table #' #' The data source is specified by the `source` and a set of options (...). @@ -2791,7 +2777,7 @@ setMethod("histogram", #' Save the content of SparkDataFrame to an external database table via JDBC. #' -#' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC +#' Save the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC #' database connection properties can be set (...) #' #' Also, mode is used to specify the behavior of the save operation when From 2537b8ff114f9d33d0228d7aaeabafdb454609aa Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Mon, 13 Jun 2016 23:55:27 -0700 Subject: [PATCH 8/9] address comment --- R/pkg/R/column.R | 2 -- R/pkg/R/functions.R | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index fa5e743ae48b5..cc2876ed94b7f 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -206,8 +206,6 @@ setMethod("between", signature(x = "Column"), } }) -#' cast -#' #' Casts the column to a different data type. #' #' @rdname cast diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index b6058ccce0d21..a779127b379a0 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -249,7 +249,10 @@ col <- function(x) { #' #' Returns a Column based on the given column name. #' +#' @rdname col +#' @name column #' @family normal_funcs +#' @export #' @examples \dontrun{column(df)} setMethod("column", signature(x = "character"), From 84bf2aadba4d3cf92ca4803cfb608283e8237ab1 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Wed, 15 Jun 2016 15:40:15 -0700 Subject: [PATCH 9/9] change more titles --- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/mllib.R | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index edc0832c5bcae..ab4be171fb0fc 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2655,7 +2655,7 @@ setMethod("drop", base::drop(x) }) -#' Histogram +#' Compute histogram statistics for given column #' #' This function computes a histogram for a given SparkR Column. #' diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 440a06646421e..ba2eee2fca76a 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -201,7 +201,7 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { invisible(x) } -#' predict +#' Predicted values based on model #' #' Makes predictions from a generalized linear model produced by glm() or spark.glm(), #' similarly to R's predict(). @@ -222,7 +222,7 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' predict +#' Predicted values based on model #' #' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), #' similarly to R package e1071's predict. @@ -361,7 +361,7 @@ setMethod("summary", signature(object = "KMeansModel"), cluster = cluster, is.loaded = is.loaded)) }) -#' predict +#' Predicted values based on model #' #' Makes predictions from a k-means model or a model produced by spark.kmeans(). 
#' @@ -406,6 +406,8 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form return(new("NaiveBayesModel", jobj = jobj)) }) +#' Save fitted MLlib model to the input path +#' #' Save the Bernoulli naive Bayes model to the input path. #' #' @param object A fitted Bernoulli naive Bayes model @@ -432,6 +434,8 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"), invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the AFT survival regression model to the input path. #' #' @param object A fitted AFT survival regression model @@ -457,6 +461,8 @@ setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "c invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the generalized linear model to the input path. #' #' @param object A fitted generalized linear model @@ -482,6 +488,8 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the k-means model to the input path. #' #' @param object A fitted k-means model @@ -586,7 +594,7 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), return(list(coefficients = coefficients)) }) -#' predict +#' Predicted values based on model #' #' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(), #' similarly to R package survival's predict.
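The predict and write.ml methods documented in mllib.R above follow the usual fit / predict / save workflow. A minimal sketch, assuming a SparkR session of this era and R's built-in iris data set (periods in column names are replaced with underscores during conversion); the model path is a placeholder:

    sc <- sparkR.init()
    sqlContext <- sparkRSQL.init(sc)
    df <- createDataFrame(sqlContext, iris)
    # Fit a Gaussian GLM (ordinary linear regression) with an R formula
    model <- spark.glm(df, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
    summary(model)
    # Predictions come back as a SparkDataFrame with a "prediction" column
    predictions <- predict(model, df)
    head(select(predictions, "Sepal_Length", "prediction"))
    # Persist the fitted model to a path
    write.ml(model, "/tmp/glm_model")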