From 789e1270f2ab809d9b1cc9f2a89dd1cc27b8067f Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sat, 28 May 2016 19:09:22 -0700 Subject: [PATCH 1/9] QA for non-MLlib changes --- R/pkg/R/DataFrame.R | 70 +++++++++++++++++++++++++++++---------------- R/pkg/R/RDD.R | 8 +++--- R/pkg/R/column.R | 2 ++ R/pkg/R/context.R | 30 ++++++++++--------- R/pkg/R/functions.R | 2 +- R/pkg/R/mllib.R | 18 +++++------- R/pkg/R/stats.R | 37 +++++++++++------------- 7 files changed, 92 insertions(+), 75 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0ff350d44d4b3..c5d120b26dfb7 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -630,8 +630,9 @@ setMethod("repartition", #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects #' @family SparkDataFrame functions -#' @rdname tojson -#' @noRd +#' @rdname toJSON +#' @name toJSON +#' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() @@ -852,6 +853,8 @@ setMethod("nrow", count(x) }) +#' ncol +#' #' Returns the number of columns in a SparkDataFrame #' #' @param x a SparkDataFrame @@ -874,6 +877,8 @@ setMethod("ncol", length(columns(x)) }) +#' dim +#' #' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame #' @@ -895,6 +900,8 @@ setMethod("dim", c(count(x), ncol(x)) }) +#' collect +#' #' Collects all the elements of a SparkDataFrame and coerces them into an R data.frame. #' #' @param x A SparkDataFrame @@ -992,6 +999,8 @@ setMethod("limit", dataFrame(res) }) +#' take +#' #' Take the first NUM rows of a SparkDataFrame and return a the results as a data.frame #' #' @family SparkDataFrame functions @@ -1042,6 +1051,8 @@ setMethod("head", take(x, num) }) +#' first +#' #' Return the first row of a SparkDataFrame #' #' @param x A SparkDataFrame @@ -1070,7 +1081,10 @@ setMethod("first", #' #' @param x A SparkDataFrame #' -#' @noRd +#' @family SparkDataFrame functions +#' @rdname toRDD +#' @name toRDD +#' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() @@ -2047,6 +2061,7 @@ setMethod("merge", joinRes }) +#' generateAliasesForIntersectedCols #' #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. @@ -2174,20 +2189,22 @@ setMethod("except", dataFrame(excepted) }) -#' Save the contents of the SparkDataFrame to a data source +#' write.df #' -#' The data source is specified by the `source` and a set of options (...). -#' If `source` is not specified, the default data source configured by -#' spark.sql.sources.default will be used. +#' Save the contents of the SparkDataFrame to a data source. The data source is specified by the +#' `source` and a set of options (...). If `source` is not specified, the default data source +#' configured by spark.sql.sources.default will be used. #' -#' Additionally, mode is used to specify the behavior of the save operation when -#' data already exists in the data source. There are four modes: \cr -#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this -#' SparkDataFrame. \cr -#' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the SparkDataFrame -#' and to not change the existing data. \cr +#' Additionally, mode is used to specify the behavior of the save operation when data already +#' exists in the data source. 
There are four modes: +#' \itemize{ +#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data. +#' \item overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. +#' \item error: An exception is expected to be thrown. +#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame +#' and to not change the existing data. +#' } #' #' @param df A SparkDataFrame #' @param path A name for the table @@ -2515,7 +2532,9 @@ setMethod("attach", #' environment. Then, the given expression is evaluated in this new #' environment. #' +#' @title with #' @rdname with +#' @family SparkDataFrame functions #' @title Evaluate a R expression in an environment constructed from a SparkDataFrame #' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment. #' @param expr (expression) Expression to evaluate. @@ -2767,18 +2786,21 @@ setMethod("histogram", return(histStats) }) -#' Saves the content of the SparkDataFrame to an external database table via JDBC +#' write.jdbc #' -#' Additional JDBC database connection properties can be set (...) +#' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC +#' database connection properties can be set (...) #' #' Also, mode is used to specify the behavior of the save operation when -#' data already exists in the data source. There are four modes: \cr -#' append: Contents of this SparkDataFrame are expected to be appended to existing data. \cr -#' overwrite: Existing data is expected to be overwritten by the contents of this -#' SparkDataFrame. \cr -#' error: An exception is expected to be thrown. \cr -#' ignore: The save operation is expected to not save the contents of the SparkDataFrame -#' and to not change the existing data. \cr +#' data already exists in the data source. There are four modes: +#' \itemize{ +#' \item append: Contents of this SparkDataFrame are expected to be appended to existing data. +#' \item overwrite: Existing data is expected to be overwritten by the contents of this +#' SparkDataFrame. +#' \item error: An exception is expected to be thrown. +#' \item ignore: The save operation is expected to not save the contents of the SparkDataFrame +#' and to not change the existing data. +#' } #' #' @param x A SparkDataFrame #' @param url JDBC database url of the form `jdbc:subprotocol:subname` diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index f1badf4364da0..fded00b40a08c 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -497,9 +497,9 @@ setMethod("map", lapply(X, FUN) }) -#' Flatten results after apply a function to all elements +#' Flatten results after applying a function to all elements #' -#' This function return a new RDD by first applying a function to all +#' This function returns a new RDD by first applying a function to all #' elements of this RDD, and then flattening the results. #' #' @param X The RDD to apply the transformation. @@ -713,7 +713,7 @@ setMethod("sumRDD", reduce(x, "+") }) -#' Applies a function to all elements in an RDD, and force evaluation. +#' Applies a function to all elements in an RDD, and forces evaluation. #' #' @param x The RDD to apply the function #' @param func The function to be applied. @@ -737,7 +737,7 @@ setMethod("foreach", invisible(collect(mapPartitions(x, partition.func))) }) -#' Applies a function to each partition in an RDD, and force evaluation. +#' Applies a function to each partition in an RDD, and forces evaluation. 
#' #' @examples #'\dontrun{ diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 873e8b1665a28..1c195c10633e3 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -204,6 +204,8 @@ setMethod("between", signature(x = "Column"), } }) +#' cast +#' #' Casts the column to a different data type. #' #' @rdname cast diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 44bca877fd45a..38a4e21acbc04 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -234,22 +234,24 @@ setCheckpointDir <- function(sc, dirName) { #' lapply(list, func) #' #' Known limitations: -#' - variable scoping and capture: compared to R's rich support for variable resolutions, the -# distributed nature of SparkR limits how variables are resolved at runtime. All the variables -# that are available through lexical scoping are embedded in the closure of the function and -# available as read-only variables within the function. The environment variables should be -# stored into temporary variables outside the function, and not directly accessed within the -# function. +#' \itemize{ +#' \item variable scoping and capture: compared to R's rich support for variable resolutions, +#' the distributed nature of SparkR limits how variables are resolved at runtime. All the +#' variables that are available through lexical scoping are embedded in the closure of the +#' function and available as read-only variables within the function. The environment variables +#' should be stored into temporary variables outside the function, and not directly accessed +#' within the function. #' -#' - loading external packages: In order to use a package, you need to load it inside the -#' closure. For example, if you rely on the MASS module, here is how you would use it: -#'\dontrun{ -#' train <- function(hyperparam) { -#' library(MASS) -#' lm.ridge(“y ~ x+z”, data, lambda=hyperparam) -#' model +#' \item loading external packages: In order to use a package, you need to load it inside the +#' closure. For example, if you rely on the MASS module, here is how you would use it: +#' \preformatted{ +#' train <- function(hyperparam) { +#' library(MASS) +#' lm.ridge(“y ~ x+z”, data, lambda=hyperparam) +#' model +#' } +#' } #' } -#'} #' #' @rdname spark.lapply #' @param sc Spark Context to use diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 2665d1d477802..a779127b379a0 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2185,7 +2185,7 @@ setMethod("from_unixtime", signature(x = "Column"), #' # 09:01:15-09:02:15... #' window(df$time, "1 minute", startTime = "15 seconds") #' -#' # Thirty second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... +#' # Thirty-second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ... #' window(df$time, "30 seconds", "10 seconds") #'} setMethod("window", signature(x = "Column"), diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index d4152b43b6f5f..380e942c12925 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -197,11 +197,10 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { invisible(x) } -#' Make predictions from a generalized linear model -#' #' Makes predictions from a generalized linear model produced by glm() or spark.glm(), #' similarly to R's predict(). 
#' +#' @title predict #' @param object A fitted generalized linear model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -218,11 +217,10 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' Make predictions from a naive Bayes model -#' -#' Makes predictions from a model produced by spark.naiveBayes(), +#' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), #' similarly to R package e1071's predict. #' +#' @title predict #' @param object A fitted naive Bayes model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -357,10 +355,9 @@ setMethod("summary", signature(object = "KMeansModel"), cluster = cluster, is.loaded = is.loaded)) }) -#' Make predictions from a k-means model -#' -#' Make predictions from a model produced by spark.kmeans(). +#' Makes predictions from a k-means model or a model produced by spark.kmeans(). #' +#' @title predict #' @param object A fitted k-means model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -582,11 +579,10 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), return(list(coefficients = coefficients)) }) -#' Make predictions from an AFT survival regression model -#' -#' Make predictions from a model produced by spark.survreg(), +#' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(), #' similarly to R package survival's predict. #' +#' @title predict #' @param object A fitted AFT survival regression model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 6b53517873a72..77dddad88e4e5 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,12 +19,11 @@ setOldClass("jobj") -#' crosstab -#' #' Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' +#' @title Statistic functions for SparkDataFrames #' @param col1 name of the first column. Distinct items will make the first item of each row. #' @param col2 name of the second column. Distinct items will make the column names of the output. #' @return a local R data.frame representing the contingency table. The first column of each row @@ -48,10 +47,9 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' cov -#' -#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. +#' Calculates the sample covariance of two numerical columns of a SparkDataFrame. #' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -73,12 +71,11 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) -#' corr -#' #' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. 
#' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -103,12 +100,12 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) -#' freqItems -#' #' Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in -#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. +#' \href{http://dx.doi.org/10.1145/762471.762473}{A simple algorithm for finding frequent elements +#' in streams and bags}, proposed by Karp, Schenker, and Papadimitriou. #' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame. #' @param cols A vector column names to search frequent items in. #' @param support (Optional) The minimum frequency for an item to be considered `frequent`. @@ -130,18 +127,17 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), collect(dataFrame(sct)) }) -#' approxQuantile -#' #' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. #' +#' @title Statistic functions for SparkDataFrames #' The result of this algorithm has the following deterministic bound: -#' If the SparkDataFrame has N elements and if we request the quantile at probability `p` up to -#' error `err`, then the algorithm will return a sample `x` from the SparkDataFrame so that the -#' *exact* rank of `x` is close to (p * N). More precisely, -#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). -#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed -#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 -#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. +#' If the SparkDataFrame has N elements and if we request the quantile at probability \strong{p} up +#' to error \strong{err}, then the algorithm will return a sample \strong{x} from the +#' SparkDataFrame so that the \strong{exact} rank of \strong{x} is close to \eqn{(p * N)}. More +#' precisely, floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). This method implements a +#' variation of the Greenwald-Khanna algorithm (with some speed optimizations). The algorithm was +#' first present in \href{http://dx.doi.org/10.1145/375663.375670}{Space-efficient Online +#' Computation of Quantile Summaries} by Greenwald and Khanna. #' #' @param x A SparkDataFrame. #' @param col The name of the numerical column. @@ -169,10 +165,9 @@ setMethod("approxQuantile", as.list(probabilities), relativeError) }) -#' sampleBy -#' #' Returns a stratified sample without replacement based on the fraction given on each stratum. #' +#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col column that defines strata #' @param fractions A named list giving sampling fraction for each stratum. 
If a stratum is From aa810839df59183092d03202876ba73f062a6819 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Tue, 31 May 2016 05:00:15 -0700 Subject: [PATCH 2/9] address comments --- R/pkg/R/DataFrame.R | 10 ++++------ R/pkg/R/stats.R | 20 +++++++++++++------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index c5d120b26dfb7..07005de17f71f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -630,9 +630,8 @@ setMethod("repartition", #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects #' @family SparkDataFrame functions -#' @rdname toJSON -#' @name toJSON -#' @export +#' @rdname tojson +#' @noRd #' @examples #'\dontrun{ #' sc <- sparkR.init() @@ -1082,9 +1081,8 @@ setMethod("first", #' @param x A SparkDataFrame #' #' @family SparkDataFrame functions -#' @rdname toRDD -#' @name toRDD -#' @export +#' @rdname tordd +#' @noRd #' @examples #'\dontrun{ #' sc <- sparkR.init() diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 77dddad88e4e5..f3c6afd3e6570 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,11 +19,12 @@ setOldClass("jobj") +#' crosstab +#' #' Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' -#' @title Statistic functions for SparkDataFrames #' @param col1 name of the first column. Distinct items will make the first item of each row. #' @param col2 name of the second column. Distinct items will make the column names of the output. #' @return a local R data.frame representing the contingency table. The first column of each row @@ -47,9 +48,10 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' Calculates the sample covariance of two numerical columns of a SparkDataFrame. +#' cov +#' +#' Calculate the sample covariance of two numerical columns of a SparkDataFrame. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -71,11 +73,12 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) +#' corr +#' #' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col1 the name of the first column #' @param col2 the name of the second column @@ -100,12 +103,13 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) +#' freqItems +#' #' Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in #' \href{http://dx.doi.org/10.1145/762471.762473}{A simple algorithm for finding frequent elements #' in streams and bags}, proposed by Karp, Schenker, and Papadimitriou. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame. #' @param cols A vector column names to search frequent items in. #' @param support (Optional) The minimum frequency for an item to be considered `frequent`. @@ -127,9 +131,10 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), collect(dataFrame(sct)) }) +#' approxQuantile +#' #' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. 
#' -#' @title Statistic functions for SparkDataFrames #' The result of this algorithm has the following deterministic bound: #' If the SparkDataFrame has N elements and if we request the quantile at probability \strong{p} up #' to error \strong{err}, then the algorithm will return a sample \strong{x} from the @@ -165,9 +170,10 @@ setMethod("approxQuantile", as.list(probabilities), relativeError) }) +#' sampleBy +#' #' Returns a stratified sample without replacement based on the fraction given on each stratum. #' -#' @title Statistic functions for SparkDataFrames #' @param x A SparkDataFrame #' @param col column that defines strata #' @param fractions A named list giving sampling fraction for each stratum. If a stratum is From 8ab88d7a2ac5a8565ed8311135fac0a5b7e27e17 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Thu, 2 Jun 2016 15:58:21 -0700 Subject: [PATCH 3/9] revert changes in R/pkg/R/stats.R - this changes might happen in #13109 --- R/pkg/R/stats.R | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index f3c6afd3e6570..6b53517873a72 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -107,8 +107,7 @@ setMethod("corr", #' #' Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in -#' \href{http://dx.doi.org/10.1145/762471.762473}{A simple algorithm for finding frequent elements -#' in streams and bags}, proposed by Karp, Schenker, and Papadimitriou. +#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. #' #' @param x A SparkDataFrame. #' @param cols A vector column names to search frequent items in. @@ -136,13 +135,13 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' Calculates the approximate quantiles of a numerical column of a SparkDataFrame. #' #' The result of this algorithm has the following deterministic bound: -#' If the SparkDataFrame has N elements and if we request the quantile at probability \strong{p} up -#' to error \strong{err}, then the algorithm will return a sample \strong{x} from the -#' SparkDataFrame so that the \strong{exact} rank of \strong{x} is close to \eqn{(p * N)}. More -#' precisely, floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). This method implements a -#' variation of the Greenwald-Khanna algorithm (with some speed optimizations). The algorithm was -#' first present in \href{http://dx.doi.org/10.1145/375663.375670}{Space-efficient Online -#' Computation of Quantile Summaries} by Greenwald and Khanna. +#' If the SparkDataFrame has N elements and if we request the quantile at probability `p` up to +#' error `err`, then the algorithm will return a sample `x` from the SparkDataFrame so that the +#' *exact* rank of `x` is close to (p * N). More precisely, +#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N). +#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed +#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670 +#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna. #' #' @param x A SparkDataFrame. #' @param col The name of the numerical column. 
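The statistic functions documented above in stats.R are all called as plain functions on a SparkDataFrame. A minimal sketch of typical usage, assuming a SparkR session initialized in the style of this code base (sparkR.init / sparkRSQL.init) and R's built-in faithful data set; the column choices are illustrative only:

    sc <- sparkR.init()
    sqlContext <- sparkRSQL.init(sc)
    # Build a SparkDataFrame from a local R data.frame
    df <- createDataFrame(sqlContext, faithful)
    # Sample covariance and Pearson correlation of two numerical columns
    cov(df, "eruptions", "waiting")
    corr(df, "eruptions", "waiting", method = "pearson")
    # Approximate quantiles of a numerical column; the last argument is the
    # relative error `err` from the deterministic bound described above
    quantiles <- approxQuantile(df, "waiting", c(0.25, 0.5, 0.75), 0.0)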
From 8a89ad2aa63e6d513154bf65ae3fc4a22b059d56 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 5 Jun 2016 22:34:12 -0700 Subject: [PATCH 4/9] address comment and more changes --- R/pkg/R/DataFrame.R | 17 +++++++---------- R/pkg/R/context.R | 3 ++- R/pkg/R/functions.R | 10 +++++++--- R/pkg/R/utils.R | 2 ++ 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 07005de17f71f..ddb0f4500149f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -629,8 +629,6 @@ setMethod("repartition", #' #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects -#' @family SparkDataFrame functions -#' @rdname tojson #' @noRd #' @examples #'\dontrun{ @@ -648,7 +646,7 @@ setMethod("toJSON", RDD(jrdd, serializedMode = "string") }) -#' write.json +#' Save the contents of DataFrame as a JSON file #' #' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). @@ -675,7 +673,7 @@ setMethod("write.json", invisible(callJMethod(write, "json", path)) }) -#' write.parquet +#' Save the contents of DataFrame as a Parquet file, preserving the schema. #' #' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out #' with this method can be read back in as a SparkDataFrame using read.parquet(). @@ -713,7 +711,7 @@ setMethod("saveAsParquetFile", write.parquet(x, path) }) -#' write.text +#' Save the content of DataFrame in a text file at the specified path. #' #' Saves the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". @@ -1080,8 +1078,6 @@ setMethod("first", #' #' @param x A SparkDataFrame #' -#' @family SparkDataFrame functions -#' @rdname tordd #' @noRd #' @examples #'\dontrun{ @@ -2187,7 +2183,7 @@ setMethod("except", dataFrame(excepted) }) -#' write.df +#' Save the contents of DataFrame to a data source. #' #' Save the contents of the SparkDataFrame to a data source. The data source is specified by the #' `source` and a set of options (...). If `source` is not specified, the default data source @@ -2524,13 +2520,14 @@ setMethod("attach", attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) +#' Evaluate an expression in an environment constructed from DataFrame +#' #' Evaluate a R expression in an environment constructed from a SparkDataFrame #' with() allows access to columns of a SparkDataFrame by simply referring to #' their name. It appends every column of a SparkDataFrame into a new #' environment. Then, the given expression is evaluated in this new #' environment. #' -#' @title with #' @rdname with #' @family SparkDataFrame functions #' @title Evaluate a R expression in an environment constructed from a SparkDataFrame @@ -2784,7 +2781,7 @@ setMethod("histogram", return(histStats) }) -#' write.jdbc +#' Save the content of DataFrame to an external database table via JDBC. #' #' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC #' database connection properties can be set (...) 
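The save modes listed above for write.df and write.jdbc behave the same way whichever writer is used. A minimal sketch, assuming a SparkDataFrame df already exists; the output path, JDBC URL, table name and credentials are placeholders:

    # Write to a file-based data source; "overwrite" replaces existing data at the path
    write.df(df, path = "/tmp/people_parquet", source = "parquet", mode = "overwrite")
    # Append to an external database table via JDBC
    write.jdbc(df, "jdbc:postgresql://localhost/test", "people",
               mode = "append", user = "username", password = "password")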
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 38a4e21acbc04..0ab1ac14292b6 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -261,7 +261,8 @@ setCheckpointDir <- function(sc, dirName) { #' @export #' @examples #'\dontrun{ -#' doubled <- spark.lapply(1:10, function(x){2 * x}) +#' sc <- sparkR.init() +#' doubled <- spark.lapply(sc, 1:10, function(x){2 * x}) #'} spark.lapply <- function(sc, list, func) { rdd <- parallelize(sc, list, length(list)) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a779127b379a0..a1c091227ffc2 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -238,9 +238,9 @@ setMethod("ceil", column(jc) }) -#' Though scala functions has "col" function, we don't expose it in SparkR -#' because we don't want to conflict with the "col" function in the R base -#' package and we also have "column" function exported which is an alias of "col". +#' @rdname col +#' @name column +#' @export col <- function(x) { column(callJStatic("org.apache.spark.sql.functions", "col", x)) } @@ -249,6 +249,10 @@ col <- function(x) { #' #' Returns a Column based on the given column name. #' +#' Though scala functions has "col" function, we don't expose it in SparkR +#' because we don't want to conflict with the "col" function in the R base +#' package and we also have "column" function exported which is an alias of "col". +#' #' @rdname col #' @name column #' @family normal_funcs diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 12e4f4f1ae8bb..b1b8adaa66a25 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -110,9 +110,11 @@ isRDD <- function(name, env) { #' @return the hash code as an integer #' @export #' @examples +#'\dontrun{ #' hashCode(1L) # 1 #' hashCode(1.0) # 1072693248 #' hashCode("1") # 49 +#'} hashCode <- function(key) { if (class(key) == "integer") { as.integer(key[[1]]) From 9629184874b4bb7d9f204e98bf0e12cd68f4ebee Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 12 Jun 2016 02:03:37 -0700 Subject: [PATCH 5/9] address comments --- R/pkg/R/DataFrame.R | 12 ++++++------ R/pkg/R/functions.R | 13 +++---------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ddb0f4500149f..faa15de6824b3 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -646,7 +646,7 @@ setMethod("toJSON", RDD(jrdd, serializedMode = "string") }) -#' Save the contents of DataFrame as a JSON file +#' Save the contents of SparkDataFrame as a JSON file #' #' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). @@ -673,7 +673,7 @@ setMethod("write.json", invisible(callJMethod(write, "json", path)) }) -#' Save the contents of DataFrame as a Parquet file, preserving the schema. +#' Save the contents of SparkDataFrame as a Parquet file, preserving the schema. #' #' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out #' with this method can be read back in as a SparkDataFrame using read.parquet(). @@ -711,7 +711,7 @@ setMethod("saveAsParquetFile", write.parquet(x, path) }) -#' Save the content of DataFrame in a text file at the specified path. +#' Save the content of SparkDataFrame in a text file at the specified path. #' #' Saves the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". 
@@ -2183,7 +2183,7 @@ setMethod("except", dataFrame(excepted) }) -#' Save the contents of DataFrame to a data source. +#' Save the contents of SparkDataFrame to a data source. #' #' Save the contents of the SparkDataFrame to a data source. The data source is specified by the #' `source` and a set of options (...). If `source` is not specified, the default data source @@ -2520,7 +2520,7 @@ setMethod("attach", attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) -#' Evaluate an expression in an environment constructed from DataFrame +#' Evaluate an expression in an environment constructed from SparkDataFrame #' #' Evaluate a R expression in an environment constructed from a SparkDataFrame #' with() allows access to columns of a SparkDataFrame by simply referring to @@ -2781,7 +2781,7 @@ setMethod("histogram", return(histStats) }) -#' Save the content of DataFrame to an external database table via JDBC. +#' Save the content of SparkDataFrame to an external database table via JDBC. #' #' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC #' database connection properties can be set (...) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a1c091227ffc2..b6058ccce0d21 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -238,9 +238,9 @@ setMethod("ceil", column(jc) }) -#' @rdname col -#' @name column -#' @export +#' Though scala functions has "col" function, we don't expose it in SparkR +#' because we don't want to conflict with the "col" function in the R base +#' package and we also have "column" function exported which is an alias of "col". col <- function(x) { column(callJStatic("org.apache.spark.sql.functions", "col", x)) } @@ -249,14 +249,7 @@ col <- function(x) { #' #' Returns a Column based on the given column name. #' -#' Though scala functions has "col" function, we don't expose it in SparkR -#' because we don't want to conflict with the "col" function in the R base -#' package and we also have "column" function exported which is an alias of "col". -#' -#' @rdname col -#' @name column #' @family normal_funcs -#' @export #' @examples \dontrun{column(df)} setMethod("column", signature(x = "character"), From 560ff0e3f448827f0a72f4e65f24e1d1afa24440 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 12 Jun 2016 03:00:15 -0700 Subject: [PATCH 6/9] use first line as the title convention --- R/pkg/R/DataFrame.R | 32 ++++++++++++++++++++------------ R/pkg/R/RDD.R | 6 ++++-- R/pkg/R/WindowSpec.R | 7 ++++--- R/pkg/R/broadcast.R | 8 +++++--- R/pkg/R/column.R | 6 ++++-- R/pkg/R/context.R | 8 +++----- R/pkg/R/group.R | 6 ++++-- R/pkg/R/mllib.R | 24 ++++++++++++++++-------- 8 files changed, 60 insertions(+), 37 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index faa15de6824b3..96628a491c669 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -23,9 +23,11 @@ NULL setOldClass("jobj") setOldClass("structType") -#' @title S4 class that represents a SparkDataFrame -#' @description DataFrames can be created using functions like \link{createDataFrame}, -#' \link{read.json}, \link{table} etc. +#' S4 class that represents a SparkDataFrame +#' +#' DataFrames can be created using functions like \link{createDataFrame}, +#' \link{read.json}, \link{table} etc. 
+#' #' @family SparkDataFrame functions #' @rdname SparkDataFrame #' @docType class @@ -1940,8 +1942,9 @@ setMethod("join", dataFrame(sdf) }) +#' Merges two data frames +#' #' @name merge -#' @title Merges two data frames #' @param x the first data frame to be joined #' @param y the second data frame to be joined #' @param by a character vector specifying the join columns. If by is not @@ -2111,8 +2114,9 @@ setMethod("unionAll", dataFrame(unioned) }) -#' @title Union two or more SparkDataFrames -#' @description Returns a new SparkDataFrame containing rows of all parameters. +#' Union two or more SparkDataFrames +#' +#' Returns a new SparkDataFrame containing rows of all parameters. #' #' @rdname rbind #' @name rbind @@ -2474,11 +2478,12 @@ setMethod("fillna", dataFrame(sdf) }) +#' Download data from a SparkDataFrame into a data.frame +#' #' This function downloads the contents of a SparkDataFrame into an R's data.frame. #' Since data.frames are held in memory, ensure that you have enough memory #' in your system to accommodate the contents. #' -#' @title Download data from a SparkDataFrame into a data.frame #' @param x a SparkDataFrame #' @return a data.frame #' @family SparkDataFrame functions @@ -2494,13 +2499,14 @@ setMethod("as.data.frame", as.data.frame(collect(x), row.names, optional, ...) }) +#' Attach SparkDataFrame to R search path +#' #' The specified SparkDataFrame is attached to the R search path. This means that #' the SparkDataFrame is searched by R when evaluating a variable, so columns in #' the SparkDataFrame can be accessed by simply giving their names. #' #' @family SparkDataFrame functions #' @rdname attach -#' @title Attach SparkDataFrame to R search path #' @param what (SparkDataFrame) The SparkDataFrame to attach #' @param pos (integer) Specify position in search() where to attach. #' @param name (character) Name to use for the attached SparkDataFrame. Names @@ -2520,7 +2526,7 @@ setMethod("attach", attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) }) -#' Evaluate an expression in an environment constructed from SparkDataFrame +#' Evaluate a R expression in an environment constructed from a SparkDataFrame #' #' Evaluate a R expression in an environment constructed from a SparkDataFrame #' with() allows access to columns of a SparkDataFrame by simply referring to @@ -2530,7 +2536,6 @@ setMethod("attach", #' #' @rdname with #' @family SparkDataFrame functions -#' @title Evaluate a R expression in an environment constructed from a SparkDataFrame #' @param data (SparkDataFrame) SparkDataFrame to use for constructing an environment. #' @param expr (expression) Expression to evaluate. #' @param ... arguments to be passed to future methods. @@ -2546,10 +2551,12 @@ setMethod("with", eval(substitute(expr), envir = newEnv, enclos = newEnv) }) +#' Compactly display the structure of a dataset +#' #' Display the structure of a SparkDataFrame, including column names, column types, as well as a #' a small sample of rows. +#' #' @name str -#' @title Compactly display the structure of a dataset #' @rdname str #' @family SparkDataFrame functions #' @param object a SparkDataFrame @@ -2662,10 +2669,11 @@ setMethod("drop", base::drop(x) }) +#' Histogram +#' #' This function computes a histogram for a given SparkR Column. #' #' @name histogram -#' @title Histogram #' @param nbins the number of bins (optional). Default value is 10. #' @param df the SparkDataFrame containing the Column to build the histogram from. 
#' @param colname the name of the column to build the histogram from. diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index fded00b40a08c..72a805256523e 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -19,9 +19,11 @@ setOldClass("jobj") -#' @title S4 class that represents an RDD -#' @description RDD can be created using functions like +#' S4 class that represents an RDD +#' +#' RDD can be created using functions like #' \code{parallelize}, \code{textFile} etc. +#' #' @rdname RDD #' @seealso parallelize, textFile #' @slot env An R environment that stores bookkeeping states of the RDD diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 581176a6c0918..d8405420d0a49 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -20,9 +20,10 @@ #' @include generics.R jobj.R column.R NULL -#' @title S4 class that represents a WindowSpec -#' @description WindowSpec can be created by using window.partitionBy() -#' or window.orderBy() +#' S4 class that represents a WindowSpec +#' +#' WindowSpec can be created by using window.partitionBy() or window.orderBy() +#' #' @rdname WindowSpec #' @seealso \link{window.partitionBy}, \link{window.orderBy} #' diff --git a/R/pkg/R/broadcast.R b/R/pkg/R/broadcast.R index 38f0eed95e065..398dffc4ab1b4 100644 --- a/R/pkg/R/broadcast.R +++ b/R/pkg/R/broadcast.R @@ -23,9 +23,11 @@ .broadcastValues <- new.env() .broadcastIdToName <- new.env() -# @title S4 class that represents a Broadcast variable -# @description Broadcast variables can be created using the broadcast -# function from a \code{SparkContext}. +# S4 class that represents a Broadcast variable +# +# Broadcast variables can be created using the broadcast +# function from a \code{SparkContext}. +# # @rdname broadcast-class # @seealso broadcast # diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 1c195c10633e3..fa5e743ae48b5 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -22,8 +22,10 @@ NULL setOldClass("jobj") -#' @title S4 class that represents a SparkDataFrame column -#' @description The column class supports unary, binary operations on SparkDataFrame columns +#' S4 class that represents a SparkDataFrame column +#' +#' The column class supports unary, binary operations on SparkDataFrame columns +#' #' @rdname column #' #' @slot jc reference to JVM SparkDataFrame column diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 0ab1ac14292b6..5c886030ff5c5 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -173,9 +173,8 @@ includePackage <- function(sc, pkg) { .sparkREnv$.packages <- packages } -#' @title Broadcast a variable to all workers +#' Broadcast a variable to all workers #' -#' @description #' Broadcast a read-only variable to the cluster, returning a \code{Broadcast} #' object for reading it in distributed functions. #' @@ -207,7 +206,7 @@ broadcast <- function(sc, object) { Broadcast(id, object, jBroadcast, objName) } -#' @title Set the checkpoint directory +#' Set the checkpoint directory #' #' Set the directory under which RDDs are going to be checkpointed. The #' directory must be a HDFS path if running on a cluster. @@ -226,9 +225,8 @@ setCheckpointDir <- function(sc, dirName) { invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName)))) } -#' @title Run a function over a list of elements, distributing the computations with Spark. +#' Run a function over a list of elements, distributing the computations with Spark. #' -#' @description #' Applies a function in a manner that is similar to doParallel or lapply to elements of a list. 
#' The computations are distributed using Spark. It is conceptually the same as the following code: #' lapply(list, func) diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 08f4a490c883e..493f0f1b60f26 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -22,8 +22,10 @@ NULL setOldClass("jobj") -#' @title S4 class that represents a GroupedData -#' @description GroupedDatas can be created using groupBy() on a SparkDataFrame +#' S4 class that represents a GroupedData +#' +#' GroupedDatas can be created using groupBy() on a SparkDataFrame +#' #' @rdname GroupedData #' @seealso groupBy #' diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 380e942c12925..440a06646421e 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -25,22 +25,26 @@ # - a set of methods that reflect the arguments of the other languages supported by Spark. These # methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc. -#' @title S4 class that represents a generalized linear model +#' S4 class that represents a generalized linear model +#' #' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper #' @export setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) -#' @title S4 class that represents a NaiveBayesModel +#' S4 class that represents a NaiveBayesModel +#' #' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper #' @export setClass("NaiveBayesModel", representation(jobj = "jobj")) -#' @title S4 class that represents a AFTSurvivalRegressionModel +#' S4 class that represents a AFTSurvivalRegressionModel +#' #' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper #' @export setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) -#' @title S4 class that represents a KMeansModel +#' S4 class that represents a KMeansModel +#' #' @param jobj a Java object reference to the backing Scala KMeansModel #' @export setClass("KMeansModel", representation(jobj = "jobj")) @@ -197,10 +201,11 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { invisible(x) } +#' predict +#' #' Makes predictions from a generalized linear model produced by glm() or spark.glm(), #' similarly to R's predict(). #' -#' @title predict #' @param object A fitted generalized linear model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -217,10 +222,11 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) +#' predict +#' #' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), #' similarly to R package e1071's predict. #' -#' @title predict #' @param object A fitted naive Bayes model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -355,9 +361,10 @@ setMethod("summary", signature(object = "KMeansModel"), cluster = cluster, is.loaded = is.loaded)) }) +#' predict +#' #' Makes predictions from a k-means model or a model produced by spark.kmeans(). 
#' -#' @title predict #' @param object A fitted k-means model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" @@ -579,10 +586,11 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), return(list(coefficients = coefficients)) }) +#' predict +#' #' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(), #' similarly to R package survival's predict. #' -#' @title predict #' @param object A fitted AFT survival regression model #' @param newData SparkDataFrame for testing #' @return SparkDataFrame containing predicted labels in a column named "prediction" From 23fc1d737f598efb9031cc888e7eacc3ffbcbba1 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sun, 12 Jun 2016 03:45:36 -0700 Subject: [PATCH 7/9] revert --- R/pkg/R/DataFrame.R | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 96628a491c669..edc0832c5bcae 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -715,7 +715,7 @@ setMethod("saveAsParquetFile", #' Save the content of SparkDataFrame in a text file at the specified path. #' -#' Saves the content of the SparkDataFrame in a text file at the specified path. +#' Save the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". #' Each row becomes a new line in the output file. #' @@ -820,8 +820,6 @@ setMethod("sample_frac", sample(x, withReplacement, fraction, seed) }) -#' nrow -#' #' Returns the number of rows in a SparkDataFrame #' #' @param x A SparkDataFrame @@ -852,8 +850,6 @@ setMethod("nrow", count(x) }) -#' ncol -#' #' Returns the number of columns in a SparkDataFrame #' #' @param x a SparkDataFrame @@ -876,7 +872,7 @@ setMethod("ncol", length(columns(x)) }) -#' dim +#' Returns the dimensions of SparkDataFrame #' #' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame @@ -899,8 +895,6 @@ setMethod("dim", c(count(x), ncol(x)) }) -#' collect -#' #' Collects all the elements of a SparkDataFrame and coerces them into an R data.frame. #' #' @param x A SparkDataFrame @@ -998,8 +992,6 @@ setMethod("limit", dataFrame(res) }) -#' take -#' #' Take the first NUM rows of a SparkDataFrame and return a the results as a data.frame #' #' @family SparkDataFrame functions @@ -1050,8 +1042,6 @@ setMethod("head", take(x, num) }) -#' first -#' #' Return the first row of a SparkDataFrame #' #' @param x A SparkDataFrame @@ -2058,8 +2048,6 @@ setMethod("merge", joinRes }) -#' generateAliasesForIntersectedCols -#' #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. #' @@ -2189,9 +2177,9 @@ setMethod("except", #' Save the contents of SparkDataFrame to a data source. #' -#' Save the contents of the SparkDataFrame to a data source. The data source is specified by the -#' `source` and a set of options (...). If `source` is not specified, the default data source -#' configured by spark.sql.sources.default will be used. +#' The data source is specified by the `source` and a set of options (...). +#' If `source` is not specified, the default data source configured by +#' spark.sql.sources.default will be used. 
#' #' Additionally, mode is used to specify the behavior of the save operation when data already #' exists in the data source. There are four modes: @@ -2250,8 +2238,6 @@ setMethod("saveDF", write.df(df, path, source, mode, ...) }) -#' saveAsTable -#' #' Save the contents of the SparkDataFrame to a data source as a table #' #' The data source is specified by the `source` and a set of options (...). @@ -2791,7 +2777,7 @@ setMethod("histogram", #' Save the content of SparkDataFrame to an external database table via JDBC. #' -#' Saves the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC +#' Save the content of the SparkDataFrame to an external database table via JDBC. Additional JDBC #' database connection properties can be set (...) #' #' Also, mode is used to specify the behavior of the save operation when From 2537b8ff114f9d33d0228d7aaeabafdb454609aa Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Mon, 13 Jun 2016 23:55:27 -0700 Subject: [PATCH 8/9] address comment --- R/pkg/R/column.R | 2 -- R/pkg/R/functions.R | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index fa5e743ae48b5..cc2876ed94b7f 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -206,8 +206,6 @@ setMethod("between", signature(x = "Column"), } }) -#' cast -#' #' Casts the column to a different data type. #' #' @rdname cast diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index b6058ccce0d21..a779127b379a0 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -249,7 +249,10 @@ col <- function(x) { #' #' Returns a Column based on the given column name. #' +#' @rdname col +#' @name column #' @family normal_funcs +#' @export #' @examples \dontrun{column(df)} setMethod("column", signature(x = "character"), From 84bf2aadba4d3cf92ca4803cfb608283e8237ab1 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Wed, 15 Jun 2016 15:40:15 -0700 Subject: [PATCH 9/9] change more titles --- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/mllib.R | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index edc0832c5bcae..ab4be171fb0fc 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2655,7 +2655,7 @@ setMethod("drop", base::drop(x) }) -#' Histogram +#' Compute histogram statistics for given column #' #' This function computes a histogram for a given SparkR Column. #' diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 440a06646421e..ba2eee2fca76a 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -201,7 +201,7 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) { invisible(x) } -#' predict +#' Predicted values based on model #' #' Makes predictions from a generalized linear model produced by glm() or spark.glm(), #' similarly to R's predict(). @@ -222,7 +222,7 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf))) }) -#' predict +#' Predicted values based on model #' #' Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), #' similarly to R package e1071's predict. @@ -361,7 +361,7 @@ setMethod("summary", signature(object = "KMeansModel"), cluster = cluster, is.loaded = is.loaded)) }) -#' predict +#' Predicted values based on model #' #' Makes predictions from a k-means model or a model produced by spark.kmeans(). 
#' @@ -406,6 +406,8 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form return(new("NaiveBayesModel", jobj = jobj)) }) +#' Save fitted MLlib model to the input path +#' #' Save the Bernoulli naive Bayes model to the input path. #' #' @param object A fitted Bernoulli naive Bayes model @@ -432,6 +434,8 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"), invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the AFT survival regression model to the input path. #' #' @param object A fitted AFT survival regression model @@ -457,6 +461,8 @@ setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "c invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the generalized linear model to the input path. #' #' @param object A fitted generalized linear model @@ -482,6 +488,8 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat invisible(callJMethod(writer, "save", path)) }) +#' Save fitted MLlib model to the input path +#' #' Save the k-means model to the input path. #' #' @param object A fitted k-means model @@ -586,7 +594,7 @@ setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), return(list(coefficients = coefficients)) }) -#' predict +#' Predicted values based on model #' #' Makes predictions from an AFT survival regression model or a model produced by spark.survreg(), #' similarly to R package survival's predict.
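The predict and write.ml methods documented in mllib.R above follow the usual fit / predict / save workflow. A minimal sketch, assuming a SparkR session of this era and R's built-in iris data set (periods in column names are replaced with underscores during conversion); the model path is a placeholder:

    sc <- sparkR.init()
    sqlContext <- sparkRSQL.init(sc)
    df <- createDataFrame(sqlContext, iris)
    # Fit a Gaussian GLM (ordinary linear regression) with an R formula
    model <- spark.glm(df, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
    summary(model)
    # Predictions come back as a SparkDataFrame with a "prediction" column
    predictions <- predict(model, df)
    head(select(predictions, "Sepal_Length", "prediction"))
    # Persist the fitted model to a path
    write.ml(model, "/tmp/glm_model")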