Merge pull request #13 from apache/master

merge lastest spark
rekhajoshm · May 7, 2015 · 14952e2 · 14952e2
2 parents f03fe7f + 2d6612c
commit 14952e2
Show file tree

Hide file tree

Showing 647 changed files with 61,754 additions and 6,061 deletions.
diff --git a/.rat-excludes b/.rat-excludes
@@ -15,6 +15,7 @@ TAGS
 RELEASE
 control
 docs
+docker.properties.template
 fairscheduler.xml.template
 spark-defaults.conf.template
 log4j.properties
@@ -29,7 +30,13 @@ spark-env.sh.template
 log4j-defaults.properties
 bootstrap-tooltip.js
 jquery-1.11.1.min.js
+d3.min.js
+dagre-d3.min.js
+graphlib-dot.min.js
 sorttable.js
+vis.min.js
+vis.min.css
+vis.map
 .*avsc
 .*txt
 .*json

diff --git a/LICENSE b/LICENSE
@@ -643,6 +643,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 
+========================================================================
+For d3 (core/src/main/resources/org/apache/spark/ui/static/d3.min.js):
+========================================================================
+
+Copyright (c) 2010-2015, Michael Bostock
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* The name Michael Bostock may not be used to endorse or promote products
+  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ========================================================================
 For Scala Interpreter classes (all .scala files in repl/src/main/scala
@@ -814,6 +844,7 @@ BSD-style licenses
 The following components are provided under a BSD-style license. See project link for details.
 
      (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
+     (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model)
      (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.3 - http://jblas.org/)
      (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
      (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -1,117 +1,36 @@
-#exportPattern("^[[:alpha:]]+")
-exportClasses("RDD")
-exportClasses("Broadcast")
-exportMethods(
-              "aggregateByKey",
-              "aggregateRDD",
-              "cache",
-              "cartesian",
-              "checkpoint",
-              "coalesce",
-              "cogroup",
-              "collect",
-              "collectAsMap",
-              "collectPartition",
-              "combineByKey",
-              "count",
-              "countByKey",
-              "countByValue",
-              "distinct",
-              "Filter",
-              "filterRDD",
-              "first",
-              "flatMap",
-              "flatMapValues",
-              "fold",
-              "foldByKey",
-              "foreach",
-              "foreachPartition",
-              "fullOuterJoin",
-              "glom",
-              "groupByKey",
-              "intersection",
-              "join",
-              "keyBy",
-              "keys",
-              "length",
-              "lapply",
-              "lapplyPartition",
-              "lapplyPartitionsWithIndex",
-              "leftOuterJoin",
-              "lookup",
-              "map",
-              "mapPartitions",
-              "mapPartitionsWithIndex",
-              "mapValues",
-              "maximum",
-              "minimum",
-              "numPartitions",
-              "partitionBy",
-              "persist",
-              "pipeRDD",
-              "reduce",
-              "reduceByKey",
-              "reduceByKeyLocally",
-              "repartition",
-              "rightOuterJoin",
-              "sampleByKey",
-              "sampleRDD",
-              "saveAsTextFile",
-              "saveAsObjectFile",
-              "sortBy",
-              "sortByKey",
-              "subtract",
-              "subtractByKey",
-              "sumRDD",
-              "take",
-              "takeOrdered",
-              "takeSample",
-              "top",
-              "unionRDD",
-              "unpersist",
-              "value",
-              "values",
-              "zipPartitions",
-              "zipRDD",
-              "zipWithIndex",
-              "zipWithUniqueId"
-             )
+# Imports from base R
+importFrom(methods, setGeneric, setMethod, setOldClass)
+useDynLib(SparkR, stringHashCode)
 
 # S3 methods exported
-export(
-       "textFile",
-       "objectFile",
-       "parallelize",
-       "hashCode",
-       "includePackage",
-       "broadcast",
-       "setBroadcastValue",
-       "setCheckpointDir"
-      )
 export("sparkR.init")
 export("sparkR.stop")
 export("print.jobj")
-useDynLib(SparkR, stringHashCode)
-importFrom(methods, setGeneric, setMethod, setOldClass)
-
-# SparkRSQL
 
 exportClasses("DataFrame")
 
-exportMethods("columns",
+exportMethods("cache",
+              "collect",
+              "columns",
+              "count",
+              "describe",
               "distinct",
               "dtypes",
               "except",
               "explain",
               "filter",
+              "first",
               "groupBy",
               "head",
               "insertInto",
               "intersect",
               "isLocal",
+              "join",
+              "length",
               "limit",
               "orderBy",
               "names",
+              "persist",
               "printSchema",
               "registerTempTable",
               "repartition",
@@ -125,9 +44,9 @@ exportMethods("columns",
               "show",
               "showDF",
               "sortDF",
-              "toJSON",
-              "toRDD",
+              "take",
               "unionAll",
+              "unpersist",
               "where",
               "withColumn",
               "withColumnRenamed")
@@ -174,14 +93,12 @@ export("cacheTable",
        "createExternalTable",
        "dropTempTable",
        "jsonFile",
-       "jsonRDD",
        "loadDF",
        "parquetFile",
        "sql",
        "table",
        "tableNames",
        "tables",
-       "toDF",
        "uncacheTable")
 
 export("sparkRSQL.init",

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -167,7 +167,7 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+            callJMethod(x@sdf, "showString", numToInt(numRows))
           })
 
 #' show
@@ -272,7 +272,7 @@ setMethod("names",
 setMethod("registerTempTable",
           signature(x = "DataFrame", tableName = "character"),
           function(x, tableName) {
-              callJMethod(x@sdf, "registerTempTable", tableName)
+              invisible(callJMethod(x@sdf, "registerTempTable", tableName))
           })
 
 #' insertInto
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
             callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
           })
 
+#' describe
+#'
+#' Computes statistics for numeric columns.
+#' If no columns are given, this function computes statistics for all numerical columns.
+#'
+#' @param x A DataFrame to be computed.
+#' @param col A string of name
+#' @param ... Additional expressions
+#' @return A DataFrame
+#' @rdname describe 
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' describe(df)
+#' describe(df, "col1")
+#' describe(df, "col1", "col2")
+#' }
+setMethod("describe",
+          signature(x = "DataFrame", col = "character"),
+          function(x, col, ...) {
+            colList <- list(col, ...)
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
+
+#' @rdname describe
+setMethod("describe",
+          signature(x = "DataFrame"),
+          function(x) {
+            colList <- as.list(c(columns(x)))
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
@@ -797,7 +797,7 @@ setMethod("first",
 #' @aliases distinct,RDD-method
 setMethod("distinct",
           signature(x = "RDD"),
-          function(x, numPartitions = SparkR::numPartitions(x)) {
+          function(x, numPartitions = SparkR:::numPartitions(x)) {
             identical.mapped <- lapply(x, function(x) { list(x, NULL) })
             reduced <- reduceByKey(identical.mapped,
                                    function(x, y) { x },
@@ -993,7 +993,7 @@ setMethod("coalesce",
            signature(x = "RDD", numPartitions = "numeric"),
            function(x, numPartitions, shuffle = FALSE) {
              numPartitions <- numToInt(numPartitions)
-             if (shuffle || numPartitions > SparkR::numPartitions(x)) {
+             if (shuffle || numPartitions > SparkR:::numPartitions(x)) {
                func <- function(partIndex, part) {
                  set.seed(partIndex)  # partIndex as seed
                  start <- as.integer(sample(numPartitions, 1) - 1)
@@ -1078,7 +1078,7 @@ setMethod("saveAsTextFile",
 #' @aliases sortBy,RDD,RDD-method
 setMethod("sortBy",
           signature(x = "RDD", func = "function"),
-          function(x, func, ascending = TRUE, numPartitions = SparkR::numPartitions(x)) {          
+          function(x, func, ascending = TRUE, numPartitions = SparkR:::numPartitions(x)) {
             values(sortByKey(keyBy(x, func), ascending, numPartitions))
           })
 
@@ -1552,7 +1552,7 @@ setMethod("cartesian",
 #' @aliases subtract,RDD
 setMethod("subtract",
           signature(x = "RDD", other = "RDD"),
-          function(x, other, numPartitions = SparkR::numPartitions(x)) {
+          function(x, other, numPartitions = SparkR:::numPartitions(x)) {
             mapFunction <- function(e) { list(e, NA) }
             rdd1 <- map(x, mapFunction)
             rdd2 <- map(other, mapFunction)
@@ -1583,7 +1583,7 @@ setMethod("subtract",
 #' @aliases intersection,RDD
 setMethod("intersection",
           signature(x = "RDD", other = "RDD"),
-          function(x, other, numPartitions = SparkR::numPartitions(x)) {
+          function(x, other, numPartitions = SparkR:::numPartitions(x)) {
             rdd1 <- map(x, function(v) { list(v, NA) })
             rdd2 <- map(other, function(v) { list(v, NA) })
 

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
 #' @export
 setGeneric("columns", function(x) {standardGeneric("columns") })
 
+#' @rdname describe
+#' @export
+setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })

diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
@@ -739,7 +739,7 @@ setMethod("cogroup",
 #' @aliases sortByKey,RDD,RDD-method
 setMethod("sortByKey",
           signature(x = "RDD"),
-          function(x, ascending = TRUE, numPartitions = SparkR::numPartitions(x)) {
+          function(x, ascending = TRUE, numPartitions = SparkR:::numPartitions(x)) {
             rangeBounds <- list()
 
             if (numPartitions > 1) {
@@ -806,7 +806,7 @@ setMethod("sortByKey",
 #' @aliases subtractByKey,RDD
 setMethod("subtractByKey",
           signature(x = "RDD", other = "RDD"),
-          function(x, other, numPartitions = SparkR::numPartitions(x)) {
+          function(x, other, numPartitions = SparkR:::numPartitions(x)) {
             filterFunction <- function(elem) {
               iters <- elem[[2]]
               (length(iters[[1]]) > 0) && (length(iters[[2]]) == 0)

diff --git a/R/pkg/inst/profile/shell.R b/R/pkg/inst/profile/shell.R
@@ -20,11 +20,13 @@
   .libPaths(c(file.path(home, "R", "lib"), .libPaths()))
   Sys.setenv(NOAWT=1)
 
-  library(utils)
-  library(SparkR)
-  sc <- sparkR.init(Sys.getenv("MASTER", unset = ""))
+  # Make sure SparkR package is the last loaded one
+  old <- getOption("defaultPackages")
+  options(defaultPackages = c(old, "SparkR"))
+
+  sc <- SparkR::sparkR.init(Sys.getenv("MASTER", unset = ""))
   assign("sc", sc, envir=.GlobalEnv)
-  sqlCtx <- sparkRSQL.init(sc)
+  sqlCtx <- SparkR::sparkRSQL.init(sc)
   assign("sqlCtx", sqlCtx, envir=.GlobalEnv)
   cat("\n Welcome to SparkR!")
   cat("\n Spark context is available as sc, SQL context is available as sqlCtx\n")

diff --git a/R/pkg/inst/tests/test_broadcast.R b/R/pkg/inst/tests/test_broadcast.R
@@ -29,7 +29,7 @@ test_that("using broadcast variable", {
   randomMatBr <- broadcast(sc, randomMat)
 
   useBroadcast <- function(x) {
-    sum(value(randomMatBr) * x)
+    sum(SparkR:::value(randomMatBr) * x)
   }
   actual <- collect(lapply(rrdd, useBroadcast))
   expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)